Browse files

first commit

  • Loading branch information...
0 parents commit 959eb632c9ed6d16de5379f446ac8ad883cd1e9a @nitindhar7 committed Mar 31, 2011
4 .gitignore
@@ -0,0 +1,4 @@
+*.layout
+*.depend
+bin
+obj
92 Query.cpp
@@ -0,0 +1,92 @@
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include "lib/Query.h"
+using namespace std;
+
+// Creates a query for a single search term. No postings have been read yet,
+// so the posting counter starts at zero.
+Query::Query(string query)
+    : text( query ), count( 0 )
+{
+}
+
+// Reads this term's inverted list and turns it into a singly linked list.
+//
+// The line returned by get_inverted_list() is parsed as a sequence of
+// whitespace-separated "doc_id frequency" integer pairs; one node is
+// allocated with new for every pair, in the order they appear on the line.
+//
+// lexicon         - lexicon handed to get_offset() to look the term up.
+// lexicon_cursor  - overwritten by get_offset() with the lookup result.
+//
+// Returns the first node of the list, or NULL when no pair could be parsed.
+// The caller owns the nodes and must delete each of them. count is
+// incremented once per node created (it is not reset between calls).
+node* Query::open_list(Lexicon &lexicon, LexiconCursor &lexicon_cursor)
+{
+    int doc_id = 0, frequency = 0;
+    node* head = NULL;
+    node* tail = NULL; // last node appended so far; avoids re-walking the list
+
+    const int offset = get_offset( lexicon, lexicon_cursor );
+    stringstream parseable_inverted_list( get_inverted_list( offset ) );
+
+    while( parseable_inverted_list >> doc_id >> frequency ) {
+        node* posting = new node;
+        posting->doc_id = doc_id;
+        posting->frequency = frequency;
+        posting->next = NULL;
+
+        count++;
+
+        // Append in O(1): hook the new node behind the current tail.
+        if( tail == NULL )
+            head = posting;
+        else
+            tail->next = posting;
+
+        tail = posting;
+    }
+
+    return head;
+}
+
+// Looks this query's text up in the lexicon.
+//
+// lexicon         - map from word to int that is searched for text.
+// lexicon_cursor  - receives the result of lexicon.find( text ); it equals
+//                   lexicon.end() when the word is unknown.
+//
+// Returns the int mapped to the word, or -1 when the word is not in the
+// lexicon (the iterator is never dereferenced in that case).
+int Query::get_offset(Lexicon &lexicon, LexiconCursor &lexicon_cursor)
+{
+    lexicon_cursor = lexicon.find( text );
+
+    if( lexicon_cursor == lexicon.end() )
+        return -1;
+
+    return lexicon_cursor->second;
+}
+
+// Reads one line of the inverted index file, starting at the given offset.
+//
+// offset - position, counted from the beginning of the file, at which the
+//          line starts. A negative value is treated as "no list".
+//
+// Returns the text from offset up to the next newline, or an empty string
+// when offset is negative, the file cannot be opened, or the seek/read fails.
+//
+// The index is read from data/, the same directory boot.h loads the lexicon
+// and URL table from.
+string Query::get_inverted_list(int offset)
+{
+    string inverted_list;
+
+    if( offset < 0 )
+        return inverted_list;
+
+    ifstream inverted_index( "data/inverted_index" );
+
+    if( !inverted_index.is_open() )
+        return inverted_list;
+
+    inverted_index.seekg( offset, ios::beg );
+    getline( inverted_index, inverted_list );
+
+    // The stream is closed by its destructor.
+    return inverted_list;
+}
+
+/*
+void Query::close_list(node* head)
+{
+ free( head );
+}
+
+int Query::get_frequency(int document_id)
+{
+ int doc_id = 0, frequency = 0;
+ string inverted_list;
+ stringstream parseable_inverted_list;
+
+ while( parseable_inverted_list >> doc_id >> frequency ) {
+ if( doc_id == document_id )
+ return frequency;
+ }
+
+ return 0;
+}
+
+*/
62 QueryProcessor.cpp
@@ -0,0 +1,62 @@
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include <cmath>
+#include "lib/QueryProcessor.h"
+
+#define CONSTANT_K 1.2 // multiplier in the K term of the commented-out calculate_rank; presumably BM25's k1 - confirm
+#define CONSTANT_B 0.75 // weight on doc_length/avg_doc_length in that K term; presumably BM25's b - confirm
+using namespace std;
+
+// Keeps a copy of the caller's queries together with the number of them
+// that belong to the current search.
+QueryProcessor::QueryProcessor(int query_count, vector<Query> user_queries)
+    : queries( user_queries ), num_queries( query_count )
+{
+}
+
+/*int QueryProcessor::nextGEQ( node* list_head, int doc_id)
+{
+ node* head = NULL;
+
+ head = list_head;
+
+ if( head->doc_id >= doc_id )
+ return doc_id;
+ else
+ head = head->next;
+
+ return -1;
+}
+
+double QueryProcessor::calculate_rank(int doc_id)
+{
+ int total_pages, total_pages_with_queryword, freq_of_query_in_doc, doc_length;
+ double K, page_rank, log_result,freq_result,avg_doc_length;
+
+ K = CONSTANT_K* ( (1-CONSTANT_B) + ( CONSTANT_B * (doc_length/avg_doc_length) ) );
+
+ total_pages_with_queryword = Query.count();
+ freq_of_query_in_doc = Query.get_frequency();
+
+ log_result = log( (total_pages - freq_of_query_in_doc + 0.5) / ( freq_of_query_in_doc + 0.5) );
+ freq_result = ( ((CONSTANT_K+1) * total_pages_with_queryword) / (K+total_pages_with_queryword) );
+
+ page_rank = log_result * freq_result;
+
+ return page_rank;
+}
+
+string QueryProcessor::get_url(int doc_id)
+{
+ url_table_iterator = kyon_url_table.find( doc_id );
+ return ( *url_table_iterator ).second.url;
+}
+
+void QueryProcessor::add_to_heap(string url, int doc_id)
+{
+ //add url and doc_id
+}
+*/
41,560 data/inverted_index
41,560 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
41,560 data/lexicon
41,560 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
25,892 data/url_table
25,892 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
BIN docs/query processor 1.jpg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
BIN docs/query processor 2.jpg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
BIN docs/query ranking 1.jpg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
BIN docs/query ranking 2.jpg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
35 kyon.cbp
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_project_file>
+ <FileVersion major="1" minor="6" />
+ <Project>
+ <Option title="kyon" />
+ <Option pch_mode="2" />
+ <Option compiler="gcc" />
+ <Build>
+ <Target title="Debug">
+ <Option output="bin\Debug\kyon" prefix_auto="1" extension_auto="1" />
+ <Option object_output="obj\Debug\" />
+ <Option type="1" />
+ <Option compiler="gcc" />
+ <Compiler>
+ <Add option="-g" />
+ </Compiler>
+ </Target>
+ </Build>
+ <Compiler>
+ <Add option="-Wall" />
+ <Add option="-fexceptions" />
+ </Compiler>
+ <Unit filename="Query.cpp" />
+ <Unit filename="QueryProcessor.cpp" />
+ <Unit filename="lib\Query.h" />
+ <Unit filename="lib\QueryProcessor.h" />
+ <Unit filename="lib\boot.h" />
+ <Unit filename="lib\util.h" />
+ <Unit filename="main.cpp" />
+ <Extensions>
+ <code_completion />
+ <debugger />
+ </Extensions>
+ </Project>
+</CodeBlocks_project_file>
29 lib/Query.h
@@ -0,0 +1,29 @@
+#ifndef QUERY_H
+#define QUERY_H
+
+#include <iostream>
+#include <map>
+using namespace std;
+
+struct node { // singly linked list cell holding one (doc_id, frequency) pair built by Query::open_list
+ int doc_id; // first number of the pair read from the inverted list line
+ int frequency; // second number of the pair; presumably how often the term occurs in that document - confirm
+ node *next; // following cell, or NULL for the last cell
+};
+
+class Query // one search term plus the helpers that read its inverted list from disk
+{
+ public:
+ string text; // the term given to the constructor; used as the lexicon lookup key
+ int count; // set to 0 by the constructor, incremented by open_list for every pair it reads
+ Query(string);
+ node* open_list(Lexicon &, LexiconCursor &); // builds a linked list of nodes; NOTE(review): Lexicon/LexiconCursor are typedef'd in boot.h, which this header does not include
+ private:
+ int get_offset(Lexicon &, LexiconCursor &); // finds text in the lexicon and returns the mapped int
+ string get_inverted_list(int); // returns one line of the inverted index file starting at the given offset
+};
+
+#endif
+
+//void close_list(node*);
+ //int get_frequency(int);
22 lib/QueryProcessor.h
@@ -0,0 +1,22 @@
+#ifndef QUERY_PROCESSOR_H
+#define QUERY_PROCESSOR_H
+
+#include <iostream>
+#include <map>
+#include <vector>
+#include "Query.h"
+using namespace std;
+
+class QueryProcessor // holds the terms of one search; the ranking helpers below are still commented out
+{
+ public:
+ QueryProcessor(int, vector<Query>); // stores the count and a copy of the query vector
+ /*int nextGEQ(node*, int);
+ double calculate_rank(int);
+ string get_url(int);
+ void add_to_heap(string, int);*/
+ vector<Query> queries; // copy of the vector passed to the constructor
+ int num_queries; // count passed to the constructor; main iterates queries[0 .. num_queries)
+};
+
+#endif
69 lib/boot.h
@@ -0,0 +1,69 @@
+#ifndef BOOT_H
+#define BOOT_H
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <map>
+using namespace std;
+
+struct page_stats { // value type of UrlTable: what load_url_table keeps for each doc_id
+ string url; // third field of a data/url_table record
+ int page_size; // second field of a data/url_table record; units are not stated in this file
+};
+
+typedef map<string, int> Lexicon; // word -> int read from data/lexicon; Query uses the int as an offset into the inverted index file
+typedef map<string, int>::iterator LexiconCursor; // iterator into a Lexicon
+typedef map<int, page_stats> UrlTable; // doc_id -> page_stats, filled by boot::load_url_table
+typedef map<int, page_stats>::iterator UrlTableCursor; // iterator into a UrlTable
+
+// Start-up helpers that load the on-disk lookup structures into memory.
+// The functions are inline because they are defined in a header and would
+// otherwise be defined once per translation unit that includes it.
+namespace boot
+{
+    // Fills lexicon from data/lexicon, a whitespace-separated sequence of
+    // "word offset" records. Reading stops at end of file or at the first
+    // record that cannot be parsed. Prints "[FAILED]" instead of "[DONE]"
+    // and leaves lexicon untouched when the file cannot be opened.
+    inline void load_lexicon(Lexicon &lexicon)
+    {
+        string word;
+        int offset = 0;
+        ifstream lexicon_file( "data/lexicon" );
+
+        cout << "* Loading Lexicon... ";
+
+        if( !lexicon_file.is_open() ) {
+            cout << "[FAILED]" << endl;
+            return;
+        }
+
+        // Testing the extraction itself (not eof()) ends the loop on any
+        // read failure and never stores a record that was not fully read.
+        while( lexicon_file >> word >> offset )
+            lexicon[word] = offset;
+
+        cout << "[DONE]" << endl;
+    }
+
+    // Fills url_table from data/url_table, a whitespace-separated sequence
+    // of "doc_id page_size url" records. Reading stops at end of file or at
+    // the first record that cannot be parsed. Prints "[FAILED]" instead of
+    // "[DONE]" and leaves url_table untouched when the file cannot be opened.
+    inline void load_url_table(UrlTable &url_table)
+    {
+        page_stats stats;
+        int doc_id = 0;
+        ifstream url_table_file( "data/url_table" );
+
+        cout << "* Loading URL Table... ";
+
+        if( !url_table_file.is_open() ) {
+            cout << "[FAILED]" << endl << endl;
+            return;
+        }
+
+        while( url_table_file >> doc_id >> stats.page_size >> stats.url )
+            url_table[doc_id] = stats;
+
+        cout << "[DONE]" << endl << endl;
+    }
+
+    // Prints the boot banner, then loads the lexicon followed by the URL table.
+    inline void init(Lexicon &lexicon, UrlTable &url_table)
+    {
+        cout << "Booting Query Processor" << endl;
+        load_lexicon( lexicon );
+        load_url_table( url_table );
+    }
+}
+
+#endif
53 lib/util.h
@@ -0,0 +1,53 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <iostream>
+#include <sstream>
+#include <map>
+#include <algorithm>
+#include <string>
+#include "Query.h"
+using namespace std;
+
+// Console helpers for the interactive search loop.
+// The functions are inline because they are defined in a header.
+namespace util
+{
+    // Lower-cases one character. The value is passed to tolower as an
+    // unsigned char because tolower is undefined for negative arguments.
+    inline char to_lower_char(char c)
+    {
+        return static_cast<char>( ::tolower( static_cast<unsigned char>( c ) ) );
+    }
+
+    // Ends the program with status 0 when user_input is exactly "quit",
+    // emptying both tables first. Returns normally for any other input.
+    inline void quit_if_requested(string user_input, Lexicon &lexicon, UrlTable &url_table)
+    {
+        if( user_input == "quit" ) {
+            lexicon.clear();
+            url_table.clear();
+            exit( 0 );
+        }
+    }
+
+    // Splits user_input on whitespace and appends one Query per word to
+    // queries (existing elements are kept). Returns the number of queries
+    // appended by this call.
+    inline int collect_queries(string user_input, vector<Query> &queries)
+    {
+        int num_queries = 0;
+        string tmp_query;
+        stringstream user_input_stream( user_input );
+
+        while( user_input_stream >> tmp_query ) {
+            queries.push_back( Query( tmp_query ) );
+            num_queries++;
+        }
+
+        return num_queries;
+    }
+
+    // Prompts for a search line, lower-cases it and returns it.
+    // Exits the program (via quit_if_requested) when the lower-cased line is
+    // "quit" or when standard input is exhausted; without the latter the
+    // caller's endless loop would keep prompting after end of input.
+    inline string search_or_quit(Lexicon &lexicon, UrlTable &url_table)
+    {
+        string user_input;
+
+        cout << "Search or 'QUIT' to quit: ";
+
+        if( !getline( cin, user_input ) ) {
+            cout << endl;
+            quit_if_requested( "quit", lexicon, url_table );
+        }
+
+        transform( user_input.begin(), user_input.end(), user_input.begin(), to_lower_char );
+
+        quit_if_requested( user_input, lexicon, url_table );
+
+        return user_input;
+    }
+}
+
+#endif
50 main.cpp
@@ -0,0 +1,50 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <map>
+#include <vector>
+#include "lib/boot.h"
+#include "lib/util.h"
+#include "lib/QueryProcessor.h"
+#include "lib/Query.h"
+using namespace std;
+
+// In-memory tables loaded once at start-up by boot::init.
+Lexicon lexicon;
+LexiconCursor lexicon_cursor;
+UrlTable url_table;
+UrlTableCursor url_table_cursor;
+
+// Terms of the search currently being processed; refilled on every prompt.
+vector<Query> queries;
+
+// Interactive entry point: loads the tables, then repeatedly reads a search
+// line, opens the inverted list of every term and releases the lists again.
+// The loop only ends through util::search_or_quit, which exits the process.
+int main()
+{
+    boot::init( lexicon, url_table );
+    // TODO: get average page length somehow.
+
+    while( true ) {
+        vector<node*> inverted_lists;
+        string user_input = util::search_or_quit( lexicon, url_table );
+
+        // collect_queries appends, so drop the previous search's terms first;
+        // otherwise queries[0 .. num_queries) would still be the old terms.
+        queries.clear();
+        int num_queries = util::collect_queries( user_input, queries );
+        QueryProcessor query_processor( num_queries, queries );
+
+        // GET LIST POINTERS
+        for( int i = 0; i < query_processor.num_queries; i++ ) {
+            inverted_lists.push_back( query_processor.queries[i].open_list( lexicon, lexicon_cursor ) );
+            // TODO: get the max page number by traversing to the end of the lists and comparing values
+        }
+
+        // TODO: calculate scores using BM25 (nextGEQ, frequency, add to heap, ...)
+
+        // TODO: display results
+
+        // Release every node allocated by open_list for this search.
+        for( vector<node*>::size_type i = 0; i < inverted_lists.size(); i++ ) {
+            node* current = inverted_lists[i];
+
+            while( current != NULL ) {
+                node* following = current->next;
+                delete current;
+                current = following;
+            }
+        }
+    }
+
+    return 0;
+}

0 comments on commit 959eb63

Please sign in to comment.