Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

moved inverted lists vector to QP. updated method signatures to accommodate for QP ILs

refactored QP header. moved inverted_lists completely into QP
created function to display results
created min heap class. refactored results queue into min heap and cleaned main function. moved overloaded op to min heap
  • Loading branch information...
commit 88aace79e202556229459250a9b39f63ea07ed9d 1 parent 79a9c31
@nitindhar7 authored
View
33 MinHeap.cpp
@@ -0,0 +1,33 @@
+#include <iostream>
+#include "MinHeap.h"
+using namespace std;
+
+MinHeap::MinHeap(int size)  // Constructs the wrapper; `size` is the entry cap that push() enforces.
+{
+ max_size = size;  // stored as int; compared against heap.size() in push()
+}
+
+void MinHeap::push(Score score)  // Bounded insert: keeps at most max_size entries in `heap`.
+{  // NOTE(review): `heap` is a priority_queue<Score> with the default comparator and operator< orders by ascending get_score(), so top() is the HIGHEST-scoring entry, not the lowest -- confirm that evicting top() below is what was intended.
+ if( heap.size() < max_size )  // below capacity: always accept (size() is unsigned, max_size is a signed int)
+ heap.push( score );
+ else {
+ if( heap.top().get_score() < score.get_score() ) {  // full: admit only when the new score exceeds the score at top()
+ heap.pop();  // evict the current top() entry to make room
+ heap.push( score );
+ }
+ }
+}
+
+void MinHeap::display()  // Writes every stored entry to cout as "<score> => <url>", one per line, starting from top().
+{
+ while ( !heap.empty() ) {  // destructive: each printed entry is popped, so the heap is empty when this returns
+ cout << heap.top().get_score() << " => " << heap.top().get_url() << endl;
+ heap.pop();
+ }
+}
+
+bool operator<(const Score &a, const Score &b)  // Orders Score objects by get_score() alone; the URL takes no part in the comparison.
+{
+ return a.get_score() < b.get_score();  // ascending by score; this is the ordering the default comparator of priority_queue<Score> picks up
+}
View
22 MinHeap.h
@@ -0,0 +1,22 @@
+#ifndef MIN_HEAP_H
+#define MIN_HEAP_H
+
+#include <iostream>
+#include <queue>
+#include "Score.h"
+using namespace std;
+
+bool operator<(const Score &, const Score &);
+
+class MinHeap  // Size-capped collection of Score objects backed by a std::priority_queue.
+{
+ public:
+ MinHeap(int);  // argument is the maximum number of entries to keep
+ void push(Score);  // inserts a score subject to the cap
+ void display();  // prints the stored scores and empties the container
+ private:
+ int max_size;  // cap enforced by push()
+ priority_queue<Score> heap;  // default comparator => uses operator<(const Score&, const Score&); top() is the largest element under that ordering
+};
+
+#endif
View
45 QueryProcessor.cpp
@@ -37,6 +37,7 @@ vector<Query> & QueryProcessor::get_queries()
void QueryProcessor::reset()  // Returns the per-search members to their empty/zero state.
{
queries.clear();
+ inverted_lists.clear();  // only removes the node* values from the vector; the pointed-to lists are not closed or freed here
num_queries = 0;
max_doc_id = 0;
}
@@ -56,7 +57,7 @@ int QueryProcessor::get_avg_doc_length()
return avg_doc_length;
}
-void QueryProcessor::set_max_doc_id(vector<node*> inverted_lists)
+void QueryProcessor::set_max_doc_id()
{
for( int i = 0; i < num_queries; i++ ) {
node* temp = inverted_lists[i];
@@ -84,12 +85,12 @@ void QueryProcessor::set_avg_doc_length()
cout << "[" << avg_doc_length << "]" << endl << endl;
}
-int QueryProcessor::next_geq(node* head, int doc_id)
+int QueryProcessor::next_geq(int index, int doc_id)
{
- while( head != NULL ) {
- if( head->doc_id >= doc_id )
- return head->doc_id;
- head = head->next;
+ while( inverted_lists[index] != NULL ) {
+ if( inverted_lists[index]->doc_id >= doc_id )
+ return inverted_lists[index]->doc_id;
+ inverted_lists[index] = inverted_lists[index]->next;
}
return -1;
@@ -101,7 +102,7 @@ void QueryProcessor::clear_structures()
url_table.clear();
}
-void QueryProcessor::close_query_lists(vector<node*> &inverted_lists)
+void QueryProcessor::close_query_lists()
{
for( int i = 0; i < num_queries; i++ )
queries[i].close_list( inverted_lists[i] );
@@ -119,10 +120,22 @@ void QueryProcessor::collect_queries(string user_input)
}
}
-void QueryProcessor::show_queries()
+int QueryProcessor::collect_inverted_lists()
{
- for( int i = 0; i < num_queries; i++ )
- cout << i << ". " << queries[i].get_text() << endl;
+ int do_restart = 0;
+
+ for( int i = 0; i < num_queries; i++ ) {
+ node* head = queries[i].open_list( lexicon );
+
+ if( head == NULL ) {
+ do_restart = 1;
+ break;
+ }
+
+ inverted_lists.push_back( head );
+ }
+
+ return do_restart;
}
void QueryProcessor::load_lexicon()
@@ -195,7 +208,7 @@ int QueryProcessor::get_doc_length(int doc_id)
return ( *url_table_cursor ).second.page_size;
}
-double QueryProcessor::calculate_rank(int doc_id, vector<node*> &inverted_list)
+double QueryProcessor::calculate_rank(int doc_id)
{
double page_rank = 0, temp_rank = 0;
int total_pages = url_table.size();
@@ -207,7 +220,7 @@ double QueryProcessor::calculate_rank(int doc_id, vector<node*> &inverted_list)
double log_result = 0,freq_result = 0;
total_pages_with_queryword = queries[i].get_count();
- freq_of_query_in_doc = queries[i].get_frequency(doc_id, inverted_list[i]);
+ freq_of_query_in_doc = queries[i].get_frequency(doc_id, inverted_lists[i]);
log_result = log( (total_pages - freq_of_query_in_doc + 0.5) / ( freq_of_query_in_doc + 0.5) );
freq_result = ( ( (CONSTANT_K + 1) * total_pages_with_queryword) / (K + total_pages_with_queryword) );
@@ -219,9 +232,13 @@ double QueryProcessor::calculate_rank(int doc_id, vector<node*> &inverted_list)
return page_rank;
}
-/*
+
void QueryProcessor::add_to_heap(string url, int doc_id)  // Placeholder: the body holds only a comment, so calling this currently does nothing.
{
//add url and doc_id
}
-*/
+
+vector<node*> & QueryProcessor::get_inverted_lists()  // Accessor: hands back the member vector of list pointers by non-const reference (no copy).
+{
+ return inverted_lists;  // callers can modify the processor's own vector through this reference
+}
View
28 QueryProcessor.h
@@ -14,39 +14,35 @@ using namespace std;
class QueryProcessor
{
public:
- // per object
QueryProcessor();
~QueryProcessor();
- vector<Query> & get_queries();
int get_num_queries();
int get_max_doc_id();
- void set_max_doc_id(vector<node*>);
- int next_geq(node*, int);
- void collect_queries(string);
- void show_queries();
+ int next_geq(int, int);
+ int collect_inverted_lists();
+ double calculate_rank(int);
+ int get_doc_length(int);
+ vector<Query> & get_queries();
+ vector<node*> & get_inverted_lists();
string get_url(int);
- void close_query_lists(vector<node*> &);
+ void collect_queries(string);
+ void set_max_doc_id();
+ void close_query_lists();
void reset();
void set_avg_doc_length();
- double calculate_rank(int, vector<node*> &);
- int get_doc_length(int);
- //void add_to_heap(string, int);
-
- // for all query processors
+ void add_to_heap(string, int);
static void clear_structures();
static int get_avg_doc_length();
static map<string, int> & get_lexicon();
private:
- // per object
void boot();
void load_lexicon();
void load_url_table();
- vector<Query> queries;
int num_queries;
int max_doc_id;
-
- // for all query processors
+ vector<Query> queries;
+ vector<node*> inverted_lists;
static map<string, int> lexicon;
static map<int, page_stats> url_table;
static int avg_doc_length;
View
1  docs/.gitignore
@@ -1,2 +1,3 @@
*.jpg
*.txt
+*.png
View
78 main.cpp
@@ -10,43 +10,33 @@
#include <algorithm>
#include <string>
#include "Score.h"
+#include "MinHeap.h"
#include "QueryProcessor.h"
#include "Query.h"
+#define MAX_NUM_RESULTS 10
using namespace std;
string search_or_quit();
-bool operator<(const Score &, const Score &);
int main()
{
QueryProcessor query_processor;
+ MinHeap results_heap( MAX_NUM_RESULTS );
while( true ) {
int do_restart = 0;
- priority_queue<Score> results;
- vector<node*> inverted_lists;
+
string user_input = search_or_quit();
+
query_processor.collect_queries( user_input );
- // GET LIST POINTERS
- for( int i = 0; i < query_processor.get_num_queries(); i++ ) {
- node* head = query_processor.get_queries()[i].open_list( QueryProcessor::get_lexicon() );
-
- if( head == NULL ) {
- do_restart = 1;
- break;
- }
-
- inverted_lists.push_back( head );
- }
-
- if( do_restart == 1) {
+ do_restart = query_processor.collect_inverted_lists();
+ if( do_restart == 1 ) {
query_processor.reset();
- inverted_lists.clear();
continue;
}
- query_processor.set_max_doc_id( inverted_lists );
+ query_processor.set_max_doc_id();
// CALC SCORES USING BM25, nextGEQ, freq, add to heap etc, etc
int doc_id = 0;
@@ -58,68 +48,41 @@ int main()
if( query_processor.get_num_queries() == 1 ) {
while( doc_id <= max_doc_id ) {
- doc_id = query_processor.next_geq( inverted_lists[0], doc_id );
- bm25_score = query_processor.calculate_rank( doc_id, inverted_lists );
-
- if( results.size() < 10 ) {
- results.push( Score( query_processor.get_url( doc_id ), bm25_score ) );
- }
- else {
- if( results.top().get_score() < bm25_score ) {
- results.pop();
- results.push( Score( query_processor.get_url( doc_id ), bm25_score ) );
- }
- }
-
+ doc_id = query_processor.next_geq( 0, doc_id );
+ bm25_score = query_processor.calculate_rank( doc_id );
+ results_heap.push( Score( query_processor.get_url( doc_id ), bm25_score ) );
doc_id++;
}
}
else {
while( doc_id <= max_doc_id ) {
- doc_id = query_processor.next_geq( inverted_lists[0], doc_id );
+ doc_id = query_processor.next_geq( 0, doc_id );
if( doc_id == -1 )
break;
- for( int i = 1; i < query_processor.get_num_queries() && ( new_doc_id = query_processor.next_geq( inverted_lists[i], doc_id ) ) == doc_id; i++ );
+ for( int i = 1; i < query_processor.get_num_queries() && ( new_doc_id = query_processor.next_geq( i, doc_id ) ) == doc_id; i++ );
if( new_doc_id == -1 )
break;
-
else if( new_doc_id > doc_id )
doc_id = new_doc_id;
-
else {
-
- bm25_score = query_processor.calculate_rank( new_doc_id, inverted_lists );
-
- if( results.size() < 10 ) {
- results.push( Score( query_processor.get_url( new_doc_id ), bm25_score ) );
- }
- else {
- if( results.top().get_score() < bm25_score ) {
- results.pop();
- results.push( Score( query_processor.get_url( new_doc_id ), bm25_score ) );
- }
- }
-
+ bm25_score = query_processor.calculate_rank( new_doc_id );
+ results_heap.push( Score( query_processor.get_url( new_doc_id ), bm25_score ) );
doc_id++;
}
}
}
- while ( !results.empty() ) {
- cout << results.top().get_score() << " => " << results.top().get_url() << endl;
- results.pop();
- }
+ results_heap.display();
- query_processor.close_query_lists( inverted_lists );
+ query_processor.close_query_lists();
query_processor.reset();
- inverted_lists.clear();
+ cout << endl;
}
- cout << endl;
return 0;
}
@@ -137,8 +100,3 @@ string search_or_quit()
return user_input;
}
-
-bool operator<(const Score &a, const Score &b)
-{
- return a.get_score() < b.get_score();
-}
View
6 makefile
@@ -1,8 +1,8 @@
CXX= g++
CFLAGS=
-SRCS= Score.cpp Query.cpp QueryProcessor.cpp main.cpp
-OBJS= Score.o Query.o QueryProcessor.o main.o
-HDRS= page_stats.h Score.h QueryProcessor.h Query.h
+SRCS= MinHeap.cpp Score.cpp Query.cpp QueryProcessor.cpp main.cpp
+OBJS= MinHeap.o Score.o Query.o QueryProcessor.o main.o
+HDRS= page_stats.h MinHeap.h Score.h QueryProcessor.h Query.h
BINS= inQuery
all: $(BINS) done
Please sign in to comment.
Something went wrong with that request. Please try again.