In [None]:
"""
  Part 1: Preprocess the data
  
  1. Read in qrel file so we have a set of relevant and non-relevant docs for each query
  2. Using query-processing notebook, run es search for all queries from HW1 to get a set of 2000 docs
  3. Create a pandas df for each query-docid pair and include colums for:
      - query-docid
      - bm25
      - laplace
      - jm
      - tf_idf
      - okapi

"""

In [13]:
import pandas as pd

In [14]:
# functions for data preprocessing

def read_qrel_file(file_path):
    """
      Read qrel file into dictionary structure of relevant and non-relevant
      docs for each query id
      
      query id --> dict: relevant set and nonrelevant set
    """
    data = {}
    with open(file_path, encoding="ISO-8859-1", errors='ignore') as f:
        for line in f:
            # q_id 0 doc_id relevance
            line_list = line.split()
            q_id = line_list[0]
            doc_id = line_list[2]
            rel = line_list[3]
            if q_id in queries:
                if q_id not in data:
                    data[q_id] = {'rel' : set([]), 'non-rel' : set([])}
                if rel == '1':
                    data[q_id]['rel'].add(doc_id)
                elif rel == '0':
                    data[q_id]['non-rel'].add(doc_id)
    f.close()
    return data


def expand_qrel_data(file_path):
    """
      Read in an es results file and expand the qrel nonrelevant docs set for each query
      with docs that are not already included until we reach a set of 1000. 
    """
    with open(file_path, encoding="ISO-8859-1", errors='ignore') as f:
        for line in f:
            line_list = line.split()
            q_id = line_list[0]
            doc_id = line_list[2]
            rank = line_list[3]
            score = line_list[4]
            if len(qrel_data[q_id]['non-rel']) < 1000:
                if doc_id not in qrel_data[q_id]['non-rel']:
                    qrel_data[q_id]['non-rel'].add(doc_id)
    f.close()
    
def create_dataframe():
    """
      Create an initial state for the dataframe with each query-docid pair and label.  
    """
    df = pd.DataFrame(columns=('query-docid', 'q_id', 'label'))

    k = 0
    for key, value in qrel_data.items():
        rels = list(value['rel'])
        nonrels = list(value['non-rel'])

        for i in range(len(rels)):
            values_to_add = {'query-docid': str(key + "-" + rels[i]), 'q_id': str(key), 'label': 1}
            row_to_add = pd.Series(values_to_add, name=k)
            df = df.append(row_to_add)
            k += 1
        for j in range(len(nonrels)):
            values_to_add = {'query-docid': str(key + "-" + nonrels[j]), 'q_id': str(key), 'label': 0}
            row_to_add = pd.Series(values_to_add, name=k)
            df = df.append(row_to_add)
            k += 1
    return df


def create_score_dict(file_path):
    """
      Create a dictionary maping the query-docid pair to the feature (score)
      from the model file that is passed in. 
    """
    feature_dict = {}
    with open(file_path, encoding="ISO-8859-1", errors='ignore') as f:
        for line in f:
            line_list = line.split()
            q_id = line_list[0]
            doc_id = line_list[2]
            score = line_list[4]
            pair = q_id + "-" + doc_id
            feature_dict[pair] = score
    f.close()
    return feature_dict
            
            
   

In [15]:
""" Main Code """

# the set of query ids that we are working with
queries = set(['85', '59', '56', '71', '64', '62', '93', '99', '58', '77', '54', '87', '94',
              '100', '89', '61', '95', '68', '57', '97', '98', '60', '80', '63', '91'])

# data from qrel file
qrel_data = read_qrel_file('C:/6200-IR/homework-6-mplatt27/qrels.adhoc.51-100.AP89.txt')

# use es search of 2000 files per query to expand sets
expand_qrel_data('C:/6200-IR/homework-6-mplatt27/es_ranking_model_results/es_results_2000.txt')

# check that the non-rel set is size 1000 for each query
for key, value in qrel_data.items():
    print("query: ", key, ", ", "non-rel docs: ", len(value['non-rel']))

query:  54 ,  non-rel docs:  1000
query:  56 ,  non-rel docs:  1000
query:  57 ,  non-rel docs:  1000
query:  58 ,  non-rel docs:  1000
query:  59 ,  non-rel docs:  1000
query:  60 ,  non-rel docs:  1000
query:  61 ,  non-rel docs:  1000
query:  62 ,  non-rel docs:  1000
query:  63 ,  non-rel docs:  1000
query:  64 ,  non-rel docs:  1000
query:  68 ,  non-rel docs:  1000
query:  71 ,  non-rel docs:  1000
query:  77 ,  non-rel docs:  1000
query:  80 ,  non-rel docs:  1000
query:  85 ,  non-rel docs:  1000
query:  87 ,  non-rel docs:  1000
query:  89 ,  non-rel docs:  1000
query:  91 ,  non-rel docs:  1000
query:  93 ,  non-rel docs:  1000
query:  94 ,  non-rel docs:  1000
query:  95 ,  non-rel docs:  1000
query:  97 ,  non-rel docs:  1000
query:  98 ,  non-rel docs:  1000
query:  99 ,  non-rel docs:  1000
query:  100 ,  non-rel docs:  1000


In [16]:
# create dictionaries for each docs --> score for each of the features that we want to add by reading in files
bm25_scores = create_score_dict('C:/6200-IR/homework-6-mplatt27/es_ranking_model_results/okapi_bm25_results.txt')
laplace_scores = create_score_dict('C:/6200-IR/homework-6-mplatt27/es_ranking_model_results/unigram_lm_laplace_results.txt')
jm_scores = create_score_dict('C:/6200-IR/homework-6-mplatt27/es_ranking_model_results/unigram_lm_jm_results.txt')
tf_idf_scores = create_score_dict('C:/6200-IR/homework-6-mplatt27/es_ranking_model_results/tfidf_results.txt')
okapi_scores = create_score_dict('C:/6200-IR/homework-6-mplatt27/es_ranking_model_results/okapi_tf_results.txt')

In [17]:
# create the pandas data frame
features = create_dataframe()
features.head()

Unnamed: 0,query-docid,q_id,label
0,54-AP890306-0169,54,1
1,54-AP890328-0062,54,1
2,54-AP890622-0082,54,1
3,54-AP890118-0061,54,1
4,54-AP891220-0132,54,1


In [18]:
# add features to df
pairs = features['query-docid'].tolist()
bm25_col = []
laplace_col = []
jm_col = []
tfidf_col = []
okapi_col = []
for i in range(len(pairs)):
    bm25_col.append(bm25_scores.get(pairs[i],0))
    laplace_col.append(laplace_scores.get(pairs[i],-5000))
    jm_col.append(jm_scores.get(pairs[i],-5000))
    tfidf_col.append(tf_idf_scores.get(pairs[i],0))
    okapi_col.append(okapi_scores.get(pairs[i],0))
    
features['bm25'] = bm25_col
features['laplace'] = laplace_col
features['jm'] = jm_col
features['tfidf'] = tfidf_col
features['okapi'] = okapi_col

features.head()

Unnamed: 0,query-docid,q_id,label,bm25,laplace,jm,tfidf,okapi
0,54-AP890306-0169,54,1,27.93181923959083,-92.00940045264448,-40.25917339337422,11.417692169513714,2.178448785756117
1,54-AP890328-0062,54,1,15.629973021595433,-93.46886897075817,-44.08130350267189,6.20699874050587,1.3565806572635963
2,54-AP890622-0082,54,1,25.71216470891023,-93.21181583683192,-40.934508048381446,10.204884857531145,1.9216976291600212
3,54-AP890118-0061,54,1,26.097545389194455,-91.50404564446,-41.498398578603016,10.33264168893027,2.038043524462453
4,54-AP891220-0132,54,1,23.73257361057009,-94.0214553894942,-41.175104323512535,9.069492504901008,1.7875537855093042


In [19]:
# save to csv
features.to_csv('C:/6200-IR/homework-6-mplatt27/features.csv')