In [1]:
import re
from bs4 import BeautifulSoup
import pickle
import pandas as pd
from nltk.tokenize import word_tokenize
from os import listdir
from os.path import isfile, join
from tqdm import tqdm_notebook as tqdm
pd.set_option('display.max_colwidth', -1)

In [9]:
def casefolding(data):
    return data.lower()

def get_sentences(result):
    doc   = open("test-collection/cacm/"+ result + ".html","r")
    data = doc.read()
    data = BeautifulSoup(data, "lxml").text
    data = casefolding(data)
    sentences  = re.split(r"\.[\s\n]+ | \n",data)
    sentences = [s.replace("\n"," ").replace("\t"," ").replace("\d+"," ") for s in sentences]
    return sentences

def getDocWordFreq(word,document):
    value = 0
    if unigram_invertedlist_count.get(word):
        for val in unigram_invertedlist_count[word][1]:
            if(val[0]==document):
                value = val[1]
    return value

def calculate_significant_words(sentences, document, \
                                query_words, inverted_index):
    
    significant_words = []
    words             = []
    sd                = len(sentences)
    
    if (sd < 25): thresh = 4 - 0.1 * (25 - sd)
    
    elif ((25 <= sd) and (sd <= 40)): thresh = 4
    
    else: thresh = 4 + 0.1 * (sd - 40)
    
    for sentence in sentences:
        words.extend(word_tokenize(sentence))
    
    for word in words:   
        f_dw = getDocWordFreq(word, document)
        if f_dw >= thresh: significant_words.append(word)
            
    significant_words.extend(query_words)
    significant_words = list(set(significant_words))


    return significant_words


def calculate_significance_factor(sentences, significant_words):
    
    word_count = 0
    significance_factor = {}
    
    for sentence in sentences:
        tokenized_sentence          = word_tokenize(sentence)
        filtered_significance_words = set(tokenized_sentence)\
                                        .intersection(set(significant_words))
        min_index = 100000;
        max_index = -1;
        
        filtered_significance_words = [x for x in filtered_significance_words if x not in commonwords]
    
        for token in tokenized_sentence:
            #print("token",token)
            if token in filtered_significance_words: 
                
                new_max_index = max([i for i, x in enumerate(tokenized_sentence) if x == token])
                new_min_index = min([i for i, x in enumerate(tokenized_sentence) if x == token])
                
                if(min_index > new_min_index):
                    min_index = new_min_index
                
                if(max_index < new_max_index):
                    max_index = new_max_index
                
                text_span = (max_index - min_index) + 0.0001
        
                count = sum([1 for w in tokenized_sentence[min_index: max_index] if w in filtered_significance_words])
                significance_factor[sentence] = count**2/text_span
    
    return significance_factor



In [20]:
def get_text_summary(results, query):
    query_words     = query.split(' ')
    query_words     = [x for x in query_words if x not in commonwords]
    significance_df = pd.DataFrame()
    
    
    for result in results:
        significance_dict = []
        sentences         = get_sentences(result)
        significant_words = calculate_significant_words(sentences, 
                                                        result, query_words, 
                                                        inverted_index)
        
        for sent, score in calculate_significance_factor(sentences, significant_words).items():
            
            for query_word in query_words:
                
                sent = sent.replace(query_word, '<b> ' + query_word + '</b>')
        
            significance_dict.append({'result' : result, 
                                        'significance_factor': score,
                                         'sentence': sent})
        #print('\n\n\n')
#             print(significance_dict)
        #print(pd.DataFrame.from_dict(significance_dict).sort_values('significance_factor', ascending=False)[:3])
        
        significance_df = significance_df.append(pd.DataFrame.from_dict(significance_dict).sort_values('significance_factor', ascending=False)[:3])
    
    return significance_df

def get_query_id_map():
    
    with open('./test-collection/cacm.query.txt') as f:
        queries = f.read()
    
    query_ids = re.findall(r'<DOCNO> \d+ </DOCNO>', queries)
    query_ids = [re.findall(r'\d+', q)[0] for q in query_ids]
    
    queries = re.split(r'</DOC>', queries)
    queries = [l.replace('</DOCNO>', '').replace('\n', ' ')\
               .replace('</DOC>', '').replace('<DOC>', '')\
               .replace('<DOCNO>', '') for l in queries]
    
    #queries = [re.sub(r'^\d*\s\s', '',l) for l in queries]

    queries = [re.sub(r'\s{2,5}', '',l) for l in queries]
    queries = [re.sub(r'\d{1,2}', '',l) for l in queries]
    
    queries  = pd.DataFrame({'query_ids': query_ids, 
                             'queries': queries[:-1]}).set_index('query_ids')
    
    return query_ids,queries




def iterate_queries(queries, path = 'baseline-runs/task1-bm25'):
    
    queries_ids = [f.split('.')[0].replace('Q', '') for f in listdir(path) if isfile(join(path, f))]

    print(queries_ids)
    for query_id in tqdm(queries_ids):
        print(query_id)
        query_results = pd.read_csv(path + 'Q' + query_id + '.txt', sep='\t', names = ['qid', 'Q0', 'doc_id',
                                                                                         'rank','score', 'system'])
       
        get_text_summary(list(query_results['doc_id']),\
                         queries.loc[queries.index == query_id, 'queries'][query_id])[['result', 'sentence']]\
                         .to_html('phase2-output/snippets/' + query_id + '.html',
                                  index = False, escape = False, border = 0, max_rows = None, max_cols = None)
        print(list(query_results['doc_id']))
        
        
    
    return

In [21]:
# significance_factor = calculate_significance_factor(sentences, significant_words)
unigram_invertedlist_count = {}
inverted_index = {}
commonwords = []
query_ids,queries = get_query_id_map()
queries.to_csv('queries.csv')
with open ('reusable_data/unigram_invertedlist_count.pkl', 'rb') as f: 
    unigram_invertedlist_count = pickle.load(f)
    
with open('reusable_data/inverted_index.pkl', 'rb') as f:
    inverted_index = pickle.load(f)
commonwords     = open('test-collection/common_words', "r")
commonwords     = commonwords.read().split('\n')

In [22]:
# significance_factor = calculate_significance_factor(sentences, significant_words)
iterate_queries(queries, path = 'baseline-runs/task1-bm25/')


['38', '10', '11', '39', '13', '12', '16', '', '9', '8', '17', '15', '29', '28', '14', '58', '64', '59', '61', '49', '48', '60', '62', '63', '46', '52', '53', '47', '51', '45', '44', '50', '54', '40', '41', '55', '43', '57', '56', '42', '19', '25', '31', '6', '7', '30', '24', '18', '32', '26', '5', '4', '27', '33', '37', '23', '1', '22', '36', '20', '34', '3', '2', '35', '21']


A Jupyter Widget

38
['CACM-2867', 'CACM-3031', 'CACM-2941', 'CACM-2470', 'CACM-3105', 'CACM-1698', 'CACM-3177', 'CACM-2582', 'CACM-0595', 'CACM-3060', 'CACM-3142', 'CACM-2247', 'CACM-2931', 'CACM-2989', 'CACM-1861', 'CACM-3148', 'CACM-0497', 'CACM-2082', 'CACM-2970', 'CACM-3054', 'CACM-2139', 'CACM-3162', 'CACM-1637', 'CACM-3033', 'CACM-2579', 'CACM-1323', 'CACM-1489', 'CACM-0321', 'CACM-2400', 'CACM-1873', 'CACM-2939', 'CACM-0483', 'CACM-0867', 'CACM-3140', 'CACM-2815', 'CACM-0718', 'CACM-2356', 'CACM-2305', 'CACM-2609', 'CACM-1352', 'CACM-2369', 'CACM-2986', 'CACM-1643', 'CACM-2912', 'CACM-1931', 'CACM-2876', 'CACM-2958', 'CACM-3014', 'CACM-2705', 'CACM-1359', 'CACM-3069', 'CACM-1382', 'CACM-1536', 'CACM-2957', 'CACM-3103', 'CACM-2733', 'CACM-0670', 'CACM-2390', 'CACM-1248', 'CACM-1867', 'CACM-2327', 'CACM-2309', 'CACM-1769', 'CACM-3052', 'CACM-2109', 'CACM-3009', 'CACM-2956', 'CACM-1958', 'CACM-2184', 'CACM-3179', 'CACM-2707', 'CACM-2480', 'CACM-2750', 'CACM-0492', 'CACM-1572', 'CACM-3008', 'CACM-28

KeyError: 'significance_factor'