In [620]:
import re
from bs4 import BeautifulSoup
import pickle
import pandas as pd
from nltk.tokenize import word_tokenize
from os import listdir
from os.path import isfile, join
from tqdm import tqdm_notebook as tqdm
# Show full cell contents when rendering DataFrames. `None` is the supported
# "no limit" value; the legacy `-1` sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

In [266]:
def get_sentences(result):
    """Read one document of the CACM collection and split its text into sentences.

    Parameters
    ----------
    result : str
        Document identifier; the file ``test-collection/cacm/<result>.html``
        is read.

    Returns
    -------
    list of str
        Sentence strings with every newline and tab replaced by a space.
    """
    # The context manager closes the handle even if reading fails; this
    # function runs once per retrieved document, so an unclosed handle adds up.
    with open("test-collection/cacm/" + result + ".html", "r") as doc:
        data = doc.read()

    # Strip the markup and keep only the visible text.
    data = BeautifulSoup(data, "lxml").text
    # `casefolding` is defined outside the visible cells — presumably it
    # lower-cases the text; verify against its definition.
    data = casefolding(data)

    # NOTE(review): without re.VERBOSE the spaces inside this pattern are
    # literal, so it splits on "." + whitespace + " " or on " " + newline.
    # Kept byte-identical; confirm that this is the intended sentence boundary.
    sentences = re.split(r"\.[\s\n]+ | \n", data)
    return [s.replace("\n", " ").replace("\t", " ") for s in sentences]

In [257]:
def getDocWordFreq(word,document):
    """Look up the frequency recorded for `word` in `document`.

    Reads the module-level ``unigram_invertedlist_count`` mapping. The second
    element of a word's entry is iterated as a sequence of pairs whose first
    item is compared with `document` and whose second item is returned.

    Returns 0 when the word has no (truthy) entry or no pair matches; when
    several pairs match, the last one wins.
    """
    entry = unigram_invertedlist_count.get(word)
    if not entry:
        return 0

    matches = [posting[1] for posting in entry[1] if posting[0] == document]
    return matches[-1] if matches else 0


def calculate_significant_words(sentences, document,
                                query_words, inverted_index):
    """Select the words treated as significant for one document.

    A token is significant when its frequency in `document` (as reported by
    ``getDocWordFreq``) reaches a threshold that depends on the number of
    sentences ``sd``:

    * ``sd < 25``        -> ``4 - 0.1 * (25 - sd)``
    * ``25 <= sd <= 40`` -> ``4``
    * ``sd > 40``        -> ``4 + 0.1 * (sd - 40)``

    All `query_words` are always included.

    Parameters
    ----------
    sentences : list of str
        Sentences of the document; each is tokenized with ``word_tokenize``.
    document : str
        Document identifier passed through to ``getDocWordFreq``.
    query_words : iterable of str
        Query terms to add unconditionally.
    inverted_index : object
        Unused; kept so existing callers keep working.

    Returns
    -------
    list of str
        Distinct significant words, in no particular order.
    """
    sd = len(sentences)

    if sd < 25:
        thresh = 4 - 0.1 * (25 - sd)
    elif sd <= 40:
        thresh = 4
    else:
        thresh = 4 + 0.1 * (sd - 40)

    # Deduplicate before the frequency lookup: the result is a set anyway and
    # each lookup scans a posting list, so repeated tokens were wasted work.
    vocabulary = set()
    for sentence in sentences:
        vocabulary.update(word_tokenize(sentence))

    significant_words = {word for word in vocabulary
                         if getDocWordFreq(word, document) >= thresh}
    significant_words.update(query_words)

    return list(significant_words)

In [259]:
def calculate_significance_factor(sentences, significant_words):
    """Score each sentence by the density of its significant words.

    For a sentence, the bracket runs from the first to the last token that is
    a significant word and not a stop word (stop words come from the
    module-level ``commonwords``). The score is::

        (number of significant tokens in the bracket) ** 2
        --------------------------------------------------
               (number of tokens in the bracket)

    Both ends of the bracket are included, so a sentence with a single
    significant word scores 1.0.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to score; each is tokenized with ``word_tokenize``.
    significant_words : iterable of str
        Candidate significant words for the document.

    Returns
    -------
    dict
        Maps sentence text to its score. Sentences without any significant
        token are omitted; if the same sentence text occurs twice, the later
        occurrence overwrites the earlier one.
    """
    # Filter stop words once and keep a set for O(1) membership tests.
    scoring_words = {w for w in significant_words if w not in commonwords}

    significance_factor = {}

    for sentence in sentences:
        tokenized_sentence = word_tokenize(sentence)
        positions = [i for i, token in enumerate(tokenized_sentence)
                     if token in scoring_words]
        if not positions:
            continue

        # Inclusive bracket: the last significant word counts both towards the
        # span length and towards the number of significant words.
        text_span = positions[-1] - positions[0] + 1
        count = len(positions)
        significance_factor[sentence] = count ** 2 / text_span

    return significance_factor

In [260]:
# Score the sentences of a single document with the function defined above.
# NOTE(review): `sentences` and `significant_words` are not assigned at top
# level in any visible cell — this line relies on existing kernel state and
# will raise NameError on a fresh "Restart & Run All"; confirm or remove.
significance_factor = calculate_significance_factor(sentences, significant_words)

In [600]:
def get_text_summary(results, query):
    """Build query-biased snippets: the three best-scoring sentences per document.

    Parameters
    ----------
    results : iterable of str
        Document identifiers, each passed to ``get_sentences``.
    query : str
        Query text; it is split on single spaces and stop words are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``result``, ``significance_factor`` and ``sentence``. Each
        document contributes at most three rows, ordered by descending score,
        with every query word in ``sentence`` wrapped as ``<b> word</b>``.
        Documents without any scored sentence contribute no rows; with no rows
        at all an empty frame with these columns is returned.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load index
    # files that were produced locally by this project.
    with open('unigram_invertedlist_count.pkl', 'rb') as f:
        unigram_invertedlist_count = pickle.load(f)

    # Loaded only to be handed to calculate_significant_words.
    with open('inverted_index.pkl', 'rb') as f:
        inverted_index = pickle.load(f)

    # Context manager so the stop-word file handle is closed again.
    with open('test-collection/common_words', "r") as f:
        commonwords = f.read().split('\n')

    # The helper functions read `unigram_invertedlist_count` and `commonwords`
    # from the module namespace. Publish the freshly loaded values when those
    # names are not defined yet, so a fresh kernel does not fail with
    # NameError; existing definitions are left untouched.
    module_namespace = globals()
    module_namespace.setdefault('unigram_invertedlist_count', unigram_invertedlist_count)
    module_namespace.setdefault('commonwords', commonwords)

    # Skip empty tokens (from repeated spaces): str.replace with an empty
    # needle would insert the highlight markup between every character.
    query_words = [x for x in query.split(' ') if x and x not in commonwords]

    columns = ['result', 'significance_factor', 'sentence']
    top_frames = []

    for result in results:
        sentences         = get_sentences(result)
        significant_words = calculate_significant_words(sentences,
                                                        result, query_words,
                                                        inverted_index)

        rows = []
        for sent, score in calculate_significance_factor(sentences, significant_words).items():
            for query_word in query_words:
                sent = sent.replace(query_word, '<b> ' + query_word + '</b>')

            rows.append({'result': result,
                         'significance_factor': score,
                         'sentence': sent})

        # Nothing scored for this document: sorting a column-less frame would
        # raise KeyError, so skip it.
        if not rows:
            continue

        top_frames.append(pd.DataFrame(rows, columns=columns)
                            .sort_values('significance_factor', ascending=False)
                            .head(3))

    if not top_frames:
        return pd.DataFrame(columns=columns)

    # One concat instead of DataFrame.append inside the loop (append was
    # removed in pandas 2.0 and copied the accumulated frame on every call).
    return pd.concat(top_frames)

In [504]:
def get_query_id_map(path='./test-collection/cacm.query.txt'):
    """Parse the query file into query ids and a DataFrame of query texts.

    The file is expected to contain ``<DOC> ... </DOC>`` records, each with a
    ``<DOCNO> n </DOCNO>`` tag followed by the query text.

    Parameters
    ----------
    path : str, optional
        Location of the query file; defaults to the path used by the notebook.

    Returns
    -------
    query_ids : list of str
        Query ids in file order.
    queries : pandas.DataFrame
        Indexed by ``query_ids`` with one column ``queries`` holding the query
        text, whitespace-normalized to single spaces.
    """
    with open(path) as f:
        raw = f.read()

    query_ids = []
    texts = []

    # Handle one record at a time so each id is paired with its own text
    # rather than relying on two separately built lists lining up.
    for record in re.findall(r'<DOC>(.*?)</DOC>', raw, flags=re.DOTALL):
        docno = re.search(r'<DOCNO>\s*(\d+)\s*</DOCNO>', record)
        if docno is None:
            continue  # a record without an id cannot be looked up later

        query_ids.append(docno.group(1))

        # Remove only the DOCNO tag: digits inside the query text are kept.
        body = record[:docno.start()] + ' ' + record[docno.end():]
        # Collapse whitespace runs to one space instead of deleting them,
        # which would glue neighbouring words together.
        texts.append(' '.join(body.split()))

    queries = pd.DataFrame({'query_ids': query_ids,
                            'queries': texts}).set_index('query_ids')

    return query_ids, queries

In [505]:
# Parse the query file once; `queries` (indexed by query id) is what
# iterate_queries looks query texts up in.
query_ids,queries = get_query_id_map()
# Persist the id -> text table as CSV in the working directory.
queries.to_csv('queries.csv')

In [624]:
def iterate_queries(queries, path = './task1-JMQL/'):
    """Write one HTML snippet page per ranked-result file found in `path`.

    Every file named ``Q<id>.txt`` in `path` is read as a tab-separated run
    file. Its ``doc_id`` column and the text of query ``<id>`` are passed to
    ``get_text_summary``, and the ``result`` / ``sentence`` columns of the
    outcome are written to ``./snippets/<id>.html``.

    Parameters
    ----------
    queries : pandas.DataFrame
        Indexed by query id (as strings) with a ``queries`` column.
    path : str, optional
        Directory holding the ``Q<id>.txt`` run files; a trailing slash is
        not required.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a run file's id is not present in ``queries``.
    """
    # Local import: the notebook's import cell only pulls `listdir` from os.
    from os import makedirs

    # Only accept real run files; anything else in the directory (e.g.
    # ".DS_Store", checkpoints) would otherwise turn into a bogus query id.
    run_file = re.compile(r'Q(\d+)\.txt')
    queries_ids = []
    for f in listdir(path):
        matched = run_file.fullmatch(f)
        if matched and isfile(join(path, f)):
            queries_ids.append(matched.group(1))

    # to_html does not create missing directories.
    makedirs('./snippets', exist_ok=True)

    for query_id in tqdm(queries_ids):

        query_results = pd.read_csv(join(path, 'Q' + query_id + '.txt'), sep='\t',
                                    names = ['qid', 'dumb_thing', 'doc_id',
                                             'rank','score', 'system'])

        query_text = queries.loc[query_id, 'queries']
        summary    = get_text_summary(list(query_results['doc_id']), query_text)

        # escape=False keeps the <b> highlight markup intact in the page.
        summary[['result', 'sentence']]\
            .to_html('./snippets/' + query_id + '.html',
                     index = False, escape = False, border = 0, max_rows = None, max_cols = None)

    return


In [625]:
iterate_queries(queries, path = './task1-JMQL/')

A Jupyter Widget




In [358]:
def write_snippets(significance_df, query):
    """Dump the snippet sentences of a query to ``snippets/<query>.results``.

    Documents are visited in order of first appearance in the ``result``
    column; for each one its ``sentence`` values are written in row order,
    back to back with no separator added.
    """
    target = 'snippets/' + query + '.results'
    doc_ids = significance_df['result'].unique()

    with open(target, 'w') as out:
        for doc_id in doc_ids:
            doc_rows = significance_df.loc[significance_df['result'] == doc_id]
            for _, row in doc_rows.iterrows():
                out.write(row['sentence'])



by providing efficient interprocess communication primitives.  cacm february, 1979  cheriton, d. malcolm, m. melen, l. sager, g.  portability, real time, operating systems, minicomputer  3.80 4.30 4.35  ca790206 dh april 12, 1979  9:10 am  2319 4 3127 2378 4 3127 2320 4 3127 2632 4 3127 2738 4 3127 2740 4 3127 2868 4 3127 2928 4 3127 3127 4 3127 3127 4 3127 2080 5 3127 2277 5 3127 3127 5 3127 3127 5 3127 3127 5 3127    
    thoth, a portable real-time operating system
 thoth isa real-time operating system which is designed to be portable over a large set of machines


chairman  cacm june, 1961  bright, h. s.  ca610603 jb march 16, 1978  10:55 pm  322 5 322 322 5 322 322 5 322    
the problem of operating a computer efficiently in view of the growing number of programming systems
incompatibilities are currently resolved by manually setting up the computer for each system as required


of the system is with an interpretive translator on an ibm 1620 computer.  cacm july, 1964  hellerman