In [1]:
'''

The words_before/after list would look like 

[['~', '598'],
 ['-', '143'],
 ... 
 

'''

def _load_index_rows(path):
    """Read ``path`` and return one whitespace-split token list per non-empty line."""
    with open(path, encoding="utf8") as handle:
        raw_lines = handle.read().split('\n')
    return [raw.split() for raw in raw_lines if raw]


# index rows before lower casing
words_before = _load_index_rows('index_before.txt')

# index rows after lower casing
words_after = _load_index_rows('index_after.txt')

In [2]:
# Number of index rows: 52788 before lower casing, 50966 after lower casing.
for index_rows in (words_before, words_after):
    print(len(index_rows))

52788
50966


In [3]:
#  pick the terms with the longest postings lists (used as stop words)
def stop_words(dictionary, top_n_words):
    """Return the ``top_n_words`` keys of ``dictionary`` with the longest values.

    Args:
        dictionary: mapping of term -> postings list (any sized value works).
        top_n_words: how many terms to return; values <= 0 give an empty list.

    Returns:
        A list of keys ordered from the longest to the shortest postings list.
        Keys with equally long lists keep the dictionary's iteration order,
        because ``sorted`` is stable even with ``reverse=True``.
    """
    ranked = sorted(dictionary, key=lambda term: len(dictionary[term]), reverse=True)
    # Clamp so that a negative count yields [] instead of slicing from the end.
    return ranked[:max(top_n_words, 0)]

def inverted_index(indexlist, stop_words_processing = False, query=None):
    """Build an inverted index from (term, doc ID) rows and report on it.

    Args:
        indexlist: iterable of rows whose first item is the term and whose
            second item is the document ID, e.g. ``['~', '598']``.
        stop_words_processing: when true, the 10 terms with the longest
            postings lists are removed before anything else is computed.
        query: optional term to look up in the (possibly filtered) index.

    Returns:
        ``None`` after printing the postings list when ``query`` is given and
        present in the index. In every other case (no query, or a query that
        is not in the index) a tuple ``(dictionary_size, total_postings)``.
    """
    # term -> list of doc IDs, in the order the rows were read
    vocab = {}
    for row in indexlist:
        vocab.setdefault(row[0], []).append(row[1])

    if stop_words_processing:
        # stop_words returns keys taken from vocab, so each one can be deleted directly
        for term in stop_words(vocab, 10):
            del vocab[term]

    if query and query in vocab:
        print(f"The doc ID(s) for your query '{query}' is   {vocab[query]}")
        return

    # sum of the lengths of the postings lists
    sum_len_values = sum(len(postings) for postings in vocab.values())
    return len(vocab), sum_len_values  # len(vocab) is the size of the dictionary

In [4]:
# dictionary size and postings total with every term kept
print(inverted_index(words_after, stop_words_processing=False))

# the same statistics once the 10 stop words are removed
print(inverted_index(words_after, stop_words_processing=True))

# single-term lookup against the stop-word-filtered index
print(inverted_index(words_after, stop_words_processing=True, query='school'))

(3684, 50966)
(3674, 43163)
The doc ID(s) for your query 'school' is   ['72', '111', '223', '224', '268', '385', '431', '494', '532', '553', '554', '564', '581', '582', '996']
None


In [9]:
'''
functions that intersect two posting lists at a time;
query() applies them across the terms of an AND query
'''
from itertools import zip_longest

def intersect(lst1, lst2):
    """Intersect two postings lists with a linear merge.

    Both lists are converted to ``int`` first (e.g. ``'2'`` -> ``2``). The
    merge walks the two lists in step, so it only finds every common doc ID
    when each list is in ascending order.

    Args:
        lst1: doc IDs as ints or numeric strings.
        lst2: doc IDs as ints or numeric strings.

    Returns:
        List of ``int`` doc IDs found by the merge, in ascending order.
        The number of comparisons performed is printed as a side effect.
    """
    postings1 = [int(doc) for doc in lst1]
    postings2 = [int(doc) for doc in lst2]

    res = []
    i = j = 0
    compare_count = 0

    while i < len(postings1) and j < len(postings2):
        compare_count += 1
        if postings1[i] == postings2[j]:
            res.append(postings1[i])
            i += 1
            j += 1
        elif postings1[i] < postings2[j]:
            i += 1
        else:
            # A plain else is required here: testing the truthiness of the
            # doc ID would stall the loop forever on a doc ID of 0.
            j += 1

    print("No.of comparison: ", compare_count)
    return res


def intersect_w_skip(lst1, lst2):
    """Intersect two postings lists using skip pointers.

    Both lists are converted to ``int`` first. Each list gets evenly spaced
    skip pointers: with ``P`` postings, every position that is a multiple of
    ``int(sqrt(P))`` can jump ahead by that many entries. A skip is taken only
    when the entry it lands on is still <= the value being searched for, so no
    match is jumped over as long as each list is strictly ascending.

    Args:
        lst1: doc IDs as ints or numeric strings, in ascending order.
        lst2: doc IDs as ints or numeric strings, in ascending order.

    Returns:
        List of ``int`` doc IDs present in both lists, in ascending order.
        The number of comparisons (one per merge step plus one per skip
        pointer inspected) is printed as a side effect.
    """
    postings1 = [int(doc) for doc in lst1]
    postings2 = [int(doc) for doc in lst2]

    # Skip distance per list; a distance of 0 or 1 means "no skip pointers".
    skip1 = int(len(postings1) ** 0.5)
    skip2 = int(len(postings2) ** 0.5)

    def advance(postings, pos, skip, target):
        """Move past postings[pos] (known to be < target); return (new_pos, probes)."""
        probes = 0
        start = pos
        # Follow skip pointers while they exist and do not overshoot target.
        while skip > 1 and pos % skip == 0 and pos + skip < len(postings):
            probes += 1
            if postings[pos + skip] > target:
                break
            pos += skip
        if pos == start:
            # No skip was taken, so fall back to a single step.
            pos += 1
        return pos, probes

    res = []
    i = j = 0
    compare_count = 0

    while i < len(postings1) and j < len(postings2):
        compare_count += 1
        if postings1[i] == postings2[j]:
            res.append(postings1[i])
            i += 1
            j += 1
        elif postings1[i] < postings2[j]:
            i, probes = advance(postings1, i, skip1, postings2[j])
            compare_count += probes
        else:
            j, probes = advance(postings2, j, skip2, postings1[i])
            compare_count += probes

    print("No.of comparison: ", compare_count)
    return res


def query(indexlist):
    """Prompt for an ``a AND b [AND c ...]`` query and return the matching doc IDs.

    The index is rebuilt from ``indexlist`` and the 10 terms with the longest
    postings lists are dropped as stop words before the query is answered.

    Args:
        indexlist: iterable of rows whose first item is the term and whose
            second item is the document ID.

    Returns:
        List of ``int`` doc IDs that contain every query term, or ``None``
        (after printing a message) when a term is not in the vocabulary.
    """
    # term -> list of doc IDs
    vocab = {}
    for row in indexlist:
        vocab.setdefault(row[0], []).append(row[1])

    # remove the top 10 stop words
    for term in stop_words(vocab, 10):
        del vocab[term]

    user_input = input("Enter an intersection query (format: a AND b): ")
    # Lower-case so 'AND' and 'and' both work as the separator, then strip
    # stray whitespace so ' school' still matches 'school'.
    terms = [term.strip() for term in user_input.lower().split(' and ')]

    for term in terms:
        if term not in vocab:
            print(f"Query entry '{term}' not in vocabulary. Try again.")
            return

    # Shortest postings list first keeps the intermediate results small.
    postings = sorted((vocab[term] for term in terms), key=len)

    # Start from the shortest list (as ints, matching what intersect returns)
    # and fold in every remaining list, including the last one.
    res = [int(doc) for doc in postings[0]]
    for other in postings[1:]:
        res = intersect(res, other)

    return res

In [10]:
# Run the interactive AND query; example input to type at the prompt:
# school AND kids AND really
query(words_after)

Enter an intersection query (format: a AND b):  school and kids


72 32
111 72
223 175
224 224
268 336
385 385
431 459
494 460
532 475
553 523
554 538
564 539
581 553
582 583
996 584
None 593
None 613
None 786
None 806
None 970


In [11]:
# Ranked queries for the query 'school AND kids AND really' 

with open('index_rank.txt', encoding = "utf8") as f:
    # after lower casing 
    words_after_rank = [line.split() for line in f.read().split('\n') if line]


# Convert the two numeric columns of every row to int. Iterate over
# words_after_rank itself: its length need not match words_after.
# Column 2 is compared against doc IDs later on; the meaning of column 0 is
# not visible here -- TODO confirm against the file format.
for row in words_after_rank:
    row[0] = int(row[0])
    row[2] = int(row[2])

In [12]:
from math import log
import pandas as pd
import numpy as np

In [13]:
# Documents to build the term table for.
doc_id = ['72', '224', '385']
len_total_doc = len(doc_id)

# Integer form of the IDs, because column 2 of words_after_rank holds ints.
target_doc_ids = {int(d) for d in doc_id}

# Sorted, de-duplicated terms that occur in any of the selected documents.
vocab = sorted({row[1] for row in words_after_rank if row[2] in target_doc_ids})

# One row per document: a 'doc_id' column followed by one (still empty)
# column per vocabulary term.
df1 = pd.DataFrame({'doc_id': doc_id})
df = pd.DataFrame(vocab).T
df = df.rename(columns=df.iloc[0])
df = df.drop(df.index[0])
df = pd.concat([df1, df])

# TODO: fill the term columns for each document (e.g. tf-idf weights).




In [14]:
# Display every column except the first one.
df.iloc[:, 1:]

Unnamed: 0,',",",-,.,...,?,a,always,anyone,bad,....1,was,we,well,what',you,you',–,“,”,…
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# 1. skip pointer 
# 2. tf-idf 