In [1]:
import pandas as pd
import string
import math 
import numpy as np 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


pd.set_option('display.max_colwidth', 0)
df = pd.read_csv("publications_data.csv")

# Build one text field per publication from author name, title and abstract.
# If any of the three parts is missing, the concatenated value is NaN as well.
df['doc_content'] = df['name'] + ' ' + df['doc_title'] + " " + df['doc_abstract']
# Drop every row that has a missing value in any column (doc_content included).
df = df.dropna()
# Titles without any whitespace character are single-word titles; remove them.
# The pattern is a raw string: '\s' in a normal literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
one_word_titles = df[df['doc_title'].str.count(r'\s') == 0]['doc_title']
df = df[~df['doc_title'].isin(one_word_titles)]
# Renumber the remaining rows 0..n-1 and discard the old index in one step,
# so that the later label-based access df.loc[i, ...] lines up with range(len(df)).
df = df.reset_index(drop=True)

# Same rows as df but without the raw text; the cleaned text is added later.
df_final = df.drop(columns=['doc_content'])

# Cache for the stopword set so the corpus is read once, not once per document.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_docs(i, doc):
    """Normalise one document and store it in ``df_final.loc[i, 'doc_content']``.

    The text is split on whitespace; each word is lower-cased and its hyphens
    are replaced by spaces; words found in the English stopword list are
    dropped. The surviving words are joined with single spaces and every
    character in ``string.punctuation`` is removed from the result.

    NOTE: stopwords are filtered *before* punctuation is stripped, so a
    stopword with punctuation attached (e.g. ``"the,"``) is not recognised
    as a stopword and survives as ``"the"``.

    Args:
        i: row label in ``df_final`` that receives the cleaned text.
        doc: raw document text (a string).

    Returns:
        The cleaned text that was written to ``df_final``.
    """
    stops = _english_stopwords()

    kept = []
    for word in doc.split():
        word = word.lower().replace('-', ' ')
        if word not in stops:
            kept.append(word)

    final = " ".join(kept)
    final = final.translate(str.maketrans("", "", string.punctuation))
    df_final.loc[i, 'doc_content'] = final
    return final

# Clean every row's raw text; clean_docs writes the result into df_final.
for row in range(len(df)):
    clean_docs(row, df.loc[row, 'doc_content'])

# Cleaned texts in row order, as a plain Python list.
documents = df_final['doc_content'].tolist()

In [21]:
def tokenize(doc):
    """Split ``doc`` into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(doc)

def index_doc(doc):
    """Map every token of a tokenised document to the positions where it occurs.

    Args:
        doc: iterable of hashable tokens, in document order.

    Returns:
        dict from token to the ascending list of 0-based positions at which
        that token appears in ``doc``.
    """
    positions = {}
    for offset, token in enumerate(doc):
        # setdefault creates the position list the first time a token is seen.
        positions.setdefault(token, []).append(offset)
    return positions

In [23]:
# Tokenise every cleaned document; final_docs[k] is the token list of documents[k].
final_docs = [tokenize(text) for text in documents]

In [28]:
# final_docs[0]

In [38]:
# Positional inverted index: term -> {document id -> positions of the term}.
inverted_index = {}

for doc_id, tokens in enumerate(final_docs):
    # index_doc yields token -> positions within this single document.
    for term, positions in index_doc(tokens).items():
        # Create the per-term postings dict on first sight, then record this document.
        inverted_index.setdefault(term, {})[doc_id] = positions

In [41]:
# Print the first five entries of the index, in insertion order.
for shown, (term, postings) in enumerate(inverted_index.items()):
    if shown >= 5:
        break
    print(f"{term}: {postings}")

mohamad: {0: [0], 8807: [0]}
nazri: {0: [1], 8807: [1]}
abd: {0: [2], 8807: [2]}
karim: {0: [3], 8807: [3]}
stock: {0: [4, 16, 44, 63], 1: [47, 119], 9: [27], 109: [8, 41, 64], 110: [5, 37], 262: [45, 59], 811: [39, 64], 1217: [4, 30, 41, 122], 1352: [6, 9, 31, 88], 1806: [46, 104], 1920: [45], 1922: [38], 1923: [13, 35, 63], 1930: [100], 1932: [55], 2534: [32], 2973: [51], 3248: [74, 123], 3252: [16, 26, 37, 53], 3253: [9, 18], 3260: [36, 51], 3578: [13], 3593: [65], 3723: [9, 17, 28], 3729: [8], 3785: [27], 3815: [364], 3823: [62], 4029: [31], 4179: [50], 4411: [4], 4640: [52, 70], 4960: [55], 5008: [3, 9, 18, 35, 40, 75, 80], 5016: [3], 5220: [55, 103], 5365: [9, 49, 71], 5598: [22], 5823: [10, 20], 5825: [7, 14], 5832: [10], 5835: [3], 5841: [28], 6104: [77], 6193: [30], 6198: [41], 6199: [2, 9, 74, 113], 6202: [66, 85], 6211: [45], 6223: [6, 11, 20], 6255: [63, 80], 6364: [54], 6486: [31], 6487: [29, 65, 81, 85, 100, 104], 6488: [13, 27, 43, 68, 94, 105], 6493: [38, 79, 101], 6501

The full inverted index, whose first five entries are printed above, maps each term to the documents that contain it and, for each of those documents, to the list of positions at which the term occurs.

In [42]:
# Display the whole inverted index (term -> {document id -> positions}).
# NOTE(review): this renders every entry of the dict; consider showing only a
# slice if the output becomes too long to read.
inverted_index

{'mohamad': {0: [0], 8807: [0]},
 'nazri': {0: [1], 8807: [1]},
 'abd': {0: [2], 8807: [2]},
 'karim': {0: [3], 8807: [3]},
 'stock': {0: [4, 16, 44, 63],
  1: [47, 119],
  9: [27],
  109: [8, 41, 64],
  110: [5, 37],
  262: [45, 59],
  811: [39, 64],
  1217: [4, 30, 41, 122],
  1352: [6, 9, 31, 88],
  1806: [46, 104],
  1920: [45],
  1922: [38],
  1923: [13, 35, 63],
  1930: [100],
  1932: [55],
  2534: [32],
  2973: [51],
  3248: [74, 123],
  3252: [16, 26, 37, 53],
  3253: [9, 18],
  3260: [36, 51],
  3578: [13],
  3593: [65],
  3723: [9, 17, 28],
  3729: [8],
  3785: [27],
  3815: [364],
  3823: [62],
  4029: [31],
  4179: [50],
  4411: [4],
  4640: [52, 70],
  4960: [55],
  5008: [3, 9, 18, 35, 40, 75, 80],
  5016: [3],
  5220: [55, 103],
  5365: [9, 49, 71],
  5598: [22],
  5823: [10, 20],
  5825: [7, 14],
  5832: [10],
  5835: [3],
  5841: [28],
  6104: [77],
  6193: [30],
  6198: [41],
  6199: [2, 9, 74, 113],
  6202: [66, 85],
  6211: [45],
  6223: [6, 11, 20],
  6255: [63, 80