In [502]:
import pandas as pd
import string
import math 

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [189]:
df = pd.read_csv("publications_data.csv")
len(df)

17524

In [190]:
df = df.dropna()

In [193]:
one_word_titles = df[df['doc_title'].str.count('\s') == 0]['doc_title']
one_word_titles

180               Cut-up
265         Introduction
266         Introduction
667             Religion
679             Religion
              ...       
16338    auto-invitation
16634               Lash
16981         Assessment
17164              Ghana
17265       Introduction
Name: doc_title, Length: 129, dtype: object

In [194]:
df = df[~df['doc_title'].isin(one_word_titles)]

In [195]:
len(df)

16712

In [203]:
ids = []
for i in range(1, len(df)+1):
    ids.append(i)

In [205]:
df['doc_id'] = ids

In [206]:
df

Unnamed: 0,name,job_title,organization,doc_link,doc_title,doc_id
0,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,Autonomous Vehicle and its Adoption: Challenge...,1
1,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,Predicting the Public Adoption of Connected an...,2
2,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,Investigating the Effect of Mass Variation for...,3
3,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,A new method towards achieving FES-induced mov...,4
4,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,Restoration of Movement using FES: An Introduc...,5
...,...,...,...,...,...,...
17519,Jackie Abell,Associate Professor Research,"Centre for Agroecology, Water and Resilience",https://pureportal.coventry.ac.uk/en/publicati...,Mad Cows & British Politicians: The Role of Sc...,16708
17520,Jackie Abell,Associate Professor Research,"Centre for Agroecology, Water and Resilience",https://pureportal.coventry.ac.uk/en/publicati...,Broadcasting the royal role: Constructing cult...,16709
17521,Jackie Abell,Associate Professor Research,"Centre for Agroecology, Water and Resilience",https://pureportal.coventry.ac.uk/en/publicati...,The Discursive (Re)Construction of Events,16710
17522,Jackie Abell,Associate Professor Research,"Centre for Agroecology, Water and Resilience",https://pureportal.coventry.ac.uk/en/publicati...,"I take full responsibility, I take some respon...",16711


In [257]:
doc_sep = df.loc[:,['doc_id','doc_title']]

In [258]:
doc_sep.head()

Unnamed: 0,doc_id,doc_title
0,1,Autonomous Vehicle and its Adoption: Challenge...
1,2,Predicting the Public Adoption of Connected an...
2,3,Investigating the Effect of Mass Variation for...
3,4,A new method towards achieving FES-induced mov...
4,5,Restoration of Movement using FES: An Introduc...


In [481]:
doc_id_list = list(doc_sep['doc_id'])
doc_title_list = list(doc_sep['doc_title'])

corpus_dict = {doc_id_list[i] : doc_title_list[i] for i in range(len(doc_sep))}

In [482]:
corpus_dict

{1: 'Autonomous Vehicle and its Adoption: Challenges, Opportunities, and Future Implications',
 2: 'Predicting the Public Adoption of Connected and Autonomous Vehicles',
 3: 'Investigating the Effect of Mass Variation for Sliding Mode Control of Functional Electrical Stimulation Aided Sit-to-Stand in Paraplegia',
 4: 'A new method towards achieving FES-induced movement',
 5: 'Restoration of Movement using FES: An Introductory Study I',
 6: 'Kinematic modelling of FES induced sit-to-stand movement in Paraplegia',
 7: 'SMC scheme for FES aided restoration of STS movement in paraplegics',
 8: 'Evolution of intelligent and nonlinear control approaches for FES induced movement generation of the lower limb',
 9: 'New Concept for FES-Induced Movements',
 10: 'Assessing the Use of Gold as a Zero-Beta Asset in Empirical Asset Pricing: Application to the US Equity Market',
 11: 'Cross-Layer Multipath Multichannel MAC protocol for MANETs',
 12: 'Performance evaluation of Receiver Directed Transmi

In [501]:
list(corpus_dict.values())

['Autonomous Vehicle and its Adoption Challenges Opportunities and Future Implications',
 'Predicting the Public Adoption of Connected and Autonomous Vehicles',
 'Investigating the Effect of Mass Variation for Sliding Mode Control of Functional Electrical Stimulation Aided SittoStand in Paraplegia',
 'A new method towards achieving FESinduced movement',
 'Restoration of Movement using FES An Introductory Study I',
 'Kinematic modelling of FES induced sittostand movement in Paraplegia',
 'SMC scheme for FES aided restoration of STS movement in paraplegics',
 'Evolution of intelligent and nonlinear control approaches for FES induced movement generation of the lower limb',
 'New Concept for FESInduced Movements',
 'Assessing the Use of Gold as a ZeroBeta Asset in Empirical Asset Pricing Application to the US Equity Market',
 'CrossLayer Multipath Multichannel MAC protocol for MANETs',
 'Performance evaluation of Receiver Directed Transmission protocol with a single transceiver in MANETs',

In [483]:
def text_preprocessor(corpus_dict):
    temp = {}
    translator  = str.maketrans('', '', string.punctuation)
    for kee in corpus_dict:
        corpus_dict[kee] = corpus_dict[kee].translate(translator)  
        
        tokens = word_tokenize(corpus_dict[kee])
        
        stop_words = set(stopwords.words('english'))
        filtered = [word for word in tokens if word.casefold() not in stop_words]
        
        temp[kee] = filtered
        
    return temp

clean_tokenized_corpus = text_preprocessor(corpus_dict)

In [484]:
clean_tokenized_corpus

{1: ['Autonomous',
  'Vehicle',
  'Adoption',
  'Challenges',
  'Opportunities',
  'Future',
  'Implications'],
 2: ['Predicting',
  'Public',
  'Adoption',
  'Connected',
  'Autonomous',
  'Vehicles'],
 3: ['Investigating',
  'Effect',
  'Mass',
  'Variation',
  'Sliding',
  'Mode',
  'Control',
  'Functional',
  'Electrical',
  'Stimulation',
  'Aided',
  'SittoStand',
  'Paraplegia'],
 4: ['new', 'method', 'towards', 'achieving', 'FESinduced', 'movement'],
 5: ['Restoration', 'Movement', 'using', 'FES', 'Introductory', 'Study'],
 6: ['Kinematic',
  'modelling',
  'FES',
  'induced',
  'sittostand',
  'movement',
  'Paraplegia'],
 7: ['SMC',
  'scheme',
  'FES',
  'aided',
  'restoration',
  'STS',
  'movement',
  'paraplegics'],
 8: ['Evolution',
  'intelligent',
  'nonlinear',
  'control',
  'approaches',
  'FES',
  'induced',
  'movement',
  'generation',
  'lower',
  'limb'],
 9: ['New', 'Concept', 'FESInduced', 'Movements'],
 10: ['Assessing',
  'Use',
  'Gold',
  'ZeroBeta',
  

In [485]:
inverted_index = {}

# Iterate over each document
for doc_id, words in clean_tokenized_corpus.items():
    # Iterate over each word in the document
    for word in words:
        # If the word is already in the posting list, add the current document ID to its set of IDs
        if word in inverted_index:
            inverted_index[word].add(doc_id)
        # If the word is not in the posting list, create a new key-value pair with the word and the set containing the current document ID
        else:
            inverted_index[word] = {doc_id}

sorted_inverted_index = {word: sorted(doc_ids) for word, doc_ids in sorted(inverted_index.items())}

# Print the posting list
for word, doc_ids in sorted_inverted_index.items():
    print(f"{word}: {doc_ids}")

00: [5069]
002: [11809, 11814]
018: [11809, 11814]
02: [11799, 11801]
021008: [7911]
025: [3516, 11809, 14019]
02O2δ: [11812]
02O2−: [11813]
03: [11802]
04: [11799, 11801, 11814]
05: [4138, 11809]
054116: [10099]
06: [11799, 11801, 11814, 12429]
075: [11809]
08: [11809, 11814]
1: [76, 100, 146, 164, 170, 281, 327, 1564, 1566, 1633, 1636, 1838, 1846, 1862, 2205, 2565, 2572, 2624, 2698, 2952, 3280, 3316, 3573, 4285, 4590, 4604, 5733, 5793, 6004, 6137, 6520, 6845, 7428, 8662, 8849, 9224, 9227, 9255, 9607, 9698, 9865, 9922, 11122, 11146, 11662, 11799, 11893, 11919, 11982, 12492, 12728, 12736, 13034, 13042, 13232, 14411, 14549, 14557, 14606, 14857, 14863, 15048, 15325, 15333, 15398, 15550, 16042, 16274]
10: [167, 716, 2872, 3829, 4402, 5098, 9407, 11147, 11196, 14078, 14332, 14425]
100: [573, 2160, 5108, 5118, 6512, 9212, 15049]
1000: [12933]
10000: [5585]
100000: [1160]
100m: [5102, 5107, 10264]
101021acsenergyfuels1c02316: [11974, 16506]
101038s42005020004921: [3316]
101057s413010160035z:

In [497]:
query = input()

sdfs


In [499]:
# Calculate the TF-IDF scores for each term in each document
tfidf_scores = {}
for term, doc_ids in inverted_index.items():
    idf = math.log(len(inverted_index) / len(doc_ids))  # Calculate IDF
    for doc_id in doc_ids:
        tf = 1 + math.log(len(inverted_index[term]))  # Calculate TF
        tfidf_scores[(term, doc_id)] = tf * idf  # Calculate TF-IDF

NameError: name 'documents' is not defined

In [None]:
def query_preprocessor(_dict):
    temp = {}
    translator  = str.maketrans('', '', string.punctuation)
    for kee in corpus_dict:
        corpus_dict[kee] = corpus_dict[kee].translate(translator)  
        
        tokens = word_tokenize(corpus_dict[kee])
        
        stop_words = set(stopwords.words('english'))
        filtered = [word for word in tokens if word.casefold() not in stop_words]
        
        temp[kee] = filtered
        
    return temp