In [518]:
import pandas as pd
import string
import math 

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [519]:
# Load the publications CSV from the working directory; the bare len() on the
# last line makes the notebook display how many rows were read.
df = pd.read_csv("publications_data.csv")
len(df)

17524

In [520]:
# Drop every row that has a missing value in ANY column (dropna's default),
# not only rows with a missing doc_title.
df = df.dropna()

In [521]:
# Titles that contain no whitespace at all are single words (e.g. "Introduction");
# collect them so they can be filtered out of the corpus.
# The pattern is a raw string: \s is a regex character class, and writing it in
# a non-raw literal is an invalid string escape that newer Python versions warn about.
one_word_titles = df.loc[df['doc_title'].str.count(r'\s') == 0, 'doc_title']
one_word_titles

180               Cut-up
265         Introduction
266         Introduction
667             Religion
679             Religion
              ...       
16338    auto-invitation
16634               Lash
16981         Assessment
17164              Ghana
17265       Introduction
Name: doc_title, Length: 129, dtype: object

In [522]:
# Remove every row whose title is one of the single-word titles.
# .copy() makes the result an independent frame, so adding a column to it later
# cannot trigger pandas' SettingWithCopyWarning.
df = df[~df['doc_title'].isin(one_word_titles)].copy()

In [523]:
len(df)

16712

In [524]:
# Sequential 1-based identifiers, one per row still in df.
ids = list(range(1, len(df) + 1))

In [525]:
# Attach the ids positionally: the first remaining row gets 1, the next 2, ...
# These are independent of the DataFrame index, which still has gaps from the
# rows removed earlier.
df['doc_id'] = ids

In [526]:
df

Unnamed: 0,name,job_title,organization,doc_link,doc_title,doc_id
0,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,Autonomous Vehicle and its Adoption: Challenge...,1
1,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,Predicting the Public Adoption of Connected an...,2
2,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,Investigating the Effect of Mass Variation for...,3
3,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,A new method towards achieving FES-induced mov...,4
4,Mohammed Ahmed,Lecturer in Project Management,School of Strategy and Leadership,https://pureportal.coventry.ac.uk/en/publicati...,Restoration of Movement using FES: An Introduc...,5
...,...,...,...,...,...,...
17519,Jackie Abell,Associate Professor Research,"Centre for Agroecology, Water and Resilience",https://pureportal.coventry.ac.uk/en/publicati...,Mad Cows & British Politicians: The Role of Sc...,16708
17520,Jackie Abell,Associate Professor Research,"Centre for Agroecology, Water and Resilience",https://pureportal.coventry.ac.uk/en/publicati...,Broadcasting the royal role: Constructing cult...,16709
17521,Jackie Abell,Associate Professor Research,"Centre for Agroecology, Water and Resilience",https://pureportal.coventry.ac.uk/en/publicati...,The Discursive (Re)Construction of Events,16710
17522,Jackie Abell,Associate Professor Research,"Centre for Agroecology, Water and Resilience",https://pureportal.coventry.ac.uk/en/publicati...,"I take full responsibility, I take some respon...",16711


In [527]:
# Keep only the two columns the search index needs: the id and the title text.
doc_sep = df[['doc_id', 'doc_title']]

In [528]:
doc_sep.head()

Unnamed: 0,doc_id,doc_title
0,1,Autonomous Vehicle and its Adoption: Challenge...
1,2,Predicting the Public Adoption of Connected an...
2,3,Investigating the Effect of Mass Variation for...
3,4,A new method towards achieving FES-induced mov...
4,5,Restoration of Movement using FES: An Introduc...


In [529]:
# Parallel lists of ids and titles, in row order.
doc_id_list = doc_sep['doc_id'].tolist()
doc_title_list = doc_sep['doc_title'].tolist()

# Corpus as a plain mapping: doc_id -> raw title.
corpus_dict = dict(zip(doc_id_list, doc_title_list))

In [530]:
corpus_dict

{1: 'Autonomous Vehicle and its Adoption: Challenges, Opportunities, and Future Implications',
 2: 'Predicting the Public Adoption of Connected and Autonomous Vehicles',
 3: 'Investigating the Effect of Mass Variation for Sliding Mode Control of Functional Electrical Stimulation Aided Sit-to-Stand in Paraplegia',
 4: 'A new method towards achieving FES-induced movement',
 5: 'Restoration of Movement using FES: An Introductory Study I',
 6: 'Kinematic modelling of FES induced sit-to-stand movement in Paraplegia',
 7: 'SMC scheme for FES aided restoration of STS movement in paraplegics',
 8: 'Evolution of intelligent and nonlinear control approaches for FES induced movement generation of the lower limb',
 9: 'New Concept for FES-Induced Movements',
 10: 'Assessing the Use of Gold as a Zero-Beta Asset in Empirical Asset Pricing: Application to the US Equity Market',
 11: 'Cross-Layer Multipath Multichannel MAC protocol for MANETs',
 12: 'Performance evaluation of Receiver Directed Transmi

In [None]:
list(corpus_dict.values())

In [None]:
def text_preprocessor(corpus_dict, lowercase=True):
    """Tokenize every document, dropping punctuation and English stop words.

    Args:
        corpus_dict: Mapping of document id -> raw text. It is read only;
            the caller's texts are left untouched.
        lowercase: When True (default), tokens are case-folded so that
            "Obesity" and "obesity" become the same index term. Pass False
            to keep each token's original casing.

    Returns:
        dict mapping each document id to the list of its remaining tokens,
        in their original order.
    """
    translator = str.maketrans('', '', string.punctuation)
    # Loop-invariant: build the stop-word set once, not once per document.
    stop_words = set(stopwords.words('english'))

    cleaned = {}
    for doc_id, text in corpus_dict.items():
        # Strip punctuation from a local value so the raw text stored in
        # corpus_dict is not overwritten.
        tokens = word_tokenize(text.translate(translator))
        if lowercase:
            tokens = [token.casefold() for token in tokens]
        # Stop-word comparison is always case-insensitive.
        cleaned[doc_id] = [
            token for token in tokens if token.casefold() not in stop_words
        ]

    return cleaned

clean_tokenized_corpus = text_preprocessor(corpus_dict)

In [None]:
clean_tokenized_corpus

In [None]:
# Inverted index: term -> set of ids of the documents that contain the term.
inverted_index = {}

for doc_id, words in clean_tokenized_corpus.items():
    for word in words:
        # setdefault creates the posting set the first time a term is seen,
        # then the current document id is added to it either way.
        inverted_index.setdefault(word, set()).add(doc_id)

# Terms in sorted order, each with its document ids as an ascending list.
sorted_inverted_index = {
    word: sorted(inverted_index[word]) for word in sorted(inverted_index)
}

# Print the posting list
for word, doc_ids in sorted_inverted_index.items():
    print(f"{word}: {doc_ids}")

In [None]:
len(sorted_inverted_index['00'])

In [None]:
# Calculate the TF-IDF score of every (term, document) pair in the index.
#   tf  = 1 + ln(number of times the term occurs in that document)
#   idf = ln(total number of documents / number of documents containing the term)
tfidf_scores = {}
for term, doc_ids in sorted_inverted_index.items():
    idf = math.log(len(corpus_dict) / len(doc_ids))
    for doc_id in doc_ids:
        # Term frequency is counted inside this document's own token list.
        # The posting-list length is the document frequency and belongs only
        # in idf. The count is always >= 1 because doc_id is in the posting list.
        term_count = clean_tokenized_corpus[doc_id].count(term)
        tf = 1 + math.log(term_count)
        tfidf_scores[(term, doc_id)] = tf * idf

In [None]:
tfidf_scores

In [None]:
tfidf_scores.get(('00', 5069), 0)

In [None]:
query = "obesity in kids"

# Run the query through the same pipeline as the documents (punctuation and
# stop-word removal) so that its terms line up with the index vocabulary.
query_terms = text_preprocessor({0: query})[0]

# Weight each query term by its IDF. A term occurs once in the query, so
# tf = 1 + ln(1) = 1 and the weight is just idf. Terms that are not in the
# index get weight 0. No document has id 0, so the scores cannot be looked up
# in tfidf_scores under a placeholder id.
query_vector = [
    math.log(len(corpus_dict) / len(sorted_inverted_index[term]))
    if term in sorted_inverted_index else 0
    for term in query_terms
]

In [None]:
query_vector

In [None]:
# Cosine similarity between the query and every document.
similarities = {}

# The query norm does not depend on the document, so compute it once.
query_norm = math.sqrt(sum(x ** 2 for x in query_vector))

# Iterate over the real document ids, so the last document is not skipped
# (ids run from 1 to len(corpus_dict) inclusive).
for doc_id in corpus_dict:
    doc_vector = [tfidf_scores.get((term, doc_id), 0) for term in query_terms]
    dot_product = sum(x * y for x, y in zip(query_vector, doc_vector))

    if dot_product == 0 or query_norm == 0:
        # No shared term (or an empty/unknown query): similarity is 0, and
        # returning early avoids dividing by a zero norm.
        similarities[doc_id] = 0.0
        continue

    # Cosine similarity needs the norm of the document's FULL tf-idf vector,
    # i.e. over every distinct term of the document, not only the query terms.
    # Otherwise every document matching a one-term query would score 1.0.
    doc_norm = math.sqrt(sum(
        tfidf_scores.get((term, doc_id), 0) ** 2
        for term in set(clean_tokenized_corpus[doc_id])
    ))
    if doc_norm == 0:
        # A shared term with idf 0 contributes nothing; avoid dividing by zero.
        similarities[doc_id] = 0.0
        continue

    similarities[doc_id] = dot_product / (query_norm * doc_norm)

# Sort documents by cosine similarity
sorted_documents = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Print only the documents that actually match the query
print("Search Results:")
for doc_id, similarity in sorted_documents:
    if similarity <= 0:
        # Sorted in descending order, so every remaining document is a non-match.
        break
    print(f"Document {doc_id}: Similarity = {similarity}")