# $Required Libraries$

In [2]:
import nltk
import pandas as pd
import math
from nltk.stem.porter import *
import os 
import sys
import re
import string
import nltk
nltk.download('stopwords')

## $Reading Data$

## $DictionaryOfLinks$

In [3]:
dictionary_of_links = {'mark-graham': ('https://www.oii.ox.ac.uk/people/mark-graham/?profile','Data Science','OII-University of Oxford'), 
                      'bernie-hogan': ('https://www.oii.ox.ac.uk/people/bernie-hogan/?profile','Data Science','OII-University of Oxford'),
                       'brent-mittelstadt' : ('https://www.oii.ox.ac.uk/people/brent-mittelstadt/?profile','Data Science','OII-University of Oxford'),
                       'gina-neff' : ('https://www.oii.ox.ac.uk/people/gina-neff/?profile','Data Science','OII-University of Oxford'), 
                       'ralph-schroeder' : ('https://www.oii.ox.ac.uk/people/ralph-schroeder/?profile','Data Science','OII-University of Oxford'), 
                       'mariarosaria-taddeo' : ('https://www.oii.ox.ac.uk/people/mariarosaria-taddeo/?profile','Data Science','OII-University of Oxford'),
                       'sandra-wachter' : ('https://www.oii.ox.ac.uk/people/sandra-wachter/?profile','Data Science','OII-University of Oxford'),
                       'vasile-palade' : ('https://pureportal.coventry.ac.uk/en/persons/vasile-palade','Data Science','Coventry University'),
                       'alireza-daneshkhah' : ('https://pureportal.coventry.ac.uk/en/persons/alireza-daneshkhah','Data Science','Coventry University'),
                       'nader-sohrabi-safa' : ('https://pureportal.coventry.ac.uk/en/persons/nader-sohrabi-safa','Data Science','Coventry University'),
                       'james-brusey' : ('https://pureportal.coventry.ac.uk/en/persons/james-brusey','Data Science','Coventry University'),
                       'ye-liu' : ('https://pureportal.coventry.ac.uk/en/persons/ye-liu','Data Science','Coventry University'),
                       'elena-gaura' : ('https://pureportal.coventry.ac.uk/en/persons/elena-gaura','Data Science','Coventry University'),
                       'dr-hager-weslati' : ('https://www.kingston.ac.uk/staff/profile/dr-hager-weslati-662/','Arts','Kingston University'),
                       'professor-john-oacute-maoilearca' : ('https://www.kingston.ac.uk/staff/profile/professor-john-oacute-maoilearca-124/','Arts','Kingston University'),
                       'dr-matthew-melia' : ('https://www.kingston.ac.uk/staff/profile/dr-matthew-melia-648/''Arts','Kingston University'),
                       'dr-mariacutea-menciacutea' : ('https://www.kingston.ac.uk/staff/profile/dr-mariacutea-menciacutea-629/','Arts','Kingston University'),
                       'dr-landeacute-pratt' : ('https://www.kingston.ac.uk/staff/profile/dr-landeacute-pratt-598/','Arts','Kingston University'),
                       'dr-reza-zanjirani-farahani' : ('https://www.kingston.ac.uk/staff/profile/dr-reza-zanjirani-farahani-341/','Business','Kingston University'),
                       'dr-george-alexandrou' : ('https://www.kingston.ac.uk/staff/profile/dr-george-alexandrou-432/','Business','Kingston University'),
                       'dr-rahul-chawdhary' : ('https://www.kingston.ac.uk/staff/profile/dr-rahul-chawdhary-146/','Business','Kingston University'),
                       'dr-john-sebastian-pereira' : ('https://www.kingston.ac.uk/staff/profile/dr-john-sebastian-pereira-356/','Business','Kingston University'),
                       'dr-pauline-parker' : ('https://www.kingston.ac.uk/staff/profile/dr-pauline-parker-932/','Business','Kingston University'),
                       'dr-fatima-annan-diab' : ('https://www.kingston.ac.uk/staff/profile/dr-fatima-annan-diab-381/','Business','Kingston University'),
                       'dr-barry-avery' : ('https://www.kingston.ac.uk/staff/profile/dr-barry-avery-147/','Business','Kingston University'),
                       'dr-marvyn-boatswain' : ('https://www.kingston.ac.uk/staff/profile/dr-marvyn-boatswain-126/','Business','Kingston University')
                      }

In [4]:
# Reading text files having each supervisor's textual information (biography, research, information etc.)

def ReadingSupervisorsDocuments():
    
    #creating a list with supervisor name and respective textual information 
    list_of_data = []
    
    #reading files
    for file in os.listdir('supervisors/'):
        
        prof_text = open(os.path.join('supervisors/', file), 'r').read() # joining path of dir with file name to read text
        prof_text = prof_text.replace('\n',' ')
        prof_name =  file.split('.')[0]
        
        list_of_data.append((prof_name,dictionary_of_links[prof_name][0],prof_text))  #list with name, link and textual data
        
    return list_of_data  #returning list of tuples (name,link,data)

## $RemovingPunctuations$

In [5]:
# Function to remove punctuations from the text and extact only textual information from the prof_text

def CleanData(text):
    
    # using the string.punctuations excluding punctuations
    clean_text = text.translate(str.maketrans('', '', string.punctuation))
    clean_text = re.sub(r'[^A-Za-z\s]+', '', clean_text)  # Getting only textual information for the 
                                                          # safe side used the regular expression as wel
    clean_text = clean_text.lower()  # normalizing the text
    return clean_text

## $Stemming$

In [6]:
# Function to do stemming on each professor's text, playing

def StemmingText(list_of_tokens):
    
    #using the PorterStemmer, we will achive the stemming part
    s = PorterStemmer()
    
    # using the list comprehension to create and return the stemmed list of tokens
    words_after_stemming = [s.stem(token) for token in list_of_tokens]
    return words_after_stemming

## $Tokenization$

In [7]:
# Function to tokenize the data

def TokenizeProfessorsText(text):
    
    #splitting to get the list of tokens
    list_of_tokens = text.split()
    return list_of_tokens

## $StopWordsRemoval$

In [14]:
# Function to remove stop words from each professor's text

def RemoveStopWords(list_of_tokens):
    
    # Getting stop words using nltk library (english version) and excluding from our text
    with open(r'C:/Users/faisal.maqbool/AppData/Roaming/nltk_data/corpora/stopwords/english') as stopFile:
        stop_words = [line.rstrip('\n') for line in stopFile]
    list_without_sw = []
    for word in list_of_tokens:
        if word.lower() not in stop_words:  # checking if word present in stopwords from nltk
            list_without_sw.append(word)  # appending only valueable tokens
            
    return (list_without_sw)

## $CreatingDictionaryOfTokens$

In [9]:
# Function to create a dictionary of tokens from each professor's text

def CreatingDictionaryOfTokens(list_of_data):
    dict_of_tokens = {}
    all_words = set()
    for each_tuple in list_of_data:
        
        clean_text = CleanData(each_tuple[2])  #Cleaning
        list_of_tokens = TokenizeProfessorsText(clean_text) # Tokenizing the text
        list_of_tokens = RemoveStopWords(list_of_tokens)
        #list_of_tokens = StemmingText(list_of_tokens)  # Getting the stemmed list of tokens
        
        dict_of_tokens[each_tuple[0]] = list_of_tokens
        all_words |= set(list_of_tokens)
        
    return dict_of_tokens, all_words

# $Generate VocabularyForTFidf$

In [10]:
# Function to get the vocabulary of the tokens

def VocabularyCreation(dictOfTokens):
    
    vocabulary = []
        
    return sum(list(dictOfTokens.values()),[])

# $Full-Inverted-Index$

In [11]:
def CreateFullInvertedIndex(dictOfTokens):
    # Creating full inverted index

    full_inverted_index = {word:set((prof_name,word_count) for prof_name,list_of_tokens in dictOfTokens.items() for word_count in (index for index, wordInUse in enumerate(list_of_tokens) if word == wordInUse) if word in list_of_tokens) for word in vocabulary}
    
    return full_inverted_index

# $Reading Documents$

In [12]:
documents_information = ReadingSupervisorsDocuments()

##### $Creating-Dictionary-of-Tokens$

In [15]:
dictOfTokens, all_words = CreatingDictionaryOfTokens(documents_information)

##### $Vocabulary-Generation$

In [16]:
vocabulary = VocabularyCreation(dictOfTokens)

# $Creating-Full-Inverted-Index$

In [None]:
full_inverted_index = CreateFullInvertedIndex(dictOfTokens)

In [79]:
full_inverted_index['oxford']

{('bernie-hogan', 314),
 ('brent-mittelstadt', 12),
 ('brent-mittelstadt', 139),
 ('dr-matthew-melia', 364),
 ('dr-reza-zanjirani-farahani', 100),
 ('dr-reza-zanjirani-farahani', 241),
 ('dr-reza-zanjirani-farahani', 382),
 ('dr-reza-zanjirani-farahani', 523),
 ('gina-neff', 9),
 ('gina-neff', 15),
 ('mariarosaria-taddeo', 6),
 ('mark-graham', 4),
 ('mark-graham', 254),
 ('mark-graham', 380),
 ('mark-graham', 415),
 ('mark-graham', 435),
 ('mark-graham', 494),
 ('mark-graham', 495),
 ('mark-graham', 1269),
 ('mark-graham', 1519),
 ('mark-graham', 1698),
 ('mark-graham', 1757),
 ('mark-graham', 1758),
 ('professor-john-oacute-maoilearca', 941),
 ('professor-john-oacute-maoilearca', 943),
 ('ralph-schroeder', 33),
 ('ralph-schroeder', 375),
 ('ralph-schroeder', 425),
 ('sandra-wachter', 18),
 ('sandra-wachter', 22),
 ('sandra-wachter', 140),
 ('vasile-palade', 27),
 ('ye-liu', 55)}

# $Term-Frequency$

In [18]:
def TermFrequency(list_of_words):
    
    # Setting a dictionary for term frequency of each 
    term_frequency = {}
    
    for word in list_of_words:
        
        term_frequency[word] = list_of_words.count(word)
        
    return term_frequency

# $Inverted-Index-And-Idf Scores$

In [19]:
def CreateInvertedIndexAndIdfScores(dictOfTokens):
    
    #create list for vocabulary for tf - idf score extraction
    
    vocabulary = VocabularyCreation(dictOfTokens)
    
    inverted_index = {}
    
    score_idf = {}
    
    #setting total number of documents
    number_of_documents = len(dictOfTokens)
    
    for word in vocabulary:
        
        #creating a count for calculating the tf score
        word_count = 0
        for professor_name, list_of_words in dictOfTokens.items():  # Getting tokens from created dict of tokens
            
            if word in list_of_words :  # checking if the vocabulary word is present in prof's list of tokens
                word_count = word_count + 1 
                
                if word in inverted_index.keys():   # checking if the word already exist and a document, then append the next 
                                                    # professor name
                    
                    inverted_index[word].append(professor_name)
                    
                else:
                    inverted_index[word] = [professor_name]     # else only append the one prof_name
            
        #calculating the idf score - word level
          
        score_idf[word] = math.log10(number_of_documents / word_count)    #inverse doc frequency  = No of docs / tf_for word
          
    return inverted_index, score_idf

In [20]:
inverted_index, idf_scores = CreateInvertedIndexAndIdfScores(dictOfTokens)

# $Calculating-Tfidf Scores$

In [21]:
def CalculateTfidfScores(dictOfTokens,score_idf):
    
    tf_idf_score = {}
    
    for key,value in dictOfTokens.items():   #traversing our list of tuple, containing prof name, link and text
        
        tf_idf_score[key] = TermFrequency(value)    # First story the term frequency score against each key (word)
        
    for prof_name,tf_scores in tf_idf_score.items():  # Traversing the newly created tf_idf_score dictionary
        
        for word, score in tf_scores.items():         # Traversing the internally each, word and its respective score
            
            term_frequency = score                    # Setting the score
             
            inverse_doc_frequency = score_idf[word]   # Extracting the idf score from our idf_scores dictionary for word
            
            tf_scores[word] = term_frequency * inverse_doc_frequency   # storing the tf-idf score in the same 
                                                                        # manner Word : tf-idf instead of word : tf
            
    return tf_idf_score                               # returning the tf_idf_score dictionary

In [22]:
tfidfscores = CalculateTfidfScores(dictOfTokens,idf_scores)

# $Basic Search$

In [23]:
from pprint import pprint as pp
from functools import reduce

In [24]:
def FindQueryWordsInInvertedIndex(list_of_queryWords):
    # for each output from inverted index against a word, we are reducing the intersection and passing our dictionary of tokens
    
    return reduce(set.intersection,(inverted_index[queryWord] for queryWord in list_of_queryWords),set(dictOfTokens.keys()))

## $Getting-User-Input$

In [25]:
query_search_tokens = input('Search Supervisor: ').lower().split()  # Getting user input, 
                                                                    #normalizing and creating list of tokens

Search Supervisor: data science


In [26]:
pp(FindQueryWordsInInvertedIndex(query_search_tokens))

{'alireza-daneshkhah',
 'bernie-hogan',
 'dr-marvyn-boatswain',
 'elena-gaura',
 'gina-neff',
 'james-brusey',
 'mariarosaria-taddeo',
 'nader-sohrabi-safa',
 'ralph-schroeder',
 'sandra-wachter',
 'vasile-palade'}


# $Ranking$

In [27]:
query_dic = {1 : query_search_tokens}

In [28]:
query_scores = CalculateTfidfScores(query_dic,idf_scores)

In [29]:
import operator

In [30]:
for key, list_of_query_words in query_dic.items():
    # traversing the query words and one by one calculating the cosine similarity aginst each document it appears in
    
    # initializing a dic for professor similarity
    prof_sum = {}
    
    for queryWord in list_of_query_words:                    # traversing the list of words from query
        if queryWord in inverted_index.keys():  # checking if that word appears in inverted index
            
            professors = inverted_index[queryWord]   # getting professors from inverted index against each query word

            for prof in professors:                  # Traversing each professor it appears in 

                prof_score = tfidfscores[prof][queryWord]   # calculating the tfidf score against each professor

                prof_length = math.sqrt(sum(x ** 2 for x in tfidfscores[prof].values()))  # Normalizing the professor doc length

                query_score = query_scores[key][queryWord]  # same step done for query 

                query_length = math.sqrt(query_score ** 2)  # Normalizing the query score

                similarity = (prof_score * query_score) / (prof_length * query_length)   # Calculating the cosine similarity

                if prof in prof_sum.keys():               # finding if already professor available in similarity dic 

                    prof_sum[prof] = prof_sum[prof] + similarity   # add similarity for new coming word 

                else:
                    prof_sum[prof] = similarity          # if not, then add the similarity for first time
        else:
            print("The "+queryWord + " is not available in the inverted index.")   

    ranked_professors = sorted(prof_sum.items(), key=operator.itemgetter(1), reverse = True) # Sorting against Ranking

# $Search-Engine-Result$

In [32]:
print('Relevant supervisors available against your query in UK are :')
print('----------------------------------------------------------------')
print('----------------------------------------------------------------\n')

for index, prof in enumerate(ranked_professors):
    print(index + 1)
    print('Name: '+ prof[0].title() + '\n')
    print('University: '+ dictionary_of_links[prof[0]][2] + '\n')
    print('Profile Link: '+ dictionary_of_links[prof[0]][0] + '\n')
    print('Department: ' + dictionary_of_links[prof[0]][1] + '\n')
    print('tf-idf Score: ' + str(prof[1]) + '\n')
    print('-----------------------------------------------------')
    print('-----------------------------------------------------')

Relevant supervisors available against your query in UK are :
----------------------------------------------------------------
----------------------------------------------------------------

1
Name: Brent-Mittelstadt

University: OII-University of Oxford

Profile Link: https://www.oii.ox.ac.uk/people/brent-mittelstadt/?profile

Department: Data Science

tf-idf Score: 9.994601148099505

-----------------------------------------------------
-----------------------------------------------------
2
Name: Gina-Neff

University: OII-University of Oxford

Profile Link: https://www.oii.ox.ac.uk/people/gina-neff/?profile

Department: Data Science

tf-idf Score: 8.026371639393075

-----------------------------------------------------
-----------------------------------------------------
3
Name: Alireza-Daneshkhah

University: Coventry University

Profile Link: https://pureportal.coventry.ac.uk/en/persons/alireza-daneshkhah

Department: Data Science

tf-idf Score: 7.116543037270774

------------