In [1]:
import csv
import re
import itertools
import nltk
import math
import json
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pathlib import Path

   
def getDocumentCollection(csv_path):
    """
    Read the CSV file at ``csv_path`` and return its rows with cleaned text.

    The first column of every row is split on non-word characters and each
    token is passed through ``cleanWord``. The cleaned tokens are written back
    to the first column, each followed by a single space (a removed stop word
    therefore leaves only its space). All other columns are left untouched.

    Blank lines and rows whose first column is empty are skipped. The first
    remaining row is treated as the header and is not part of the result.

    Parameters:
        csv_path: path of the UTF-8 encoded CSV file to read.

    Returns:
        A list of rows (lists of strings), header excluded.
    """
    csv_file_lines = []
    header_skipped = False
    with open(csv_path, 'r', encoding="utf8") as file:
        for line in csv.reader(file):
            # csv.reader yields [] for a blank line, so test the row itself
            # before looking at its first column
            if not line or line[0] == '':
                continue
            if not header_skipped:
                # the header is dropped anyway: do not spend time stemming it
                header_skipped = True
                continue
            # "clean" the text of the line, one token at a time
            tokens = re.sub(r"[^\w]", " ", line[0]).split()
            line[0] = ''.join(cleanWord(word) + ' ' for word in tokens)
            csv_file_lines.append(line)
    return csv_file_lines


# Shared stemming resources, built on first use by _cleaningResources().
_STEMMER = None
_STOP_WORDS = None


def _cleaningResources():
    """Return the shared (stemmer, stop-word set) pair, creating it on first use."""
    global _STEMMER, _STOP_WORDS
    if _STEMMER is None or _STOP_WORDS is None:
        _STEMMER = PorterStemmer()
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STEMMER, _STOP_WORDS


def cleanWord(word):
    """
    Clean a single word: lowercase it, drop it if it is an English stop word,
    otherwise return its Porter stem.

    The stop-word test is done on the lowercased word *before* stemming as
    well as on the stem, because stemming can turn a stop word into a string
    that is no longer in the stop-word list and would otherwise slip through.

    Parameters:
        word: the token to clean.

    Returns:
        The lowercased stem of ``word``, or '' if the word is a stop word.
    """
    # building a stemmer and reloading the stop-word list for every token is
    # very expensive; reuse one shared instance of each
    stemmer, stop_words = _cleaningResources()

    # remove irrelevant words before their spelling is altered by the stemmer
    if word.lower() in stop_words:
        return ''

    # stem word + put in lowercase
    stemmed = stemmer.stem(word, to_lowercase=True)
    if stemmed in stop_words:
        return ''
    return stemmed
        
        
        
def createIndex(documentCollection, index_path="index.json"):
    """
    Return a tf-idf index over the given documentCollection.

    The index maps every word to a dictionary whose keys are the identifiers
    (second column, ``document[1]``) of the documents containing that word and
    whose values are the word's tf-idf score in that document, where
    tf is the number of occurrences of the word in the document and
    idf = log(1 + N / number of documents containing the word).

    If ``index_path`` already exists, its JSON content is returned as-is and
    ``documentCollection`` is not looked at; delete the file to force a
    rebuild. Otherwise the index is computed and written to ``index_path``.

    Parameters:
        documentCollection: sequence of rows; ``row[0]`` is the cleaned text,
            ``row[1]`` the document identifier used as key in the index.
        index_path: location of the JSON cache file (default "index.json").

    Returns:
        dict mapping word -> {document identifier -> tf-idf score}.
    """
    # before computing the index, check if its JSON file exists.
    # if so, read JSON instead of recomputing
    cache_file = Path(index_path)
    if cache_file.is_file():
        with open(cache_file, encoding="utf8") as index_file:
            return json.load(index_file)

    # first pass: term frequencies, counted once per document instead of
    # re-tokenising every document for every word of the vocabulary
    index = {}
    for document in documentCollection:
        term_counts = {}
        for word in re.sub(r"[^\w]", " ", document[0]).split():
            term_counts[word] = term_counts.get(word, 0) + 1
        for word, tf in term_counts.items():
            index.setdefault(word, {})[document[1]] = tf

    N = len(documentCollection)  # N = number of documents in the corpus
    print('number all docs: ' + str(N))

    # second pass: replace each occurrence count by the tf-idf score.
    # Every word in the index has at least one document, so no division by zero.
    for postings in index.values():
        idf = math.log(1 + N / len(postings))
        for doc in postings:
            postings[doc] = postings[doc] * idf  # tf * idf

    # write index as JSON file
    with open(cache_file, 'w', encoding="utf8") as file:
        json.dump(index, file)

    return index

   
def getQuery(keywordString, index):
    """
    Turn the user's keyword string into the list of index terms to search for.

    The string is echoed to the user, lowercased, split on non-word
    characters, and every token is passed through ``cleanWord`` (the same
    cleaning that is applied to the documents). Tokens that ``cleanWord``
    discards (it returns '') and tokens that are not keys of ``index`` are
    dropped; the order of the remaining keywords is preserved.

    Parameters:
        keywordString: raw text typed by the user.
        index: mapping whose keys are the known words.

    Returns:
        list of cleaned keywords that are present in ``index``.
    """
    print(f"You search:\n{keywordString}")
    keywordList = re.sub(r"[^\w]", " ", keywordString.lower()).split()
    cleanedWords = (cleanWord(keyword) for keyword in keywordList)
    # single pass: keep only non-empty keywords that the index knows about
    return [word for word in cleanedWords if word != '' and word in index]


def search(index, query):
    """
    Return the documents that contain every keyword of ``query``, ordered by
    relevance (tf-idf, most relevant first).

    An empty query matches no document. Every keyword must be a key of
    ``index``.
    """
    # one set of document identifiers per keyword
    docsPerKeyword = [set(index[term].keys()) for term in query]

    if docsPerKeyword:
        # a document matches only if it appears in every keyword's set
        commonDocs = set.intersection(*docsPerKeyword)
    else:
        commonDocs = set()

    # order the matches with the tf-idf method
    return rankResults(commonDocs, query, index)

def getAllWordsOfDocumentCollection(documentCollection):
    """
    Return a flat list of all words of the document collection.

    The text of each row (``row[0]``) is split on non-word characters. Words
    appear in document order and duplicates are kept.

    Parameters:
        documentCollection: iterable of rows whose first element is the text.

    Returns:
        list of every word found, in order of appearance.
    """
    return [
        word
        for line in documentCollection
        for word in re.sub(r"[^\w]", " ", line[0]).split()
    ]

def rankResults(matchingDocs, query, index):
    """
    Order the matching documents by relevance with the tf-idf method.

    A document's score is the sum of the tf-idf scores of all query keywords
    for that document; documents are returned from highest to lowest score.
    """
    # pair each document with the sum of its per-keyword scores
    scoredDocs = [
        (doc, sum(index[keyword][doc] for keyword in query))
        for doc in matchingDocs
    ]
    # highest total first; only the document identifiers are returned
    ranking = sorted(scoredDocs, key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _score in ranking]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olitk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read and clean the transcripts, then build the index (or load it from its JSON cache).
path = 'TEDtranscripts.csv'
documentCollection = getDocumentCollection(path)
index = createIndex(documentCollection)

# Corpus size, shown to the user with every search.
N = len(documentCollection)

In [3]:
# Command loop: keep prompting the user until 'exit' is typed.
while True:
    print("Type: 'exit' to stop program, 'search' to search")
    userCommand = input()

    if userCommand == 'exit':
        break

    if userCommand != 'search':
        print('invalid command')
        continue

    # 'search': read the keywords, reduce them to known index terms,
    # then print the matching URLs from most to least relevant.
    query = getQuery(input('enter keywords seperated by spaces (e.g. test talk example): '), index)
    print('Searched among ' + str(N) + ' TED Talks')
    print('Your TF-IDF ranked search results:')
    searchResult = search(index, query)
    for URL in searchResult:
        print(URL)

Type: 'exit' to stop program, 'search' to search
search
enter keywords seperated by spaces (e.g. test talk example): work happiness managment burnout
You search:
work happiness managment burnout
Searched among 2467 TED Talks
Your TF-IDF ranked search results:
https://www.ted.com/talks/yves_morieux_as_work_gets_more_complex_6_rules_to_simplify

Type: 'exit' to stop program, 'search' to search
search
enter keywords seperated by spaces (e.g. test talk example): art compose music creativity human
You search:
art compose music creativity human
Searched among 2467 TED Talks
Your TF-IDF ranked search results:
https://www.ted.com/talks/mihaly_csikszentmihalyi_on_flow

https://www.ted.com/talks/billy_collins_everyday_moments_caught_in_time

https://www.ted.com/talks/claron_mcfadden_singing_the_primal_mystery

Type: 'exit' to stop program, 'search' to search
exit
