In [63]:
import pandas as pd
from tokenize import tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

In [25]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michalpurtak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
stop_words = set(stopwords.words('english'))

In [64]:
books_df = pd.read_csv("books.csv")

In [56]:
def remove_stop_words(row):
    return [w for w in row['abstract'] if not w in stop_words and not w in string.punctuation]

In [27]:
def tokenize(row):
    return word_tokenize(row['abstract'])

In [28]:
def stemWords(row):
    porter = SnowballStemmer("english", ignore_stopwords=True)
    words = []
    for word in row['abstract']:
        words.append(porter.stem(word))
    return words

In [29]:
tokenized_abstracts = books_df.apply(tokenize, axis=1)

In [30]:
books_tokenized = list(zip(books_df['name'], tokenized_abstracts))

In [31]:
books_tokenized_df = pd.DataFrame({'name':books_df['name'], 'abstract':tokenized_abstracts})

In [12]:
books_tokenized_df.to_csv("books_tokenized.csv")

In [32]:
stemmed_abstracts = books_tokenized_df.apply(stemWords, axis = 1)

In [34]:
stemmed_abstracts[:5]

0    [1066, and, all, that, :, a, memor, histori, o...
1    [1066, and, all, that, :, a, memor, histori, o...
2    [2010, :, odyssey, two, is, a, 1982, scienc, f...
3    [a, crown, of, sword, (, abbrevi, as, aco, by,...
4    [publish, on, 15, april, 1755, and, written, b...
dtype: object

In [43]:
books_tokenized_stemmed_df = pd.DataFrame({'name':books_tokenized_df['name'], 'abstract':stemmed_abstracts})

In [44]:
books_tokenized_stemmed_df[:5]

Unnamed: 0,name,abstract
0,1066 and All That,"[1066, and, all, that, :, a, memor, histori, o..."
1,1066 and All That: A Memorable History of Eng...,"[1066, and, all, that, :, a, memor, histori, o..."
2,2010: Odyssey Two,"[2010, :, odyssey, two, is, a, 1982, scienc, f..."
3,A Crown of Swords,"[a, crown, of, sword, (, abbrevi, as, aco, by,..."
4,A Dictionary of the English Language,"[publish, on, 15, april, 1755, and, written, b..."


In [57]:
stopwords_removed_abstracts = books_tokenized_stemmed_df.apply(remove_stop_words, axis=1)

In [58]:
stopwords_removed_abstracts[:5]

0    [1066, memor, histori, england, compris, part,...
1    [1066, memor, histori, england, compris, part,...
2    [2010, odyssey, two, 1982, scienc, fiction, no...
3    [crown, sword, abbrevi, aco, fan, seventh, boo...
4    [publish, 15, april, 1755, written, samuel, jo...
dtype: object

In [59]:
books_tokenized_stemmed_df = pd.DataFrame({'name':books_tokenized_df['name'], 'abstract':stopwords_removed_abstracts})

In [62]:
books_tokenized_stemmed_df.to_csv("books_tokenized_stemmed.csv")

In [191]:
search_index = dict()

In [69]:
book_names = books_tokenized_stemmed_df['name']

In [70]:
def fillIndexWithWordsFromAbstract(row):
    for word in row['abstract']:
        if word not in search_index:
            search_index[word] = dict()
        if row['name'] not in search_index:
            search_index[word][row['name']] = 1
        else:
            search_index[word][row['name']] += 1

In [90]:
no_documents = len(book_names)

In [91]:
documents = books_tokenized_stemmed_df['abstract']
documents[:5]

0    [1066, memor, histori, england, compris, part,...
1    [1066, memor, histori, england, compris, part,...
2    [2010, odyssey, two, 1982, scienc, fiction, no...
3    [crown, sword, abbrevi, aco, fan, seventh, boo...
4    [publish, 15, april, 1755, written, samuel, jo...
Name: abstract, dtype: object

In [192]:
number_of_doc = 0
for document in documents:
    for word in document:
        if word not in search_index:
            search_index[word] = dict()
        if book_names[number_of_doc] not in search_index[word]:
            search_index[word][book_names[number_of_doc]] = 1
        else:
            search_index[word][book_names[number_of_doc]] += 1
    number_of_doc += 1

In [103]:
query = "science fiction novel"

In [159]:
def get_max_freq(document):
    max_freq = 0
    for key in search_index.keys():
        try:
            frequency = search_index[key][document]
        except KeyError:
            continue
        if frequency > max_freq:
            max_freq = frequency
    return max_freq

In [165]:
get_max_freq("Blue Murder")

3

In [172]:
from math import log10
def get_idf(term):
    n_t = len(list(search_index[term]))
    n = len(book_names)
    return log10(n_t/n)
    

In [173]:
get_idf("blue")

-1.853871964321762

In [178]:
def calculate_tf_idf(term, document):
    freq = search_index[term][document]
    tf = freq/get_max_freq(document)
    idf = get_idf(term)
    return tf*idf
    

In [180]:
calculate_tf_idf("novel", "Death Star")

-0.03379600091958039

In [194]:
import copy
search_index_tf_idf = copy.deepcopy(search_index)
type(search_index_tf_idf)

dict

In [195]:
for term in search_index_tf_idf.keys():
    for document in search_index_tf_idf[term]:
        freq = search_index_tf_idf[term][document]
        search_index_tf_idf[term][document] = (freq, calculate_tf_idf(term, document))

In [207]:
search_index_tf_idf["blue"]['Between Planets'][1]

-0.926935982160881

In [217]:
def execute_query(query):
    try:
        query_tokenized = word_tokenize(query)
        porter = SnowballStemmer("english", ignore_stopwords=True)
        query_tokenized_stemmed = []
        
        for word in query_tokenized:
            query_tokenized_stemmed.append(porter.stem(word))
        search_results_for_each_term_tf_idf = [search_index_tf_idf[term] for term in query_tokenized_stemmed]
        search_results_for_each_term_tf_idf_set = [set(search_index_tf_idf[term].keys()) for term in query_tokenized_stemmed]
        search_results_for_query = set.intersection(*search_results_for_each_term_tf_idf_set)
        
        results = []
        for document in search_results_for_query:
            tf_idf = 0
            for term_search_result in search_results_for_each_term_tf_idf:
                tf_idf += term_search_result[document][1]
            results.append((document, tf_idf))
        results = sorted(results, key = lambda x: x[1], reverse=True)
        return results
    except KeyError:
        return "no results"

In [218]:
execute_query("science fiction novel")

[('Death Star', -0.2366057390712093),
 ('Space Cadet', -0.33124803469969305),
 ("Ender's Game", -0.40866730719838584),
 ('Heart of Gold', -0.41406004337461627),
 ('The Invisible Man', -0.41406004337461627),
 ('Fiasko', -0.42587683727451814),
 ('Hunted', -0.42587683727451814),
 ('Fiasco', -0.42587683727451814),
 ('The Illuminatus! Trilogy', -0.4516908871648183),
 ('The Last Man', -0.473203044983882),
 ('Ragamuffin', -0.5205056398493433),
 ('2010: Odyssey Two', -0.5205056398493433),
 ('The Taint', -0.5520800578328217),
 ('The Shockwave Rider', -0.5520800578328217),
 ('Headlong', -0.5914890482024133),
 ('Cowl', -0.5914890482024133),
 ('Consider Phlebas', -0.5914890482024133),
 ('Vingt mille lieues sous les mers', -0.5914890482024133),
 ('Larger than Life', -0.6309373933118425),
 ('Kaleidoscope', -0.6309373933118425),
 ('Love and War', -0.6474080428711634),
 ('Stranger in a Strange Land', -0.7097947287908635),
 ('The Diamond Age', -0.7097947287908635),
 ('Now Wait for Last Year', -0.709794