In [63]:
import pandas as pd
from tokenize import tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

In [25]:
import nltk
# word_tokenize (used by tokenize() and by the query cells) needs the Punkt
# models, which this notebook never fetched; download them as well so a fresh
# environment can run every cell.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')
# Stop-word lists read by stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michalpurtak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
# English stop-word list from NLTK, stored as a set; remove_stop_words tests
# token membership against it.
stop_words = set(stopwords.words('english'))

In [64]:
# Raw book table; later cells read its 'name' and 'abstract' columns.
books_df = pd.read_csv("books.csv")

In [56]:
def remove_stop_words(row):
    """Drop stop words and punctuation-only tokens from a row's abstract.

    Args:
        row: Mapping or Series whose 'abstract' entry is an iterable of
            string tokens.

    Returns:
        list: The tokens in their original order, without members of the
        module-level ``stop_words`` set and without tokens that consist
        entirely of ``string.punctuation`` characters.
    """
    punctuation_chars = set(string.punctuation)
    # `w in string.punctuation` is a substring test, so multi-character
    # punctuation tokens such as "``", "''" or "..." were kept; checking
    # character by character removes them too.
    return [
        w for w in row['abstract']
        if w not in stop_words
        and not all(ch in punctuation_chars for ch in w)
    ]

In [27]:
def tokenize(row):
    """Split a row's abstract text into word tokens.

    Note: this definition rebinds the name ``tokenize`` that the import cell
    brought in from the standard-library ``tokenize`` module.

    Args:
        row: Mapping or Series holding the raw abstract text under 'abstract'.

    Returns:
        The result of ``word_tokenize`` for that text.
    """
    abstract_text = row['abstract']
    return word_tokenize(abstract_text)

In [28]:
def stemWords(row):
    """Stem every token of a row's abstract with the English Snowball stemmer.

    Args:
        row: Mapping or Series whose 'abstract' entry is an iterable of tokens.

    Returns:
        list: One stemmed token per input token, in the same order.
    """
    # The stemmer is created with ignore_stopwords=True, as in the query cells.
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    return [stemmer.stem(token) for token in row['abstract']]

In [29]:
# Run tokenize() on every row (axis=1), giving one tokenised abstract per book.
tokenized_abstracts = books_df.apply(tokenize, axis=1)

In [30]:
# (name, tokens) pairs.
# NOTE(review): no later cell visible in this notebook reads books_tokenized —
# presumably a leftover; confirm before removing.
books_tokenized = list(zip(books_df['name'], tokenized_abstracts))

In [31]:
# Same 'name'/'abstract' layout as books_df, but 'abstract' now holds the
# tokenised text, so the row-wise helpers (stemWords, ...) can be applied to it.
books_tokenized_df = pd.DataFrame({'name':books_df['name'], 'abstract':tokenized_abstracts})

In [12]:
# Persist the tokenised frame; `index` is left at its default, so the row index
# is written as an extra first column.
books_tokenized_df.to_csv("books_tokenized.csv")

In [32]:
# Stem the token list of every row (stemWords is applied row by row).
stemmed_abstracts = books_tokenized_df.apply(stemWords, axis = 1)

In [34]:
# Preview the first five stemmed abstracts.
stemmed_abstracts.head()

0    [1066, and, all, that, :, a, memor, histori, o...
1    [1066, and, all, that, :, a, memor, histori, o...
2    [2010, :, odyssey, two, is, a, 1982, scienc, f...
3    [a, crown, of, sword, (, abbrevi, as, aco, by,...
4    [publish, on, 15, april, 1755, and, written, b...
dtype: object

In [43]:
# Book names paired with their stemmed token lists (stop words and punctuation
# are still present at this point).
books_tokenized_stemmed_df = pd.DataFrame({'name':books_tokenized_df['name'], 'abstract':stemmed_abstracts})

In [44]:
# Preview the first five rows of the stemmed frame.
books_tokenized_stemmed_df.head()

Unnamed: 0,name,abstract
0,1066 and All That,"[1066, and, all, that, :, a, memor, histori, o..."
1,1066 and All That: A Memorable History of Eng...,"[1066, and, all, that, :, a, memor, histori, o..."
2,2010: Odyssey Two,"[2010, :, odyssey, two, is, a, 1982, scienc, f..."
3,A Crown of Swords,"[a, crown, of, sword, (, abbrevi, as, aco, by,..."
4,A Dictionary of the English Language,"[publish, on, 15, april, 1755, and, written, b..."


In [57]:
# Filter the stemmed tokens of every row through remove_stop_words.
stopwords_removed_abstracts = books_tokenized_stemmed_df.apply(remove_stop_words, axis=1)

In [58]:
# Preview the first five filtered abstracts.
stopwords_removed_abstracts.head()

0    [1066, memor, histori, england, compris, part,...
1    [1066, memor, histori, england, compris, part,...
2    [2010, odyssey, two, 1982, scienc, fiction, no...
3    [crown, sword, abbrevi, aco, fan, seventh, boo...
4    [publish, 15, april, 1755, written, samuel, jo...
dtype: object

In [59]:
# NOTE: rebinds books_tokenized_stemmed_df. From here on 'abstract' holds the
# stemmed tokens after remove_stop_words, not the stemmed-only lists that the
# earlier assignment to this name stored.
books_tokenized_stemmed_df = pd.DataFrame({'name':books_tokenized_df['name'], 'abstract':stopwords_removed_abstracts})

In [62]:
# Persist the stemmed, stop-word-filtered frame (row index included, since
# `index` is left at its default).
books_tokenized_stemmed_df.to_csv("books_tokenized_stemmed.csv")

In [99]:
# Inverted index, initially empty; the indexing cells below fill it as
# token -> {book name -> count}.
search_index = {}

In [69]:
# Book titles, used as the inner keys of search_index by the indexing loop.
book_names = books_tokenized_stemmed_df['name']

In [70]:
def fillIndexWithWordsFromAbstract(row):
    """Add the tokens of one row to the module-level ``search_index``.

    For every token in ``row['abstract']`` the entry
    ``search_index[token][row['name']]`` is incremented by one (created at 1
    on first sight), so the index maps token -> {book name -> occurrences}.

    Args:
        row: Mapping or Series with a 'name' entry and an 'abstract' entry
            that is an iterable of tokens.

    Returns:
        None. The function only mutates ``search_index``.
    """
    title = row['name']
    for word in row['abstract']:
        # The per-word dict must be consulted for the title. The previous
        # version looked the title up in search_index itself, so repeated
        # words never counted above 1 and a title equal to an indexed word
        # raised KeyError.
        postings = search_index.setdefault(word, {})
        postings[title] = postings.get(title, 0) + 1

In [90]:
# Number of books in the collection.
# NOTE(review): no later cell visible in this notebook reads no_documents —
# confirm whether it is still needed.
no_documents = len(book_names)

In [91]:
# Token lists to index, one per book, taken from the same frame (and therefore
# the same row order) as book_names.
documents = books_tokenized_stemmed_df['abstract']
documents[:5]

0    [1066, memor, histori, england, compris, part,...
1    [1066, memor, histori, england, compris, part,...
2    [2010, odyssey, two, 1982, scienc, fiction, no...
3    [crown, sword, abbrevi, aco, fan, seventh, boo...
4    [publish, 15, april, 1755, written, samuel, jo...
Name: abstract, dtype: object

In [100]:
# Build the inverted index: token -> {book name -> occurrences in its abstract}.
# Names and token lists are paired positionally with zip instead of looking the
# name up by an integer label on every token, so the pairing does not depend on
# the Series index being 0..n-1.
number_of_doc = 0  # running count of indexed documents (kept for later cells)
for book_name, document in zip(book_names, documents):
    for word in document:
        postings = search_index.setdefault(word, {})
        postings[book_name] = postings.get(book_name, 0) + 1
    number_of_doc += 1

In [89]:
# When the notebook is run top to bottom, the loop cell above has already
# populated search_index; running the row-wise filler over the same rows again
# would modify the counts that were just built. Only use it on an empty index.
nothing = None
if not search_index:
    nothing = books_tokenized_stemmed_df.apply(fillIndexWithWordsFromAbstract, axis=1)

In [102]:
# Spot-check one entry: book name -> number of times the token 'death' occurs
# in that book's processed abstract.
search_index['death']

{'Anne of Green Gables': 1,
 'Commentarii de Bello Gallico': 1,
 '(Commentaries on the Gallic War)': 1,
 'How Green Was My Valley': 1,
 'Mort': 2,
 'Northanger Abbey': 2,
 'Stranger in a Strange Land': 1,
 'The Book of the Courtier': 1,
 'The Hound of the Baskervilles': 1,
 'The Salmon of Doubt': 2,
 'L’Étranger': 2,
 'The Stranger or The Outsider': 2,
 'The Transmigration of Timothy Archer': 1,
 'The Wasp Factory': 1,
 'Wuthering Heights': 1,
 'The Great Betrayal': 1,
 'The Sands of Time': 1,
 'All Fall Down,': 1,
 'The Brandon deWilde Story': 1,
 'All Fall Down': 1,
 'Between Heaven and Hell': 1,
 'Blue Murder': 1,
 'Closing Time': 1,
 'Come as You Are: The Story of Nirvana': 1,
 'Death Star': 4,
 'Death in Venice': 1,
 'Der Tod in Venedig': 1,
 'Gangster': 1,
 'Joe Cinque’s Consolation: A True Story of Death, Grief and the Law': 2,
 'Man Overboard': 1}

In [103]:
# Sample free-text query for trying out the tokenise + stem steps below.
query = "science fiction novel"

In [107]:
# Put the sample query through the same tokenise + stem steps as the abstracts.
porter = SnowballStemmer("english", ignore_stopwords=True)
query_tokenized = word_tokenize(query)
query_tokenized_stemmed = [porter.stem(word) for word in query_tokenized]

query_tokenized_stemmed

['scienc', 'fiction', 'novel']

In [126]:
def execute_query(query):
    """Return the books whose abstracts contain every term of ``query``.

    The query is tokenised and stemmed like the abstracts. Stop words and
    punctuation-only tokens are then dropped, because the indexing pipeline
    removes them from the abstracts as well, so they can never match.

    Args:
        query: Free-text query string.

    Returns:
        set: Names of the books listed in ``search_index`` under every
        remaining query term (may be empty).
        str: The literal "no results" when a term is missing from the index
        or when no searchable term remains (empty query, or only stop words
        and punctuation).
    """
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    punctuation_chars = set(string.punctuation)

    terms = []
    for token in word_tokenize(query):
        term = stemmer.stem(token)
        # Filter after stemming, in the same order as the document pipeline.
        if term in stop_words or all(ch in punctuation_chars for ch in term):
            continue
        terms.append(term)

    # set.intersection() with no arguments raises TypeError, so a query with
    # no usable terms has to be answered before reaching it.
    if not terms:
        return "no results"

    try:
        postings = [set(search_index[term]) for term in terms]
    except KeyError:
        return "no results"
    return set.intersection(*postings)

In [129]:
# The recorded output for this call is 'no results'.
# NOTE(review): presumably because "down" is in NLTK's English stop-word list
# and was therefore stripped from the abstracts before indexing — confirm.
execute_query("down")

'no results'