In [3]:
import pandas as pd
from langdetect import detect
import string
import data_collector
import parser
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict
import pickle

# 1. Data Collection

## Get the list of the books
We already have the list of books in the pc, so we won't do it again.

Set to `True` both dirs, bests and links parameters to create the correct directories and download the txt containing all the html links.

In [2]:
data_collector.download_books(dirs=False, bests=False, links=False)

## 1.1 Crawl books
We already have all the htmls in the pc, so we won't do it again.

Set to `True` both the books and fails parameters to download all the html pages and remove the ones with broken pages.

In [3]:
data_collector.download_books(books=False, fails=False)

## 1.2 Parse downloaded pages
Set to `True` the create parameter to parse the downloaded html pages and create the tsv file.

In [4]:
parser.create_tsv(create=False)

In [5]:
df = pd.read_csv('parsed_books.tsv', sep='\t')

In [6]:
df.shape

(29959, 12)

In [7]:
df.head()

Unnamed: 0,bookTitle,bookSeries,bookAuthors,ratingValue,ratingCount,reviewCount,Plot,numberOfPages,PublishingDate,Characters,Setting,Url
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,6408798.0,172554.0,"Could you survive on your own in the wild, wit...",374.0,September 14th 2008,Katniss Everdeen Peeta Mellark Cato (Hunger Ga...,"District 12, Panem Capitol, Panem Panem",https://www.goodreads.com/book/show/2767052-th...
1,Harry Potter and the Order of the Phoenix,Harry Potter #5,J.K. Rowling,4.5,2525157.0,42734.0,There is a door at the end of a silent corrido...,870.0,September 2004,Sirius Black Draco Malfoy Ron Weasley Petunia ...,Hogwarts School of Witchcraft and Wizardry Lon...,https://www.goodreads.com/book/show/2.Harry_Po...
2,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,4527405.0,91802.0,The unforgettable novel of a childhood in a sl...,324.0,May 23rd 2006,Scout Finch Atticus Finch Jem Finch Arthur Rad...,"Maycomb, Alabama",https://www.goodreads.com/book/show/2657.To_Ki...
3,Pride and Prejudice,,Jane Austen,4.26,3017830.0,67811.0,Alternate cover edition of ISBN 9780679783268S...,279.0,October 10th 2000,Mr. Bennet Mrs. Bennet Jane Bennet Elizabeth B...,"United Kingdom Derbyshire, England England Her...",https://www.goodreads.com/book/show/1885.Pride...
4,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,4989910.0,104912.0,About three things I was absolutely positive.F...,501.0,September 6th 2006,Edward Cullen Jacob Black Laurent Renee Bella ...,"Forks, Washington Phoenix, Arizona Washington ...",https://www.goodreads.com/book/show/41865.Twil...


## 1.3 Dataset cleaning [preliminary steps]
Before actually jumping into the work itself, we want our dataframe to be clean, meaning that there are some preliminary steps we need to perform on it. First of all, missing data is something we should pay attention to. Lot's of rows are going to have missing data somewhere, and dealing with missing data it's not that nice. Notice that this will include different strategies for each of the column we will be considering (more details below). Then there is the problem with punctuation, stopwords, stems and so on so forth, so basic text data preprocessing. Let's make a brief recap:

1. **Missing data**
    - `bookTitle`: if a book is missing the title, then we can safely just remove the instance. In fact, books that are missing the title are actually missing all the informations, meaning that there is a problem with the GoodReads specific link. Also, even if a book was missing just the title, we wouldn't have a way to refer to it, thus it wouldn't be really useful considering we're building a search engine.
    - `bookSeries`, `Authors`, `Plot`, `PublishingDate`, `Characters`, `Setting`: if a book is missing one of the above mentioned columns, we can still include the book in the data, since the search engine could for example work with just the title. Obviously, we cannot just leave the values missing, since it would be really hard to perform any operation on that. These are all text columns, therefore the best way to address the missing values prolem is to replace NaNs with empty strings.
    - `ratingValue`, `NumberofPages`: TODO?
2. **Text data preprocessing**
    - Punctuation removal: this is the first step we want to perform, since it is going to make the next steps much easier (e.g., language detection will be easier if there aren't plots composed just by punctuation symbols).
    - Language detection: before doing anything else, we want to remove the books that present the books for which the plot isn't in english.
    - Stopwords removal (of the `Plot` column only)
    - Stemming (of the `Plot` column only)
    - Lowercase

### Missing values

#### Title
There are 774 books that are completely empty, and these corresponds to the ones that are missing the `bookTitle` column. If you give a look at the url, you can see that these are not given by our python script to download and parse the books, but actually from the fact that the link is broken. Also, you can see that all the books that are missing the `bookTitle` are also missing all the remaining data.

This means that we can safely just remove all the rows that are missing the `bookTitle` column.

In [8]:
n_missing = df[(df['bookTitle'].isna())].shape[0]
print('There are {} instances that are missing the `bookTitle` column.'.format(n_missing))
print()
df[(df['bookTitle'].isna())].head()

There are 774 instances that are missing the `bookTitle` column.



Unnamed: 0,bookTitle,bookSeries,bookAuthors,ratingValue,ratingCount,reviewCount,Plot,numberOfPages,PublishingDate,Characters,Setting,Url
311,,,,,,,,,,,,https://www.goodreads.com/book/show/40937505\r\n
370,,,,,,,,,,,,https://www.goodreads.com/book/show/30528535\r\n
379,,,,,,,,,,,,https://www.goodreads.com/book/show/30528544\r\n
789,,,,,,,,,,,,https://www.goodreads.com/book/show/40941582\r\n
1141,,,,,,,,,,,,https://www.goodreads.com/book/show/5295735\r\n


In [9]:
# Remove empty books
df = df[(df['bookTitle'].notna())]

#### Text data

In [10]:
str_columns = ['bookSeries', 'bookAuthors', 'Plot', 'PublishingDate', 'Characters', 'Setting']

for col in str_columns:
    df[col] = df[col].fillna('')

### Text data preprocessing

#### Punctuation removal

**Observations**:

There are several ways to remove punctuations, including the use of exernal libraries (like nltk). But actually the fastest way to perform punctuation removal is the use of the internal methong translate, which is programmed in C and therefore it's much faster than the other options (give a look to this [link](https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string) for a nice performance analysis of the various options).

In [11]:
def remove_punctuation(s):
    return s.translate(str.maketrans('', '', string.punctuation + '’'))

In [12]:
for col in str_columns:
    df[col] = df[col].apply(remove_punctuation)

#### Language detection
There are four possibilities `Plot` column of a given book:
1. It is written in english
2. It is written in another language
3. It is empty
4. It contains symbols, numbers, and so on

We want to keep only the ones written in english or empty, so we are just going to discard the others.

In [13]:
def language(s):
    if s == '':
        return 'empty'
    try:
        return detect(s)
    except:
        return 'symbols'

In [14]:
df['plot_lang'] = df['Plot'].apply(language)

In [15]:
df = df[(df['plot_lang'] == 'en') | (df['plot_lang'] == 'empty')].drop(columns=['plot_lang'])

In [16]:
df.shape

(26999, 12)

#### Stopwords removal
We are not going to perform stopwords removal on all the columns, since we could remove important things (e.g., we don't want to remove anything from the names of the characters). The only column on which stopwords removal is necessary is `Plot`.

In [17]:
def remove_stopwords(s):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(s)
    return ' '.join([w for w in tokens if w not in stop_words])

In [18]:
df['Plot'] = df['Plot'].apply(remove_stopwords)

#### Stemming
As for the stopwords removal, the only column on which stemming is necessary is `Plot`.

In [19]:
def stemming(s):
    ps = PorterStemmer()
    tokens = word_tokenize(s)
    return ' '.join([ps.stem(w) for w in tokens])

In [20]:
df['Plot'] = df['Plot'].apply(stemming)

#### Lowercase
On the other hand, we want all the string columns to be lowercase, so that our search engine won't have problems with upper/lower case differences.

In [21]:
for col in str_columns:
    df[col] = df[col].apply(lambda w: w.lower())

In [22]:
df.head()

Unnamed: 0,bookTitle,bookSeries,bookAuthors,ratingValue,ratingCount,reviewCount,Plot,numberOfPages,PublishingDate,Characters,Setting,Url
0,The Hunger Games,the hunger games 1,suzanne collins,4.33,6408798.0,172554.0,could surviv wild everi one make sure dont liv...,374.0,september 14th 2008,katniss everdeen peeta mellark cato hunger gam...,district 12 panem capitol panem panem,https://www.goodreads.com/book/show/2767052-th...
1,Harry Potter and the Order of the Phoenix,harry potter 5,jk rowling,4.5,2525157.0,42734.0,there door end silent corridor and haunt harri...,870.0,september 2004,sirius black draco malfoy ron weasley petunia ...,hogwarts school of witchcraft and wizardry lon...,https://www.goodreads.com/book/show/2.Harry_Po...
2,To Kill a Mockingbird,to kill a mockingbird,harper lee,4.28,4527405.0,91802.0,the unforgett novel childhood sleepi southern ...,324.0,may 23rd 2006,scout finch atticus finch jem finch arthur rad...,maycomb alabama,https://www.goodreads.com/book/show/2657.To_Ki...
3,Pride and Prejudice,,jane austen,4.26,3017830.0,67811.0,altern cover edit isbn 9780679783268sinc immed...,279.0,october 10th 2000,mr bennet mrs bennet jane bennet elizabeth ben...,united kingdom derbyshire england england hert...,https://www.goodreads.com/book/show/1885.Pride...
4,Twilight,the twilight saga 1,stephenie meyer,3.6,4989910.0,104912.0,about three thing i absolut positivefirst edwa...,501.0,september 6th 2006,edward cullen jacob black laurent renee bella ...,forks washington phoenix arizona washington state,https://www.goodreads.com/book/show/41865.Twil...


In [109]:
df = df.reset_index(drop=True).reset_index()

### Save data

In [110]:
df.to_csv('clean_data.csv', index=False)

# 2. Search Engine

## 2.1 Conjunctive query

### Create your index!

In [2]:
df = pd.read_csv('clean_data.csv')

In [2]:
# To save and load python dictionaries

def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [85]:
def term_index(documents):
    words = set()
    for s in documents:
        tokens = set(word_tokenize(s))
        words.update(tokens)
        
    term_index = {}
    for i, word in enumerate(words):
        term_index[word] = i
    return term_index

In [97]:
term_indexes = term_index(df['Plot'])

In [88]:
save_obj(term_indexes, 'term_index')

In [94]:
from collections import defaultdict
def inverted_index(documents, term_indexes):
    inv_index = defaultdict(list)
    for i, s in enumerate(documents):
        tokens = set(word_tokenize(s))
        for token in tokens:
            token_index = term_indexes[token]
            inv_index[token_index].append('document_{}'.format(i))
    return inv_index

In [98]:
inv_indexes = inverted_index(df['Plot'], term_indexes)

In [103]:
save_obj(inv_indexes, 'inverted_index')

### Execute the query

In [4]:
def execute_query(query):
    df = pd.read_csv('clean_data.csv')
    term_indexes = load_obj('term_index')
    inv_indexes = load_obj('inverted_index')
    
    
    query_tokens = [ps.stem(w) for w in word_tokenize(query)]
    # Create term indexes for the query
    # notice: if one of the query element doesn't appear in the term_indexes dictionary
    # we can safely say that the **conjunctive** query has to return nothing
    term_indexes_tokens = []
    for token in query_tokens:
        try:
            term_indexes_tokens.append(term_indexes[token])
        except:
            return
    
    query_inv_indexes = {}
    for token_index in term_indexes_tokens:
        query_inv_indexes[token_index] = set(inv_indexes[token_index])
    
    # Since it is a conjuntive query, we need to intersect the results of each query token
    documents = set.intersection(*query_inv_indexes.values())
    documents_id = [int(document.split('_')[1]) for document in documents]
    documents_id.sort()
    
    return df[df['index'].isin(documents_id)][['bookTitle', 'Plot', 'Url']]

In [5]:
execute_query('capitol love')

NameError: name 'ps' is not defined