In [None]:
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('default')

from tqdm import tqdm

import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

nltk.download('wordnet')
nltk.download('stopwords')

from utils.caching import cache, ucache

%load_ext autoreload
%autoreload 2

# Data reading and cleanup

The [Book Summary Dataset](https://www.cs.cmu.edu/~dbamman/booksummaries.html) is used.

The entries in the dataset (taken from its README file) are:
 1. Wikipedia article ID
 2. Freebase ID
 3. Book title
 4. Author
 5. Publication date
 6. Book genres (Freebase ID:name tuples)
 7. Plot summary

To save time, I rename them as follows:

| Original name     | renaming     |
| :---: | :---: |
| Wikipedia article ID | id |
| Freebase ID | f_id |
| Book title | title |
| Author | author |
| Publication date | date |
| Book genres (Freebase ID:name tuples) | genres |
| Plot summary | summary |




In [None]:
# Short column names for the seven tab-separated fields, in file order.
# Passing them through `names=` makes pandas read the first line as data.
headers = [
    "id",       # Wikipedia article ID
    "f_id",     # Freebase ID
    "title",    # book title
    "author",
    "date",     # publication date
    "genres",   # Freebase ID:name tuples
    "summary",  # plot summary
]

df_original = pd.read_csv(
    'dataset/booksummaries.txt',
    sep='\t',
    names=headers,
)
print(f'Number of documents: {len(df_original)}')
df_original.head()

## Data preprocessing

First make a dataframe for a nice visualization:
 - drop the Freebase ID column
 - convert the genres to lists of lowercase strings

Then add the tokenized columns for `summary` and `title`:
 - convert everything to lowercase
 - keep only alphabetic characters (drop digits and punctuation)
 - strip accents
 - On the `summary` column:
   - remove stopwords
   - remove single-character tokens, which carry no meaning
   - lemmatize words

Build bag-of-words of each document, where the format is:

 - {word: n_occurrences}


Finally, build the vocabulary of the summaries. Only the words that appear in more than 1% of the documents (`min_df = 0.01`) are kept in the vocabulary, i.e. the ones whose document frequency exceeds `min_df * len(df)`; the exact threshold is printed when the vocabulary is built.

In [None]:
# Drop the Freebase ID column; `drop` returns a new frame, so `df_original`
# stays untouched.
df = df_original.drop(columns='f_id')

# Each `genres` cell is a JSON object; only its values (the genre names) are
# kept, as a list of lowercase strings. Lowercasing is done on the parsed
# names rather than on the raw JSON text, so that keys and JSON escape
# sequences are never altered before parsing.
# Missing genres (NaN) are left as they are through `na_action='ignore'`.
df['genres'] = df['genres'].map(
    lambda raw: [name.lower() for name in json.loads(raw).values()],
    na_action='ignore',
)

df.head()

In [None]:
# Stopwords are stored in a set so that membership tests are constant-time
stopwords_set = set(stopwords.words('english'))

# Tokenizer: every maximal run of lowercase ASCII letters is one token;
# anything else (digits, punctuation, whitespace, ...) separates tokens
tokenizer = RegexpTokenizer(r'[a-z]+')

# WordNet lemmatizer, shared by all preprocessing calls
lemmatizer = WordNetLemmatizer()

# define all mapping functions for proper tokenization
def strip_accents(text: str):
    """Return `text` with its combining marks (accents, diacritics) removed.

    The text is first decomposed with Unicode NFD, which splits an accented
    letter into its base letter followed by combining marks. Every character
    whose Unicode category is ``Mn`` (nonspacing mark) is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)

def tokenize(text: str) -> list:
    """Split `text` into tokens with the notebook-level `tokenizer`.

    That tokenizer is a `RegexpTokenizer` built with the pattern ``[a-z]+``,
    so each token is a maximal run of lowercase ASCII letters. Uppercase
    letters, digits and punctuation never end up in a token, which is why the
    text is expected to be lowercased (and accent-stripped) beforehand.
    """
    return tokenizer.tokenize(text)

def remove_single_chars(words: list):
    """Return the tokens of `words` that have at least two characters.

    Single-character tokens and empty strings are dropped; the order of the
    remaining tokens is preserved.
    """
    kept = []
    for token in words:
        if len(token) >= 2:
            kept.append(token)
    return kept

def lemmatize(words: list):
    """Lemmatize every token with the notebook-level WordNet `lemmatizer`.

    Each token is lemmatized on its own with the lemmatizer's default
    settings; the result has the same length and order as `words`.
    """
    return list(map(lemmatizer.lemmatize, words))

def remove_stopwords(words: list):
    """Return the tokens of `words` that are not in `stopwords_set`.

    The order of the remaining tokens is preserved.
    """
    return list(filter(lambda token: token not in stopwords_set, words))

def apply_preprocessing(text: str):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, strip accents, tokenize, drop
    single-character tokens, drop stopwords, lemmatize, and finally drop
    single-character tokens once more. The last step is needed because
    lemmatization can shorten a token, so the length filter has to be
    re-applied to the final output.

    Args:
        text: Raw text of one cell (a title or a summary). Any non-string
            value, such as the NaN pandas uses for a missing cell, yields an
            empty list instead of raising.

    Returns:
        list of str: the processed tokens, in order of appearance.

    NOTE(review): the notebook stores the output of this function through
    `cache(...)`; presumably an existing `prep_*.pck` file is reused, in which
    case it must be deleted for a change here to take effect -- confirm
    against `utils.caching`.
    """
    if not isinstance(text, str):
        return []
    normalized = strip_accents(text.lower())
    tokens = remove_single_chars(tokenize(normalized))
    tokens = remove_stopwords(tokens)
    return remove_single_chars(lemmatize(tokens))

# Registers the `progress_*` methods (e.g. `progress_map`) on pandas objects
tqdm.pandas()

# One tokenized companion column per text column: `title_t` and `summary_t`
for key in ('title', 'summary'):
    df[f'{key}_t'] = cache(f'prep_{key}.pck', df[key].progress_map, apply_preprocessing)

df.head()

Build bag-of-words

In [None]:
# define function to build bag-of-words
def build_bow(doc: list):
    """Build the bag-of-words of one tokenized document.

    Returns a dict mapping each distinct token of `doc` to its number of
    occurrences, ordered by decreasing count. Tokens with the same count keep
    the order in which they first appear in `doc`.
    """
    counts = {}
    for token in doc:
        counts[token] = counts.get(token, 0) + 1
    ranked = list(counts.items())
    # list.sort is stable, also with reverse=True, so ties keep insertion order
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked)


# Build the bag-of-words of every summary (titles are not needed here);
# the counts are the term-frequency part of the tf-idf matrix.
# `time.perf_counter()` is the clock meant for measuring elapsed time: unlike
# `time.time()` it is monotonic and is not affected by system clock updates.
t0 = time.perf_counter()
df['summary_bow'] = df['summary_t'].map(build_bow)
t1 = time.perf_counter()
print(f'execution time: {t1-t0:.2f}s')

df.head()

Build vocabulary

In [None]:
from utils.document_frequency import compute_document_frequency

# Concatenate the distinct words of every summary, document after document.
# A word is listed once for each document that contains it, so the list holds
# repeated entries across documents.
vocabulary_complete = []
for doc_bow in df['summary_bow']:
    vocabulary_complete.extend(doc_bow.keys())

# Document frequency of the words; the computation goes through `cache`
document_frequency = cache('doc_freq.pck', compute_document_frequency, df, vocabulary_complete)

In [None]:
# A word is kept only if it appears in strictly more than this share of the documents
min_df = 0.01
min_doc_count = len(df)*min_df
print(f'Min number of document frequency: {len(df)*min_df}')

vocabulary = [
    token
    for token, freq in document_frequency.items()
    if freq > min_doc_count
]

# Two-way mappings between a word and its index in `vocabulary`
token2id = {word: idx for idx, word in enumerate(vocabulary)}
id2token = {idx: word for word, idx in token2id.items()}

print(f'Vocabulary size: {len(vocabulary)}')

# Tf-idf

First build the term-docs matrix

In [None]:
def compute_terms_docs(df, id2token):
    """Build the term-document count matrix.

    Args:
        df: DataFrame with a `summary_bow` column holding one
            ``{word: n_occurrences}`` dict per document.
        id2token: Mapping ``row index -> word``. The indices are used directly
            as row numbers, so they are expected to be
            ``0 .. len(id2token) - 1`` and the words to be distinct.

    Returns:
        np.ndarray of shape ``(len(id2token), len(df))``: entry ``[i, j]`` is
        the number of occurrences of word ``i`` in document ``j``, and 0 when
        the word does not occur in that document.
    """
    # The matrix is sized from the arguments only, so the function does not
    # depend on any notebook-level variable.
    term_docs = np.zeros((len(id2token), len(df)))
    token_row = {word: i for i, word in id2token.items()}
    # Each bag-of-words is walked once and the row of every word is looked up,
    # so the cost grows with the number of (document, word) pairs instead of
    # len(id2token) * len(df).
    for j, bow in enumerate(tqdm(df['summary_bow'])):
        for word, count in bow.items():
            i = token_row.get(word)
            if i is not None:  # words outside the vocabulary are ignored
                term_docs[i, j] = count
    return term_docs

# Term-document counts, computed through `cache`
terms_docs = cache('terms_docs.pck', compute_terms_docs, df, id2token)
print('{} words and {} documents.'.format(terms_docs.shape[0], terms_docs.shape[1]))

In [None]:
def compute_tf_idf(df, id2token):
    """Build the tf-idf matrix of the summaries.

    Entry ``[i, j]`` is ``log(1 + tf) * log(N / df_i)`` where ``tf`` is the
    number of occurrences of word ``i`` in document ``j``, ``N = len(df)`` is
    the number of documents and ``df_i`` is the document frequency of word
    ``i``. The inverse document frequency ``log(N / df_i)`` lowers the weight
    of words that appear in many documents; a word present in every document
    gets weight 0.

    Args:
        df: DataFrame with a `summary_bow` column holding one
            ``{word: n_occurrences}`` dict per document.
        id2token: Mapping ``row index -> word``. The indices are used directly
            as row numbers, so they are expected to be
            ``0 .. len(id2token) - 1`` and the words to be distinct.

    Returns:
        np.ndarray of shape ``(len(id2token), len(df))``; entries of words
        absent from a document are 0.

    Reads the notebook-level `document_frequency`, assumed to map each word to
    the number of documents that contain it (this is how the vocabulary filter
    compares it against ``len(df) * min_df``).

    NOTE(review): the notebook calls this function through `cache(...)`;
    presumably an existing cache file is reused, in which case it must be
    deleted for a change here to take effect -- confirm against
    `utils.caching`.
    """
    n_docs = len(df)
    tf_idf = np.zeros((len(id2token), n_docs))
    token_row = {word: i for i, word in id2token.items()}
    # The idf factor depends on the word only: compute it once per word
    idf = {word: np.log(n_docs / document_frequency[word]) for word in token_row}
    # Each bag-of-words is walked once, so the cost grows with the number of
    # (document, word) pairs instead of len(id2token) * len(df).
    for j, bow in enumerate(tqdm(df['summary_bow'])):
        for word, count in bow.items():
            i = token_row.get(word)
            if i is not None:  # words outside the vocabulary are ignored
                tf_idf[i, j] = np.log(1 + count) * idf[word]
    return tf_idf


# NOTE(review): the cache name 'if-idf.pck' looks like a typo for 'tf-idf';
# it is kept as is, presumably so that an existing cache file is still found.
tf_idf = cache('if-idf.pck', compute_tf_idf, df, id2token)
print('{} words and {} documents.'.format(tf_idf.shape[0], tf_idf.shape[1]))

# LSA on tf-idf matrix

Compute the singular value decomposition (SVD) of the tf-idf matrix

In [None]:
# Reduced SVD of the tf-idf matrix (`full_matrices=False`): `u` has one column
# and `vt` one row per singular value; `np.linalg.svd` returns the singular
# values `s` sorted in descending order.
# NOTE(review): the call goes through `cache`, which presumably forwards the
# arguments to `np.linalg.svd` and reuses 'svd_tf_idf.pck' when it exists --
# confirm, and recompute if `tf_idf` has changed.
u, s, vt = cache('svd_tf_idf.pck', np.linalg.svd, tf_idf, full_matrices=False)

In [None]:
# Figure size is given in pixels and converted to inches through the dpi
w, h, dpi = 960, 360, 100
fig, axs = plt.subplots(ncols=2, figsize=(w/dpi, h/dpi), dpi=dpi)

# Left panel: the whole spectrum
axs[0].plot(s)
axs[0].set_title('All singular values')

# Right panel: zoom on the first 30 singular values
axs[1].plot(s[:30], '.-')
axs[1].set_title('First 30 singular values')

# Label both panels so that the figure can be read on its own
for panel in axs:
    panel.set_xlabel('index')
    panel.set_ylabel('singular value')
    panel.grid()

fig.suptitle('Singular values')
plt.show()

In [None]:
def diff_function(x):
    """Return the forward differences ``x[i+1] - x[i]`` along the first axis.

    The result has one element fewer than `x`. `x` must support slicing and
    element-wise subtraction (e.g. a NumPy array).
    """
    following = x[1:]
    preceding = x[:-1]
    return following - preceding

def curvature_function(s):
    """Discrete signed curvature of the sequence `s`.

    `s` is treated as samples ``y_i = s[i]`` of a curve with unit spacing, and
    the curvature ``y'' / (1 + y'**2) ** 1.5`` is evaluated at every interior
    point with finite differences:

        y''_i = s[i+1] - 2*s[i] + s[i-1]     (second difference)
        y'_i  = (s[i+1] - s[i-1]) / 2        (central first difference)

    The denominator uses the first derivative, as the curvature formula
    requires, not the second difference.

    Args:
        s: 1-D array-like of numbers (e.g. the singular values).

    Returns:
        np.ndarray of length ``len(s) - 2`` (empty when `s` has fewer than
        three elements): the curvature at the interior points of `s`.
    """
    s = np.asarray(s, dtype=float)
    step = s[1:] - s[:-1]                 # forward differences
    second = step[1:] - step[:-1]         # second difference at interior points
    first = (step[1:] + step[:-1]) / 2    # mean of adjacent steps = central difference
    return second / (1 + np.square(first)) ** 1.5

# Figure size is given in pixels and converted to inches through the dpi
w, h, dpi = 640, 360, 100
fig, ax = plt.subplots(figsize=(w/dpi, h/dpi), dpi=dpi)

# Curvature of the singular value curve, first 30 points only
ax.plot(curvature_function(s)[:30], 'o-')
ax.set_xlabel('index')
ax.set_ylabel('curvature')
ax.grid()
# This figure shows the curvature, not the singular values themselves
fig.suptitle('Curvature of the singular values')

plt.show()

Low-rank approximation

In [None]:
# Rank of the approximation: only the first k singular triplets are kept.
# NOTE(review): k = 13 is hard-coded; presumably it was read off the singular
# value plots -- confirm.
k = 13
u_k, s_k, vt_k = u[:, :k], s[:k], vt[:k]

# Rank-k reconstruction of the tf-idf matrix: U_k . diag(s_k) . V_k^T
tf_idf_k = u_k @ np.diag(s_k) @ vt_k

In [None]:
def compute_cosine_similarity(v1, v2):
    """Cosine similarity between two vectors.

    Both vectors are scaled to unit Euclidean norm and their dot product is
    returned, i.e. the cosine of the angle between them (between -1 and 1).

    Args:
        v1, v2: 1-D array-likes of the same length.

    Returns:
        The cosine similarity. If either vector has zero norm the similarity
        is undefined; 0.0 is returned in that case instead of dividing by
        zero and producing NaN.
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return (v1 / norm1).dot(v2 / norm2)