In [None]:
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('default')

from ipywidgets import interact, widgets

from tqdm import tqdm

import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

nltk.download('wordnet')
nltk.download('stopwords')

from utils.caching import cache, ucache

import os
# Create the output directory used by the animations below.
# exist_ok=True makes the call idempotent and avoids the check-then-create race.
os.makedirs('out', exist_ok=True)

%load_ext autoreload
%autoreload 2

# Data reading and cleanup

The [Book Summary Dataset](https://www.cs.cmu.edu/~dbamman/booksummaries.html) is used.

The entries in the dataset (taken from its README file) are:
 1. Wikipedia article ID
 2. Freebase ID
 3. Book title
 4. Author
 5. Publication date
 6. Book genres (Freebase ID:name tuples)
 7. Plot summary

To save time, I rename them as follows:

| Original name     | renaming     |
| :---: | :---: |
| Wikipedia article ID | id |
| Freebase ID | f_id |
| Book title | title |
| Author | author |
| Publication date | date |
| Book genres (Freebase ID:name tuples) | genres |
| Plot summary | summary |




In [None]:
# Column names in file order, following the renaming table above.
headers = ["id", "f_id", "title", "author", "date", "genres", "summary"]

# Tab-separated file; the column names are passed through `names`,
# so the first line of the file is read as data.
df_original = pd.read_csv('dataset/booksummaries.txt', sep='\t', names=headers)
print(f'Number of documents: {len(df_original)}')
df_original.head()

## Data preprocessing

First make a dataframe for a nice visualization:
 - drop the Freebase ID column
 - convert the genres to lists of lowercase strings

Then add the tokenized columns for `summary` and `title`:
 - convert everything to lowercase
 - keep only alphabetic characters (drop digits and punctuation)
 - strip accents
 - On the `summary` column:
   - remove stopwords
   - remove single-character tokens (they carry no meaning on their own)
   - lemmatize words

Build bag-of-words of each document, where the format is:

 - {word: n_occurrences}


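Finally, build the vocabulary of the summaries. Only the words that appear in more than 1% of the documents are kept in the vocabulary, i.e. the ones that appear in more than about 165 of the roughly 16,500 documents (the exact cutoff, `len(df) * min_df`, is printed when the vocabulary is built).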

In [None]:
def _parse_genres(raw):
    """Turn the lower-cased JSON mapping of a genres cell into the list of its values."""
    return list(json.loads(raw.lower()).values())

# work on a copy of the original frame without the Freebase ID column
df = df_original.drop(columns='f_id')

# missing genres stay NaN thanks to na_action='ignore'
df['genres'] = df['genres'].map(_parse_genres, na_action='ignore')

df.head()

In [None]:
# initiate the tokenizer and lemmatizer
# the pattern matches runs of lowercase a-z only, so digits, punctuation
# and any other character act as token separators
tokenizer = RegexpTokenizer(r'[a-z]+')
lemmatizer = WordNetLemmatizer()

# convert stopwords to set for better performance
stopwords_set = set(stopwords.words('english'))

# define all mapping functions for proper tokenization
def strip_accents(text: str):
    """Return `text` without combining marks.

    The text is decomposed with NFD so that accents become separate
    code points, then every code point of category 'Mn' is dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def tokenize(text: str):
    """Split `text` into tokens with the notebook-level `tokenizer`."""
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_single_chars(words: list):
    """Return the words that are longer than one character, in their original order."""
    return list(filter(lambda candidate: len(candidate) > 1, words))

def lemmatize(words: list):
    """Lemmatize every word with the notebook-level `lemmatizer`, keeping the order."""
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def remove_stopwords(words: list):
    """Drop every word found in `stopwords_set`, keeping the remaining words in order."""
    kept = []
    for token in words:
        if token in stopwords_set:
            continue
        kept.append(token)
    return kept

def apply_preprocessing(text: str):
    """Run the full text pipeline and return the list of processed tokens.

    Steps, in order: lowercase, strip accents, tokenize, drop
    single-character tokens, drop stopwords, lemmatize.
    """
    normalized = strip_accents(text.lower())
    tokens = remove_single_chars(tokenize(normalized))
    content_tokens = remove_stopwords(tokens)
    return lemmatize(content_tokens)

tqdm.pandas() # defines the progress_map function

# Tokenize the title and summary columns into new '<column>_t' columns.
# `cache` receives a pickle file name, the callable and its arguments;
# presumably it reuses the stored result when the file exists — confirm in utils.caching.
for key in ['title', 'summary']:
    df[key + '_t'] = cache(f'prep_{key}.pck', df[key].progress_map, apply_preprocessing)

df.head()

Build bag-of-words

In [None]:
# define function to build bag-of-words
def build_bow(doc: list):
    """Build the bag-of-words of one tokenized document.

    Parameters
    ----------
    doc : list
        The tokens of the document.

    Returns
    -------
    dict
        ``{word: n_occurrences}`` ordered by decreasing count; words with the
        same count keep the order in which they first appear in `doc`.
    """
    from collections import Counter

    # most_common() sorts by count (descending) and keeps first-seen order for ties
    return dict(Counter(doc).most_common())


# do it only on the summary
# (wall-clock time of the mapping is measured and printed)
t0 = time.time()
# df['summary_set'] = df['summary_t'].map(lambda s: sorted(set(s))) # build the set just for better performance
df['summary_bow'] = df['summary_t'].map(build_bow) # this enables to build the tf part of the tf-idf matrix
t1 = time.time()
print(f'execution time: {t1-t0:.2f}s')

df.head()

Build vocabulary

In [None]:
# take the complete vocabulary and compute the document frequency
# (the list holds one entry per (document, word) pair, so a word is repeated
# once for every document it occurs in)
vocabulary_complete = [word for doc in df['summary_bow'] for word in doc.keys()]

from utils.document_frequency import compute_document_frequency
# NOTE(review): judging by how it is used when the vocabulary is built, the result
# maps token -> number of documents containing it — confirm in utils.document_frequency
document_frequency = cache('doc_freq.pck', compute_document_frequency, df, vocabulary_complete)

In [None]:
# keep only the tokens whose document frequency exceeds min_df * (number of documents)
min_df = 0.01
print(f'Min number of document frequency: {len(df)*min_df}')
min_doc_count = len(df)*min_df

vocabulary = []
for token, freq in document_frequency.items():
    if freq > min_doc_count:
        vocabulary.append(token)

# token <-> row-index mappings
token2id = {}
for position, word in enumerate(vocabulary):
    token2id[word] = position
id2token = {position: word for word, position in token2id.items()}

print(f'Vocabulary size: {len(vocabulary)}')

# Tf-idf

Build the tf-idf matrix

In [None]:
def compute_tf_idf(df, id2token, doc_freq=None):
    """Build the (terms x documents) tf-idf matrix from the per-document bags-of-words.

    Entry ``(i, j)`` is ``log(1 + tf) * log(N / df_i)``, where ``tf`` is the
    count of term ``i`` in document ``j``, ``N = len(df)`` and ``df_i`` is the
    number of documents containing term ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``summary_bow`` column holding one ``{word: count}`` dict per document.
    id2token : dict
        Maps row index -> term; indices are expected to be ``0..len(id2token)-1``.
    doc_freq : dict, optional
        Maps term -> number of documents containing it. Defaults to the
        notebook-level ``document_frequency``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(id2token), len(df))``.
    """
    if doc_freq is None:
        doc_freq = document_frequency

    n_docs = len(df)
    token2row = {word: i for i, word in id2token.items()}

    # Inverse document frequency. The previous weight was log(df), which grows
    # with the number of documents containing the term, i.e. the opposite of idf.
    idf = {word: np.log(n_docs / doc_freq[word]) for word in token2row}

    # Sized from id2token rather than from the global `vocabulary`.
    tf_idf = np.zeros((len(id2token), n_docs))

    # Visit only the words that actually occur in each document instead of
    # testing every (vocabulary word, document) pair.
    for j, bow in enumerate(tqdm(df['summary_bow'])):
        for word, count in bow.items():
            i = token2row.get(word)
            if i is not None:
                tf_idf[i, j] = np.log(1 + count) * idf[word]
    return tf_idf


# NOTE(review): if `cache` reuses an existing 'if-idf.pck', delete it (and the pickles
# derived from tf_idf) so that the corrected weights are actually recomputed.
tf_idf = cache('if-idf.pck', compute_tf_idf, df, id2token)
print(f'{tf_idf.shape[0]} words and {tf_idf.shape[1]} documents.')

# LSA on tf-idf matrix

Perform SVD on the tf-idf matrix

In [None]:
# Reduced SVD (full_matrices=False): tf_idf = u @ diag(s) @ vt with
# min(tf_idf.shape) singular values; rows of u index terms, columns of vt index documents.
u, s, vt = cache('svd_tf_idf.pck', np.linalg.svd, tf_idf, full_matrices=False)

In [None]:
titles = ["Hercule Poirot's Christmas", "Murder on the Orient express",
          "Nightfall", "Robots and Empire", "Foundation", "Second Foundation",
          "Harry potter and the Philosopher's stone",
          "The fellowship of the Ring", "The Two Towers", "The Return of the King", "The Hobbit",
          "The da Vinci Code", "Angels and Demons",
          "Pride and Prejudice",
          "The Shining",
          "Moby-Dick; or, The Whale", "A Farewell to Arms"]

# Match titles case-insensitively: several entries above are not capitalised
# consistently, so an exact comparison would silently skip those books.
titles_lower = {title.lower() for title in titles}
is_selected = (df['title'].str.lower().isin(titles_lower)
               & df['author'].notna()
               & df['genres'].notna())

# index labels of the selected books that have both an author and genres
doc_subset = list(df[is_selected].index)

In [None]:
# show author, title and genres of the selected books
df.loc[doc_subset, ['author', 'title', 'genres']]

# Visualization

## LSA space of documents

Visualize the first 2 LSA dimensions for the docs

In [None]:
from utils.visualization import plot_docs, animate_k
from utils.caching import ext_cache, get_hash

First, it is a good idea to tune the value of k. This is done by creating an animation with the `FuncAnimation` class from matplotlib.

In [None]:
# Values of k swept by the animation: fine steps at low k, coarser steps afterwards.
# NOTE(review): range(20, 110, 10) ends at 100 and range(100, 1100, 100) starts at 100,
# so k=100 appears twice.
# NOTE(review): the SVD was computed with full_matrices=False, so only vt.shape[0]
# singular vectors exist; vt.shape[1] is the number of columns of tf_idf — confirm that
# the largest k values are meaningful for plot_docs.
k_values = [2, 3, 4, 5, 10] + list(range(20, 110, 10)) + list(range(100, 1100, 100)) + [2000, 5000, 10000, vt.shape[1]]
animate_k('out/docs_animation.mp4', k_values, plot_docs,
          vt=vt, s=s, dimensions=(0,1), normalize=True, scatter_kw={'s': 1, 'alpha': 0.3})

In [None]:
# Same sweep for LSA dimensions (1,2); k starts at 3 here.
# NOTE(review): k=100 appears twice, and vt.shape[1] is the number of columns of tf_idf,
# not the number of singular vectors (vt.shape[0]) — confirm.
k_values = [3, 4, 5, 10] + list(range(20, 110, 10)) + list(range(100, 1100, 100)) + [2000, 5000, 10000, vt.shape[1]]
animate_k('out/docs_animation_2.mp4', k_values, plot_docs,
          vt=vt, s=s, dimensions=(1,2), normalize=True, scatter_kw={'s': 1, 'alpha': 0.3})

Looking at the animations, the best spread of the data points occurs at around $k=100$.

Therefore, from now on, for the docs this value for k will be chosen.

Here is a visualization of the unnormalized LSA vectors (left), and the normalized k-rank approximated ones for LSA dimensions (0,1) (right).

In [None]:
# 1280x640 px figure with two side-by-side axes
w, h, dpi = 640*2, 640, 100
fig, axs = plt.subplots(ncols=2, figsize=(w/dpi, h/dpi), dpi=dpi)

# a random subsample is shown, because with all the points my PC does not handle the rendering very well
# left: normalize=False, right: normalize=True (both with k=100, LSA dimensions (0,1))
plot_docs(vt, s, (0,1), k=100, normalize=False, ax=axs[0], scatter_kw={'s': 1, 'alpha': 0.3}, subsample_size=1000)
plot_docs(vt, s, (0,1), k=100, normalize=True,  ax=axs[1], scatter_kw={'s': 1, 'alpha': 0.3}, subsample_size=1000)

plt.show()

And for LSA dimensions (1,2)

In [None]:
# 1280x640 px figure with two side-by-side axes
w, h, dpi = 640*2, 640, 100
fig, axs = plt.subplots(ncols=2, figsize=(w/dpi, h/dpi), dpi=dpi)

# left: normalize=False, right: normalize=True (both with k=100, LSA dimensions (1,2))
plot_docs(vt, s, (1,2), k=100, normalize=False, ax=axs[0], scatter_kw={'s': 1, 'alpha': 0.3}, subsample_size=1000)
plot_docs(vt, s, (1,2), k=100, normalize=True,  ax=axs[1], scatter_kw={'s': 1, 'alpha': 0.3}, subsample_size=1000)

plt.show()

### Display some specific books

Show only some specific books. The commented cells were used to check if the Title I want to show was present

In [None]:
# t = "A Farewell to Arms"
# df.loc[df['title'].map(lambda s: s.lower()).str.contains(t.lower())]

In [None]:
# df.loc[df['title'].str.lower() == t.lower()]

In [None]:
# df.loc[df['author'].str.lower() == 'Ernest Hemingway'.lower()]

In [None]:
titles = ["Hercule Poirot's Christmas", "Murder on the Orient express",
          "Nightfall", "Robots and Empire", "Foundation", "Second Foundation",
          "Harry potter and the Philosopher's stone",
          "The fellowship of the Ring", "The Two Towers", "The Return of the King", "The Hobbit",
          "The da Vinci Code", "Angels and Demons",
          "Pride and Prejudice",
          "The Shining",
          "Moby-Dick; or, The Whale", "A Farewell to Arms"]

# Match titles case-insensitively: several entries above are not capitalised
# consistently, so an exact comparison would silently skip those books.
titles_lower = {title.lower() for title in titles}
is_selected = (df['title'].str.lower().isin(titles_lower)
               & df['author'].notna()
               & df['genres'].notna())

# index labels of the selected books that have both an author and genres
doc_subset = list(df[is_selected].index)

In [None]:
# 1280x640 px figure at 70 dpi with two side-by-side axes
w, h, dpi = 640*2, 640, 70
fig, axs = plt.subplots(ncols=2, figsize=(w/dpi, h/dpi), dpi=dpi)

# only the books in doc_subset are drawn, labelled with their titles
plot_docs(vt, s, (0,1), k=100, normalize=False, ax=axs[0], subset=doc_subset, labels=df['title'].to_numpy())
plot_docs(vt, s, (0,1), k=100, normalize=True,  ax=axs[1], subset=doc_subset, labels=df['title'].to_numpy())

# hand-picked axis limits for the unnormalized (left) plot
axs[0].set_xlim(-80,0)
axs[0].set_ylim(-50, 30)

# hand-picked axis limits for the normalized (right) plot
axs[1].set_xlim(-1.02,0)
axs[1].set_ylim(-0.51,0.51)

plt.show()

## LSA space of words

Analogously to the documents, let's now look at the similarities between words.

In [None]:
from utils.visualization import plot_words

Tuning of the approximation level k.

In [None]:
# Values of k swept by the word animations, up to the number of columns of u.
# NOTE(review): range(20, 110, 10) ends at 100 and range(100, 1100, 100) starts at 100,
# so k=100 appears twice.
k_values = [2, 3, 4, 5, 10] + list(range(20, 110, 10)) + list(range(100, 1100, 100)) + [1500, 2000, u.shape[1]]
animate_k('out/words_animation.mp4', k_values, plot_words,
          u=u, s=s, dimensions=(0,1), normalize=True, scatter_kw={'s': 1, 'alpha': 0.3})

# dimensions (1,2): the first k value (2) is skipped
animate_k('out/words_animation_2.mp4', k_values[1:], plot_words,
          u=u, s=s, dimensions=(1,2), normalize=True, scatter_kw={'s': 1, 'alpha': 0.3})

Here, some specific words are chosen. I have chosen 4 different categories of words. As it turns out, using small values of k results in the best visualization for the clustering of these specific words.

In [None]:
words_of_interest = ['love', 'marriage', 'parent', 'school',
                     'dark', 'fight', 'criminal',
                     'alien', 'spaceship', 'planet',
                     'car', 'truck', 'bus', 'train']

# Collect the vocabulary positions of the words of interest; a word that is
# not in the vocabulary contributes nothing.
voc = np.array(vocabulary)
positions = np.arange(voc.shape[0])
words_subset = []
for word_of_interest in words_of_interest:
    words_subset.extend(positions[voc == word_of_interest])

In [None]:
# Same k sweep as for the full word cloud, restricted to the words of interest.
# NOTE(review): k=100 appears twice in k_values (end of one range, start of the next).
k_values = [2, 3, 4, 5, 10] + list(range(20, 110, 10)) + list(range(100, 1100, 100)) + [1500, 2000, u.shape[1]]
animate_k('out/words_animation_woi.mp4', k_values, plot_words,
          u=u, s=s, labels=vocabulary, subset=words_subset)

# dimensions (1,2): the first k value (2) is skipped
animate_k('out/words_animation_woi_2.mp4', k_values[1:], plot_words,
          u=u, s=s, dimensions=(1,2), labels=vocabulary, subset=words_subset)

Example with $k=3$

In [None]:
# approximation level used for both panels
k=3

# 1280x640 px figure with two side-by-side axes
w, h, dpi = 640*2, 640, 100
fig, axs = plt.subplots(ncols=2, figsize=(w/dpi, h/dpi), dpi=dpi)

# left: LSA dimensions (0,1); right: LSA dimensions (1,2)
plot_words(u, s, labels=vocabulary, ax=axs[0], subset=words_subset, k=k, dimensions=(0,1))
plot_words(u, s, labels=vocabulary, ax=axs[1], subset=words_subset, k=k, dimensions=(1,2))
# hand-picked axis limits for each panel
axs[0].set_xlim(-1.02,0)
axs[0].set_ylim(-0.51,0.51)

axs[1].set_xlim(-1.02,1.02)
axs[1].set_ylim(-1.02,1.02)

plt.show()

In [None]:
# check whether 'drama' occurs among the genre values (lists flattened with explode)
'drama' in df['genres'].explode().unique()

In [None]:
# Boolean Series: True for books tagged with at least two of the listed genres;
# books without genres (NaN) are mapped to False.
genres = ['science fiction', 'fantasy', 'drama']
df['genres'].map(lambda s: len(set(genres).intersection(s))>=2, na_action='ignore').map(lambda s: False if pd.isna(s) else s)

In [None]:
from utils.visualization import plot_genres_analysis
# Genre plot for science fiction vs fantasy with k=100 and normalized vectors,
# also passing the word 'spaceship'; see utils.visualization for the exact rendering.
plot_genres_analysis(vt, s, df, genres=['science fiction', 'fantasy'], normalize=True, k=100, u=u, voc=vocabulary, words=['spaceship'],
                     plot_most_relevant_words=True)

## Curvature of the singular values

In [None]:
def diff_function(x):
    """Forward difference ``x[i+1] - x[i]`` along the first axis (one element shorter than `x`)."""
    return x[1:] - x[:-1]

def curvature_function(s):
    """Discrete signed curvature ``s'' / (1 + s'**2)**1.5`` of a sequence sampled at unit spacing.

    Parameters
    ----------
    s : array-like
        One-dimensional sequence of values (here: the singular values).

    Returns
    -------
    numpy.ndarray
        Curvature at the interior points ``s[1:-1]``; two elements shorter than
        `s` (empty when `s` has fewer than three elements).
    """
    s = np.asarray(s, dtype=float)
    second = diff_function(diff_function(s))
    # Central first difference, centred on the same points s[1:-1] as `second`.
    # The denominator of the curvature formula needs the first derivative, not
    # the second difference that was used before.
    first = (s[2:] - s[:-2]) / 2
    return second/((1+np.square(first))**1.5)

# 640x360 px figure
w, h, dpi = 640, 360, 100
fig, ax = plt.subplots(figsize=(w/dpi, h/dpi), dpi=dpi)

# only the first 30 curvature values are shown
ax.plot(curvature_function(s)[:30], 'o-')
ax.grid()
fig.suptitle('Curvature function')

plt.show()

In [None]:
# Keep the first k singular triplets.
# NOTE(review): k = 13 was presumably read off the curvature plot above — confirm.
k = 13
u_k = u[:, :k]
s_k = s[:k]
vt_k = vt[:k]

# rank-k reconstruction of the tf-idf matrix
tf_idf_k = u_k@np.diag(s_k)@vt_k

In [None]:
from utils.similarities import compute_most_similar_movies_lsa
from utils.similarities import compute_cos_similarities

In [None]:
# column 0 of vt_k scaled by the singular values: the k-dimensional vector of the first document
s_k*vt_k[:,0]

In [None]:
# Scale every row of vt_k by its singular value; column j is then the k-dimensional
# vector of document j. np.newaxis replaces np.reshape(..., newshape=...), whose
# `newshape` keyword is deprecated in recent NumPy releases.
movies_vectors = s_k[:, np.newaxis]*vt_k


In [None]:
# sanity check of the array dimensions
movies_vectors.shape

In [None]:
# compare the vector of the first document with all document vectors
# (see utils.similarities for the exact definition)
compute_cos_similarities(s_k*vt_k[:,0], movies_vectors)

In [None]:
# rows whose title equals 'the plague', ignoring case
movie_row = df[df['title'].map(lambda t: 'the plague'.lower()==t.lower())]
movie_row

Low-rank approximation

In [None]:
def compute_term_similarities(terms_docs):
    """Cosine similarity between every pair of rows (terms) of `terms_docs`.

    Rows with zero norm get similarity 0 with everything, and the diagonal
    (self-similarity) is set to 0. Returns a square array with one row and
    one column per row of `terms_docs`.
    """
    row_norms = np.linalg.norm(terms_docs, axis=1)
    # divide on the transposed view so the norms broadcast over rows
    unit_rows = (terms_docs.T/row_norms).T
    # 0/0 from all-zero rows yields NaN; treat those rows as zero vectors
    unit_rows[np.isnan(unit_rows)] = 0

    cosine = unit_rows@unit_rows.T
    np.fill_diagonal(cosine, 0)
    return cosine

def compute_doc_similarities(terms_docs):
    """Cosine similarity between every pair of columns (documents) of `terms_docs`.

    Columns with zero norm get similarity 0 with everything, and the diagonal
    (self-similarity) is set to 0. Returns a square array with one row and
    one column per column of `terms_docs`.
    """
    col_norms = np.linalg.norm(terms_docs, axis=0)
    unit_cols = terms_docs/col_norms
    # 0/0 from all-zero columns yields NaN; treat those columns as zero vectors
    unit_cols[np.isnan(unit_cols)] = 0

    cosine = unit_cols.T@unit_cols
    np.fill_diagonal(cosine, 0)
    return cosine

In [None]:
# pairwise similarities computed on the rank-k approximation of the tf-idf matrix
# NOTE(review): doc_similarities is (n_documents x n_documents) — check memory use for the full dataset
term_similarities = cache('term_similarities.pck', compute_term_similarities, tf_idf_k)
doc_similarities = cache('doc_similarities.pck', compute_doc_similarities, tf_idf_k)

In [None]:
# Position of the document most similar to document 0. The empty subscript
# `doc_similarities[]` was a SyntaxError; row 0 matches d1 = 0 used in the next cell.
doc_similarities[0].argmax()

In [None]:
# print the titles of two documents and show their similarity
d1 = 0
d2 = 813
print(df['title'][d1])
print(df['title'][d2])
doc_similarities[d1,d2]

In [None]:
# title of the document with index label 2
df['title'][2]

In [None]:
# title of the document with index label 9296
df['title'][9296]

In [None]:
# books whose title contains 'fahrenheit', ignoring case
df[df['title'].str.lower().str.contains('fahrenheit')]

In [None]:
# books whose title contains 'foundation', ignoring case
df[df['title'].str.lower().str.contains('foundation')]

In [None]:
# full plot summary of the document with index label 9296
df['summary'][9296]