# Part 2: Building a recommendation system

In this analysis we will build a recommendation engine based on the IMDB actor biographies extracted in part 1. This notebook shows one possible solution to part 2 of the tutorial.

In [1]:
# Text processing
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from joel_tools import SynonymBuilder

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

# For general data manipulations
import numpy as np
import pandas as pd

# For importing the biography data
import json

# Load the spaCy 'en' model once, up front - spacy_tokenizer (below) calls
# this shared PARSER on every piece of text it is given
PARSER = spacy.load('en')

## Extracting a good vocabulary

In [2]:
def spacy_tokenizer(sentence):
    """Split a piece of text into a list of normalised string tokens.

    The text is parsed with the module-level ``PARSER``. Every named entity
    is merged into a single token, then tokens are filtered and normalised.

    Parameters
    ----------
    sentence : str
        The raw text to tokenise.

    Returns
    -------
    list of str
        Lower-cased tokens. Ordinary words are lemmatised; entities (and
        tokens whose lemma is the "-PRON-" placeholder) keep their surface
        form. A leading article ("a", "an", "the") is removed from
        multi-word tokens.
    """
    # Initial parsing
    tokens = PARSER(sentence)

    # Merge each entity into a single token, and take a note of its text.
    # The text is recorded before merging so it never depends on the state
    # of the span after the document has been re-tokenised.
    entities = set()
    for entity in tokens.ents:
        entities.add(entity.text)
        entity.merge()

    # Only accept tokens which are made of letters (or are merged entities),
    # and are not punctuation, stop words, or entities labelled as people
    # (PERSON), geopolitical entities (GPE) or dates (DATE)
    tokens = [t for t in tokens
              if (t.is_alpha or t.text in entities)
              and not (t.is_punct or t.is_stop
                       or t.ent_type_ in ["PERSON", "GPE", "DATE"])]

    # Lemmatise, lowercase and strip ordinary words; entities and "-PRON-"
    # lemmas fall back to the lower-cased surface text instead
    tokens = [t.lemma_.lower().strip()
              if (t.lemma_ != "-PRON-" and
                  t.text not in entities)
              else t.lower_ for t in tokens]

    # Finally, entities starting with articles are overkill for this analysis
    # so strip off the leading article to maximise term counts
    for start in ["a", "an", "the"]:
        n = len(start) + 1
        tokens = [t[n:] if t.startswith(f"{start} ")
                  else t for t in tokens]
    return tokens


def clean_vocabulary(bios):
    """Build the analysis vocabulary from a mapping of name -> biography text.

    Every biography is tokenised with ``spacy_tokenizer`` and the tokens are
    pooled. A ``SynonymBuilder`` is then fitted on the pooled tokens and
    applied to both the tokens and the biography texts; the intent is to
    merge equivalent terms so that token counts go up. Finally, every word
    appearing in any of the names (the keys of ``bios``) is dropped from the
    vocabulary.

    Returns a ``(vocab, texts)`` pair: ``vocab`` is a set of terms and
    ``texts`` is whatever ``SynonymBuilder.transform`` returns for the
    biography texts.
    """
    actor_names = bios.keys()
    texts = bios.values()

    # Pool the tokens of every biography into one flat list
    raw_tokens = []
    for biography in texts:
        raw_tokens.extend(spacy_tokenizer(biography))

    # Fit the synonym builder on the tokens, then apply it to the texts too
    synonyms = SynonymBuilder()
    merged_tokens = synonyms.fit_transform(raw_tokens)
    texts = synonyms.transform(texts)

    # Collect each lower-cased word of each name and remove them in one go
    name_words = set()
    for actor in actor_names:
        name_words.update(actor.lower().split())
    return set(merged_tokens) - name_words, texts

## The "main" code

In [None]:
# Open the data from Part 1. The encoding is given explicitly so the JSON is
# decoded the same way on every platform instead of with the locale default.
with open("data/bios.json", encoding="utf-8") as f:
    bios = json.load(f)

# Generate a clean vocabulary and convert the text to TF-IDF data.
# NOTE(review): scikit-learn documents min_df / max_df as ignored whenever an
# explicit vocabulary is supplied, so the columns are exactly clean_vocab and
# no document-frequency filtering takes place here.
clean_vocab, texts = clean_vocabulary(bios)
cv = TfidfVectorizer(tokenizer=spacy_tokenizer,
                     vocabulary=clean_vocab, min_df=5, max_df=0.95)
data = cv.fit_transform(texts)
data.shape

We have many more columns of data than we have rows! There is no way this will give a good result (it's good to think why that is). We therefore reduce the data size using PCA.

In [None]:
# Keep just enough principal components to explain 75% of the variance
# (a float n_components between 0 and 1 is a variance fraction, not a
# fraction of the original number of columns)
pca = PCA(0.75)
# PCA needs dense input; .toarray() yields a plain ndarray, whereas
# .todense() yields an np.matrix, which recent scikit-learn versions reject
_data = pca.fit_transform(data.toarray())
_data.shape

That's much better, now we can build the similarity matrix, and extract the most similar actor for each row.

In [None]:
sims = cosine_similarity(_data)
# Don't compare actors to themselves! Mask the diagonal with -inf rather than
# subtracting 1: cosine similarities can be negative, so a diagonal of ~0
# could still be the row maximum and pair an actor with themselves.
np.fill_diagonal(sims, -np.inf)

# Row i of sims belongs to the i-th key of bios
names = list(bios.keys())
most_similar = []
for name, row in zip(names, sims):
    # argmax returns the first position of the exact maximum
    best = int(row.argmax())
    score = row[best]
    if not np.isfinite(score):
        # Only the masked diagonal is available (a single actor): no match
        continue
    most_similar.append(dict(name=name, most_similar=names[best], score=score))

Putting the data into a pandas DataFrame will make reading the data much easier. What do you think of the results? Pretty good I'd say! Now imagine applying this routine to any data you have. Could you think of practical uses?

In [9]:
# Tabulate the matches, strongest similarity first, and preview the top 20
df_sim = (
    pd.DataFrame(most_similar, columns=["name", "most_similar", "score"])
    .sort_values(by="score", ascending=False)
)
df_sim.head(20)

Unnamed: 0,name,most_similar,score
940,Johnny Knoxville,Bonnie Hunt,0.789569
781,Bonnie Hunt,Johnny Knoxville,0.789569
729,Carrie Fisher,Mark Hamill,0.777845
912,Mark Hamill,Carrie Fisher,0.777845
902,Martin Lawrence,Johnny Knoxville,0.776004
982,Craig Robinson,Danny McBride,0.729525
974,Danny McBride,Craig Robinson,0.729525
248,Emma Watson,Daniel Radcliffe,0.710853
419,Daniel Radcliffe,Emma Watson,0.710853
473,William Shatner,Patrick Stewart,0.685567
