In [2]:
# Setup environment
from IPython.display import display
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import spacy
from gensim.models.word2vec import Word2Vec
from collections import Counter
from string import punctuation
from ast import literal_eval


In [3]:
movies = pd.read_csv('data/cleaned_movies.csv', converters={"Genre": literal_eval})
display(movies)

Unnamed: 0,Series_Title,Released_Year,Certificate,Genre,IMDB_Rating,Overview,Director,Star1,Star2,Star3,Star4
0,The Shawshank Redemption,1994,Adult,[Drama],9.3,Two imprisoned men bond over a number of years...,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler
1,The Godfather,1972,Adult,"[Crime, Drama]",9.2,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton
2,The Dark Knight,2008,Adult,"[Action, Crime, Drama]",9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine
3,The Godfather: Part II,1974,Adult,"[Crime, Drama]",9.0,The early life and career of Vito Corleone in ...,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton
4,12 Angry Men,1957,Family,"[Crime, Drama]",9.0,A jury holdout attempts to prevent a miscarria...,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler
...,...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,Adult,"[Comedy, Drama, Romance]",7.6,A young New York socialite becomes interested ...,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen
996,Giant,1956,,"[Drama, Western]",7.6,Sprawling epic covering the life of a Texas ca...,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker
997,From Here to Eternity,1953,Adult,"[Drama, Romance, War]",7.6,"In Hawaii in 1941, a private is cruelly punish...",Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed
998,Lifeboat,1944,Adult,"[Drama, War]",7.6,Several survivors of a torpedoed merchant ship...,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix


Recommendation Engine Preprocessing

In [4]:
# Load spacy model for keyword recognition
nlp = spacy.load("en_core_web_sm")

In [5]:
# Create function that grabs all keywords that are proper nouns, adjectives, or nouns
def get_keywords(text):
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN']
    doc = nlp(text.lower())
    for token in doc:
        if token.text in nlp.Defaults.stop_words or token.text in punctuation:
            continue
        if token.pos_ in pos_tag:
            result.append(token.text)
    
    # return list of unique keywords
    return set(result)

In [6]:
# Create Keywords column in movies df
movies['Keywords'] = movies.Overview.apply(lambda x: get_keywords(x))

In [7]:
display(movies)

Unnamed: 0,Series_Title,Released_Year,Certificate,Genre,IMDB_Rating,Overview,Director,Star1,Star2,Star3,Star4,Keywords
0,The Shawshank Redemption,1994,Adult,[Drama],9.3,Two imprisoned men bond over a number of years...,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,"{decency, men, number, acts, common, solace, y..."
1,The Godfather,1972,Adult,"[Crime, Drama]",9.2,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,"{transfers, control, son, reluctant, dynasty, ..."
2,The Dark Knight,2008,Adult,"[Action, Crime, Drama]",9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,"{psychological, menace, gotham, tests, batman,..."
3,The Godfather: Part II,1974,Adult,"[Crime, Drama]",9.0,The early life and career of Vito Corleone in ...,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,"{corleone, career, vito, york, city, new, mich..."
4,12 Angry Men,1957,Family,"[Crime, Drama]",9.0,A jury holdout attempts to prevent a miscarria...,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,"{attempts, evidence, holdout, colleagues, jury..."
...,...,...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,Adult,"[Comedy, Drama, Romance]",7.6,A young New York socialite becomes interested ...,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,"{young, interested, man, socialite, way, york,..."
996,Giant,1956,,"[Drama, Western]",7.6,Sprawling epic covering the life of a Texas ca...,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,"{texas, cattle, epic, family, associates, life}"
997,From Here to Eternity,1953,Adult,"[Drama, Romance, War]",7.6,"In Hawaii in 1941, a private is cruelly punish...",Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,"{command, love, team, wife, private, captain, ..."
998,Lifeboat,1944,Adult,"[Drama, War]",7.6,Several survivors of a torpedoed merchant ship...,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,"{ii, crew, war, world, lifeboat, members, u, t..."


In [8]:
# Create function to clean feature columns for rec model
def clean_data(x):
    if isinstance(x, list):
        return " ".join([str.lower(word) for word in x])
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return " "

In [9]:
# Apply clean data function to transform the feature columns
features = ['Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'Keywords']
for feature in features:
    movies[feature] = movies[feature].apply(clean_data)

In [10]:
display(movies)

Unnamed: 0,Series_Title,Released_Year,Certificate,Genre,IMDB_Rating,Overview,Director,Star1,Star2,Star3,Star4,Keywords
0,The Shawshank Redemption,1994,Adult,drama,9.3,Two imprisoned men bond over a number of years...,frankdarabont,timrobbins,morganfreeman,bobgunton,williamsadler,
1,The Godfather,1972,Adult,crime drama,9.2,An organized crime dynasty's aging patriarch t...,francisfordcoppola,marlonbrando,alpacino,jamescaan,dianekeaton,
2,The Dark Knight,2008,Adult,action crime drama,9.0,When the menace known as the Joker wreaks havo...,christophernolan,christianbale,heathledger,aaroneckhart,michaelcaine,
3,The Godfather: Part II,1974,Adult,crime drama,9.0,The early life and career of Vito Corleone in ...,francisfordcoppola,alpacino,robertdeniro,robertduvall,dianekeaton,
4,12 Angry Men,1957,Family,crime drama,9.0,A jury holdout attempts to prevent a miscarria...,sidneylumet,henryfonda,leej.cobb,martinbalsam,johnfiedler,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,Adult,comedy drama romance,7.6,A young New York socialite becomes interested ...,blakeedwards,audreyhepburn,georgepeppard,patricianeal,buddyebsen,
996,Giant,1956,,drama western,7.6,Sprawling epic covering the life of a Texas ca...,georgestevens,elizabethtaylor,rockhudson,jamesdean,carrollbaker,
997,From Here to Eternity,1953,Adult,drama romance war,7.6,"In Hawaii in 1941, a private is cruelly punish...",fredzinnemann,burtlancaster,montgomeryclift,deborahkerr,donnareed,
998,Lifeboat,1944,Adult,drama war,7.6,Several survivors of a torpedoed merchant ship...,alfredhitchcock,tallulahbankhead,johnhodiak,walterslezak,williambendix,


In [11]:
features = ['Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'Keywords']

In [12]:
# Create function for word soup column
def create_soup(x):
    return x.Genre + ' ' + x.Director + ' ' + x.Star1 + ' ' + x.Star2 + ' ' + x.Star3 + ' ' + x.Star4 + ' ' + x.Keywords

In [13]:
# Create soup column in movies df
movies['soup'] = movies.apply(create_soup, axis=1)

In [14]:
display(movies.soup[0])

'drama frankdarabont timrobbins morganfreeman bobgunton williamsadler  '

In [16]:
# Save df to data directory
movies.to_csv(path_or_buf='data/preprocessed_movies.csv', index=False)