## Used this notebook mainly for cleaning and reducing the amount of data I had to use

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the raw movie table. Later cells expect at least the columns title, genres,
# credits, keywords, overview, popularity and poster_path to be present.
df = pd.read_csv("movies.csv")

In [3]:
# Discard columns that no later cell in this notebook refers to.
unused_columns = [
    "release_date",
    "budget",
    "revenue",
    "runtime",
    "status",
    "tagline",
    "backdrop_path",
]
df = df.drop(columns=unused_columns)

In [4]:
# Remove rows whose genres value is exactly "Documentary", then rows that are
# missing any of the text fields listed below.
# NOTE(review): genres appear to be hyphen-joined strings (e.g. "Comedy-Drama"),
# so a row such as "Documentary-Music" is kept by this exact match - confirm intent.
is_pure_documentary = df["genres"] == "Documentary"
required_columns = [
    "production_companies",
    "genres",
    "credits",
    "keywords",
    "overview",
    "recommendations",
]
df = df[~is_pure_documentary].dropna(subset=required_columns)

In [5]:
df.head()

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,vote_average,vote_count,credits,keywords,poster_path,recommendations
0,631842,Knock at the Cabin,Horror-Mystery-Thriller,en,While vacationing at a remote cabin a young gi...,3422.537,Blinding Edge Pictures-Universal Pictures-Film...,6.457,888.0,Dave Bautista-Jonathan Groff-Ben Aldridge-Kris...,based on novel or book-sacrifice-cabin-faith-e...,/dm06L9pxDOL9jNSK4Cb6y139rrG.jpg,1058949-646389-772515-505642-143970-667216-104...
1,646389,Plane,Action-Adventure-Thriller,en,After a heroic job of successfully landing his...,2618.646,MadRiver Pictures-Di Bonaventura Pictures-G-BA...,6.901,785.0,Gerard Butler-Mike Colter-Yoson An-Tony Goldwy...,pilot-airplane-philippines-held hostage-plane ...,/qi9r5xBgcc9KTxlOLjssEbDgO0J.jpg,505642-758769-864692-631842-1058949-925943-758...
2,505642,Black Panther: Wakanda Forever,Action-Adventure-Science Fiction,en,Queen Ramonda Shuri M’Baku Okoye and the Dora ...,2525.408,Marvel Studios,7.338,3922.0,Letitia Wright-Lupita Nyong'o-Danai Gurira-Win...,loss of loved one-hero-sequel-superhero-based ...,/sv1xJUazXeYqALzczSZ3O6nkH75.jpg,436270-829280-76600-56969-312634-1037858-238-5...
4,315162,Puss in Boots: The Last Wish,Animation-Adventure-Comedy-Family,en,Puss in Boots discovers that his passion for a...,2078.28,DreamWorks Animation-Universal Pictures,8.363,4671.0,Antonio Banderas-Salma Hayek-Harvey Guillén-Wa...,fairy tale-talking dog-spin off-aftercreditsst...,/kuf6dutpsT0vSVehic3EZIqkOBt.jpg,1033456-1011679-505642-1058949-100088-536554-1...
7,937278,A Man Called Otto,Comedy-Drama,en,When a lively young family moves in next door ...,1545.382,Playtone-SF Studios-2DUX²-Stage 6 Films-Artist...,7.811,540.0,Tom Hanks-Mariana Treviño-Rachel Keller-Manuel...,based on novel or book-suicide attempt-remake-...,/130H1gap9lFfiTF9iDrqNIkFvC9.jpg,82856-100088-906221-18123-10431-1077280-587092...


In [6]:
df.isnull().sum()

id                       0
title                    0
genres                   0
original_language        0
overview                 0
popularity               0
production_companies     0
vote_average             0
vote_count               0
credits                  0
keywords                 0
poster_path             40
recommendations          0
dtype: int64

In [7]:
len(df)

22770

In [8]:
# Keep only the movies whose popularity is strictly above the median.
# The median is computed once, before filtering, and reused for the mask.
popularity_median = df["popularity"].median()
df = df[df["popularity"] > popularity_median]

## writing pre-processed DataFrame to csv 

In [9]:
# Persist the filtered rows. to_csv is called with its default index handling, so the
# DataFrame index is written as an unnamed first column.
# NOTE(review): a later cell writes "re-movies.csv" again, replacing this file.
df.to_csv("re-movies.csv")

## calculating pair-wise cosine similarity between overviews of movies

In [10]:
# Unigram + bigram TF-IDF over the overviews, with English stop words removed.
# min_df=1 keeps every term, exactly as the previous min_df=0 did, but an integer 0
# is rejected by parameter validation in recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['overview'])

In [11]:
tfidf_matrix.shape

(11384, 263862)

In [12]:
# Print the dtype, downcast the matrix to 32-bit floats (4 bytes per stored value
# instead of 8), then print the dtype again to confirm the cast took effect.
print(tfidf_matrix.dtype)
tfidf_matrix = tfidf_matrix.astype("float32")
print(tfidf_matrix.dtype)

float64
float32


In [13]:
# Dot product of every overview vector with every other overview vector.
# The vectorizer above does not override `norm`, so its rows use the library's
# default L2 normalisation and the dot product is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [14]:
cosine_sim.shape

(11384, 11384)

In [15]:
# Build positional lookup tables on a copy with a fresh 0..n-1 index:
#   titles  : position -> title
#   indices : title    -> position (a duplicated title maps to several positions)
newdf = df.reset_index()
titles = newdf["title"]
indices = pd.Series(range(len(newdf)), index=titles)

#### This cell only tests whether writing the cosine similarities to a CSV and reading them back later would be faster than recomputing them when the Flask server initialises. pandas takes too long to read the file, so I dropped the idea.

In [16]:
# cos_sim_df = pd.DataFrame(cosine_sim, index=newdf["title"], columns=newdf["title"])
# cos_sim_df.to_csv("overview_similarities.csv")

In [17]:
def get_recommendations(title, cosine=None, top_n=30):
    """Return the movies most similar to `title`, best match first.

    Parameters
    ----------
    title : str
        Label looked up in the global `indices` Series (title -> row position).
    cosine : 2-D array-like, optional
        Pairwise similarity matrix indexed by row position. When omitted, the
        global `cosine_sim` is used.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    list of [title, score]
        At most `top_n` pairs ordered by decreasing score; equal scores keep
        their row order. The queried row itself is never included.

    Raises
    ------
    KeyError
        If `title` is not a label of `indices`.
    """
    if cosine is None:
        cosine = cosine_sim

    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several rows share this title: the lookup yields a Series of positions.
        # Use the first one instead of failing later on a 2-D score slice.
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine[idx]).ravel()
    # Stable sort on the negated scores gives descending order with ties kept in
    # row order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the query by position rather than assuming it ranks first: a row
    # with an identical score (e.g. a duplicate overview) may precede it.
    ranked = ranked[ranked != idx][:top_n]
    return [[titles.iloc[pos], scores[pos]] for pos in ranked]

In [18]:
get_recommendations('Shrek')[:10]

[['Shrek 2', 0.10541846],
 ["Shrek's Thrilling Tales", 0.08867023],
 ['Shrek the Musical', 0.08815835],
 ['Shrek in the Swamp Karaoke Dance Party', 0.080541484],
 ['The Ghost of Lord Farquaad', 0.0796093],
 ['Shrek the Third', 0.0725713],
 ['Scared Shrekless', 0.057920665],
 ['Shanghai Noon', 0.051971093],
 ['Dreamworks Holiday Classics', 0.051124997],
 ['Shrek Forever After', 0.049867496]]

## making a "soup" out of genres, cast and keywords to get similar movies


In [19]:
# Peek at the first five credited names for one known title to check the split.
dark_knight_rows = df["title"] == "The Dark Knight"
x = df.loc[dark_knight_rows, "credits"].str.split("-")
x.iloc[0][:5]

['Christian Bale',
 'Heath Ledger',
 'Michael Caine',
 'Gary Oldman',
 'Aaron Eckhart']

In [20]:
# Split the hyphen-joined credits string and keep at most the first five names.
# NOTE(review): splitting on "-" would also break names that contain a hyphen - verify.
df["cast"] = df["credits"].map(lambda credit_line: credit_line.split("-")[:5])

In [21]:
# Collapse each name into a single lowercase token with its spaces removed.
df["cast"] = df["cast"].map(
    lambda members: [member.replace(" ", "").lower() for member in members]
)

In [22]:
# Turn the hyphen-joined keyword string into a list of keywords.
# Not re-runnable as is: a second run would call .split on lists.
df["keywords"] = df["keywords"].map(lambda raw: raw.split("-"))

In [23]:
# Count how many times each keyword occurs across all movies and keep only the
# keywords seen more than once. `s` is indexed by keyword, so `kw in s` tests
# whether a keyword survived the cut.
# explode() flattens the per-row lists directly; the previous row-wise
# df.apply(pd.Series) built one Series per movie plus a wide NaN-padded frame.
s = df["keywords"].explode().value_counts()
s = s[s > 1]

In [24]:
def filterKeywords(x):
    """Return the items of `x`, in order, that are labels of the global Series `s`.

    `in` on a pandas Series tests its index labels, not its values.
    """
    kept = []
    for keyword in x:
        if keyword in s:
            kept.append(keyword)
    return kept

In [25]:
filterKeywords(["murder", "splash", "woman director", "pranav"])

['murder', 'woman director']

In [26]:
# Drop keywords that filterKeywords rejects, then squash each remaining keyword
# into a single lowercase token with its spaces removed.
df["keywords"] = df["keywords"].map(
    lambda kws: [kw.replace(" ", "").lower() for kw in filterKeywords(kws)]
)

In [27]:
# Turn the hyphen-joined genres string into a list of genre names.
df["genres"] = df["genres"].map(lambda raw: raw.split("-"))


In [28]:
# Concatenate the three token lists of each movie (keywords, cast, genres) and join
# them into one space-separated string.
soup_tokens = df["keywords"] + df["cast"] + df["genres"]
df["soup"] = soup_tokens.map(" ".join)

In [29]:
df["soup"]

0        basedonnovelorbook sacrifice cabin faith endof...
1        pilot airplane philippines heldhostage planecr...
2        lossoflovedone hero sequel superhero basedonco...
4        fairytale talkingdog spinoff aftercreditssting...
7        basedonnovelorbook suicideattempt remake misca...
                               ...                        
16768    martialarts vampire vampirehunter(slayer) pete...
16770    japan samurai mutant worldwari superhero based...
16772    baseball sports johnnysimmons ethanhawke paulg...
16777    musical violinist davidgarrett joelyrichardson...
16778    writing popstar musician superbowl ladygaga bo...
Name: soup, Length: 11384, dtype: object

In [41]:
# Overwrite "re-movies.csv" with the frame that now also carries the cast and soup columns.
# keywords, cast and genres hold Python lists at this point, so the CSV stores their string form.
# NOTE(review): this cell's execution count (41) is out of sequence with its neighbours
# (29 and 30) - confirm the notebook still reproduces under Restart & Run All.
df.to_csv("re-movies.csv")

In [30]:
# Unigram + bigram term counts over the "soup" strings, with English stop words removed.
# min_df=1 keeps every term, exactly as the previous min_df=0 did, but an integer 0
# is rejected by parameter validation in recent scikit-learn releases.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [31]:
def get_recommendations(title, cosine, top_n=30):
    """Return the movies most similar to `title` according to `cosine`.

    Parameters
    ----------
    title : str
        Label looked up in the global `indices` Series (title -> row position).
    cosine : 2-D array-like
        Pairwise similarity matrix indexed by row position.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    list of [title, score]
        At most `top_n` pairs ordered by decreasing score; equal scores keep
        their row order. The queried row itself is never included.

    Raises
    ------
    KeyError
        If `title` is not a label of `indices`.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several rows share this title: the lookup yields a Series of positions.
        # Use the first one instead of failing later on a 2-D score slice.
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine[idx]).ravel()
    # Stable sort on the negated scores gives descending order with ties kept in
    # row order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the query by position rather than assuming it ranks first: a row
    # with an identical score may precede it.
    ranked = ranked[ranked != idx][:top_n]
    return [[titles.iloc[pos], scores[pos]] for pos in ranked]

In [32]:
# Pairwise cosine similarity between the count vectors of all "soup" strings.
# cosine_similarity is used here (rather than a plain dot product) because the
# count vectors themselves are not normalised.
count_cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [33]:
# Give df a fresh 0..n-1 index (the old labels move into a column) and rebuild the
# positional lookup tables on it:
#   titles  : position -> title
#   indices : title    -> position (a duplicated title maps to several positions)
df = df.reset_index()
titles = df["title"]
indices = pd.Series(range(len(df)), index=titles)

In [34]:
# cos_sim_df = pd.DataFrame(count_cosine_sim, index=titles, columns=titles)
# cos_sim_df.to_csv("soup_similarities.csv")

In [35]:
get_recommendations('The Dark Knight', count_cosine_sim)[:10]

[['Batman Begins', 0.3082056514117678],
 ['The Dark Knight Rises', 0.29090909090909095],
 ['Batman: Mask of the Phantasm', 0.2878806424034328],
 ['The Batman', 0.23636363636363641],
 ['Batman & Mr. Freeze: SubZero', 0.23354968324845696],
 ['SPL: Kill Zone', 0.22492862284621332],
 ['Batman: Under the Red Hood', 0.21635274585338146],
 ['Kick-Ass 2', 0.20759971844307284],
 ['The Black Dahlia', 0.20759971844307284],
 ['Fear Over the City', 0.20031323433445378]]

In [36]:
# Export popularity keyed by title. The frame is built from a bare array, so its
# single column carries the default integer label 0 in the CSV header.
popularity_df = pd.DataFrame(df["popularity"].values, index=titles)
popularity_df.to_csv("popularity.csv")

In [39]:
# Export poster paths keyed by title. The frame is built from a bare array, so its
# single column carries the default integer label 0 in the CSV header.
# poster_path was not part of the earlier dropna subset, so it may contain missing values.
poster_df = pd.DataFrame(df["poster_path"].values,index = titles)
poster_df.to_csv("poster.csv")