In [108]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [109]:
imdb_ratings_df = pd.read_csv('zippedData/imdb.title.ratings.csv.gz')

In [110]:
title_basics_df = pd.read_csv('zippedData/imdb.title.basics.csv.gz')

In [111]:
movie_gross_df = pd.read_csv('zippedData/bom.movie_gross.csv.gz')

In [112]:
movie_budgets_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')

In [113]:
name_basics_df = pd.read_csv('zippedData/imdb.name.basics.csv.gz')

In [114]:
name_basics_df = name_basics_df.drop(columns = 'death_year')

In [115]:
name_basics_df = name_basics_df.dropna()

In [116]:
name_basics_df.isna().sum()

nconst                0
primary_name          0
birth_year            0
primary_profession    0
known_for_titles      0
dtype: int64

In [117]:
name_basics_df = (name_basics_df.set_index(['nconst', 'primary_name','birth_year'])
   .stack()
   .str.split(',', expand=True)
   .stack()
   .unstack(-2)
   .reset_index(-1, drop=True)
   .reset_index()
)

In [118]:
name_basics_df.head()

Unnamed: 0,nconst,primary_name,birth_year,primary_profession,known_for_titles
0,nm0000002,Lauren Bacall,1924.0,actress,tt0038355
1,nm0000002,Lauren Bacall,1924.0,soundtrack,tt0117057
2,nm0000002,Lauren Bacall,1924.0,,tt0071877
3,nm0000002,Lauren Bacall,1924.0,,tt0037382
4,nm0000003,Brigitte Bardot,1934.0,actress,tt0049189


In [119]:
name_basics_df = name_basics_df.rename({'known_for_titles': 'tconst'}, axis='columns')

In [120]:
budget_tconst = pd.merge(title_basics_df, movie_budgets_df , left_on= 'primary_title', right_on= 'movie', how = 'right')

In [121]:
df = pd.merge(budget_tconst,  imdb_ratings_df, on=['tconst'])

In [122]:
df.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes
0,tt1775309,Avatar,Abatâ,2011.0,93.0,Horror,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279",6.1,43
1,tt1298650,Pirates of the Caribbean: On Stranger Tides,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",6.6,447624
2,tt6565702,Dark Phoenix,Dark Phoenix,2019.0,113.0,"Action,Adventure,Sci-Fi",3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",6.0,24451
3,tt2395427,Avengers: Age of Ultron,Avengers: Age of Ultron,2015.0,141.0,"Action,Adventure,Sci-Fi",4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",7.3,665594
4,tt4154756,Avengers: Infinity War,Avengers: Infinity War,2018.0,149.0,"Action,Adventure,Sci-Fi",7,"Apr 27, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200",8.5,670926


In [123]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2875 entries, 0 to 2874
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tconst             2875 non-null   object 
 1   primary_title      2875 non-null   object 
 2   original_title     2875 non-null   object 
 3   start_year         2875 non-null   float64
 4   runtime_minutes    2757 non-null   float64
 5   genres             2867 non-null   object 
 6   id                 2875 non-null   int64  
 7   release_date       2875 non-null   object 
 8   movie              2875 non-null   object 
 9   production_budget  2875 non-null   object 
 10  domestic_gross     2875 non-null   object 
 11  worldwide_gross    2875 non-null   object 
 12  averagerating      2875 non-null   float64
 13  numvotes           2875 non-null   int64  
dtypes: float64(3), int64(2), object(9)
memory usage: 336.9+ KB


In [124]:
def column_conversion(df, col):
    df[col] = df[col].str.replace("$","").str.replace(",","").astype('int64')
    return df

In [125]:
column_to_convert = ['production_budget','domestic_gross','worldwide_gross']

for col in column_to_convert:
    df = column_conversion(df, col)

In [126]:
df.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes
0,tt1775309,Avatar,Abatâ,2011.0,93.0,Horror,1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279,6.1,43
1,tt1298650,Pirates of the Caribbean: On Stranger Tides,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,6.6,447624
2,tt6565702,Dark Phoenix,Dark Phoenix,2019.0,113.0,"Action,Adventure,Sci-Fi",3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350,6.0,24451
3,tt2395427,Avengers: Age of Ultron,Avengers: Age of Ultron,2015.0,141.0,"Action,Adventure,Sci-Fi",4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963,7.3,665594
4,tt4154756,Avengers: Infinity War,Avengers: Infinity War,2018.0,149.0,"Action,Adventure,Sci-Fi",7,"Apr 27, 2018",Avengers: Infinity War,300000000,678815482,2048134200,8.5,670926


In [127]:
df = pd.merge(df, name_basics_df, on=['tconst'])

In [128]:
df.drop(['original_title', 'movie', 'tconst', 'release_date'], axis='columns', inplace = True)

In [139]:
df

Unnamed: 0,primary_title,start_year,runtime_minutes,genres,id,production_budget,domestic_gross,worldwide_gross,averagerating,numvotes,nconst,primary_name,birth_year,primary_profession,soup
0,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0004851,Penélope Cruz,1974.0,actress,410602147.0
2,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0046087,Jennifer Badger,1976.0,stunts,410602147.0
3,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0263740,Steve Evets,1960.0,actor,410602147.0
4,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0273797,Sebastian Armesto,1982.0,assistant_director,410602147.0
6,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",2,410600000,241063875,1045663875,6.6,447624,nm0573618,Kevin McNally,1956.0,actor,410602147.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17427,Raymond Did It,2011.0,83.0,Horror,32,40000,3632,3632,3.3,211,nm3171405,Robert J. Williams,1986.0,editor,42094.0
17429,Foreign Letters,2012.0,99.0,"Comedy,Drama",33,40000,0,0,6.9,180,nm0858008,Ela Thier,1971.0,writer,42111.0
17432,Exeter,2015.0,91.0,"Horror,Mystery,Thriller",49,25000,0,489792,4.5,5156,nm1885051,Nick Nicotera,1981.0,editor,27106.0
17433,Ten,2013.0,118.0,Drama,51,25000,0,0,5.5,31,nm2793783,Colin Burt Vidler,1970.0,actor,27131.0


In [131]:
df.isna().sum()

primary_title            0
start_year               0
runtime_minutes         68
genres                   5
id                       0
production_budget        0
domestic_gross           0
worldwide_gross          0
averagerating            0
numvotes                 0
nconst                   0
primary_name             0
birth_year               0
primary_profession    6444
dtype: int64

In [132]:
df = df.dropna()

In [133]:
df.isna().sum()

primary_title         0
start_year            0
runtime_minutes       0
genres                0
id                    0
production_budget     0
domestic_gross        0
worldwide_gross       0
averagerating         0
numvotes              0
nconst                0
primary_name          0
birth_year            0
primary_profession    0
dtype: int64

In [134]:
df.shape

(10958, 14)

In [135]:
df = df.drop_duplicates()

In [156]:
df['primary_title'].nunique

<bound method IndexOpsMixin.nunique of 0        Pirates of the Caribbean: On Stranger Tides
1        Pirates of the Caribbean: On Stranger Tides
2        Pirates of the Caribbean: On Stranger Tides
3        Pirates of the Caribbean: On Stranger Tides
4        Pirates of the Caribbean: On Stranger Tides
                            ...                     
10953                                 Raymond Did It
10954                                Foreign Letters
10955                                         Exeter
10956                                            Ten
10957                                         Red 11
Name: primary_title, Length: 10958, dtype: object>

In [137]:
df.shape

(10958, 14)

In [145]:
df['soup'] = df['genres'] + df['primary_name'] + df['primary_profession']
df['soup'] = df['soup'].apply(lambda x: ''.join(x))

In [146]:
print(df['soup'])

0             Action,Adventure,FantasyPenélope Cruzactress
2            Action,Adventure,FantasyJennifer Badgerstunts
3                 Action,Adventure,FantasySteve Evetsactor
4        Action,Adventure,FantasySebastian Armestoassis...
6               Action,Adventure,FantasyKevin McNallyactor
                               ...                        
17427                       HorrorRobert J. Williamseditor
17429                          Comedy,DramaEla Thierwriter
17432           Horror,Mystery,ThrillerNick Nicoteraeditor
17433                          DramaColin Burt Vidleractor
17435          Horror,Sci-Fi,ThrillerRacer Rodriguezstunts
Name: soup, Length: 10958, dtype: object


In [147]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [148]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [150]:
df = df.reset_index()
titles = df['primary_title']
indices = pd.Series(df.index, index=df['primary_title'])

In [152]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [158]:
get_recommendations('Ten')

10800                                    The Journey
2603                                     San Andreas
0        Pirates of the Caribbean: On Stranger Tides
1        Pirates of the Caribbean: On Stranger Tides
2        Pirates of the Caribbean: On Stranger Tides
3        Pirates of the Caribbean: On Stranger Tides
4        Pirates of the Caribbean: On Stranger Tides
5        Pirates of the Caribbean: On Stranger Tides
6        Pirates of the Caribbean: On Stranger Tides
7        Pirates of the Caribbean: On Stranger Tides
8        Pirates of the Caribbean: On Stranger Tides
9        Pirates of the Caribbean: On Stranger Tides
10       Pirates of the Caribbean: On Stranger Tides
11       Pirates of the Caribbean: On Stranger Tides
12       Pirates of the Caribbean: On Stranger Tides
13       Pirates of the Caribbean: On Stranger Tides
14       Pirates of the Caribbean: On Stranger Tides
15       Pirates of the Caribbean: On Stranger Tides
16       Pirates of the Caribbean: On Stranger