In [1]:
import pandas as pd
import csv
import numpy
import seaborn as sns
import matplotlib.pyplot as plt

### Read "title.ratings.tsv" file

In [2]:
title_ratings=pd.read_csv("title.ratings.tsv", sep='\t')

In [3]:
title_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1647
1,tt0000002,6.1,198
2,tt0000003,6.5,1345
3,tt0000004,6.2,120
4,tt0000005,6.2,2131


### Read "title.basics.tsv" file

In [4]:
title_basics=pd.read_csv("title.basics.tsv", sep='\t')
title_basics=title_basics.drop_duplicates()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
title_basics=title_basics[['titleType','tconst','primaryTitle', 'originalTitle', 'startYear']]
title_basics=title_basics[title_basics.titleType=='movie']
title_basics=title_basics[title_basics.startYear.apply(lambda x: str(x).isnumeric())]
title_basics.head()

Unnamed: 0,titleType,tconst,primaryTitle,originalTitle,startYear
8,movie,tt0000009,Miss Jerry,Miss Jerry,1894
144,movie,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897
331,movie,tt0000335,Soldiers of the Cross,Soldiers of the Cross,1900
498,movie,tt0000502,Bohemios,Bohemios,1905
570,movie,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906


### Merge "title.ratings.tsv" and "title.basics.tsv" 

In [6]:
ratings_and_titles=pd.merge(title_ratings.set_index('tconst'), title_basics.set_index('tconst'), left_index=True, right_index=True, how='inner')
ratings_and_titles=ratings_and_titles.drop_duplicates()

In [7]:
ratings_and_titles.head()

Unnamed: 0_level_0,averageRating,numVotes,titleType,primaryTitle,originalTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0000009,5.9,155,movie,Miss Jerry,Miss Jerry,1894
tt0000147,5.2,357,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897
tt0000335,6.1,41,movie,Soldiers of the Cross,Soldiers of the Cross,1900
tt0000502,3.8,6,movie,Bohemios,Bohemios,1905
tt0000574,6.1,590,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,1906


### Read netflix_titles.csv

In [8]:
netflix_titles=pd.read_csv("netflix_titles.csv", index_col="show_id")

#### Drop rows without release_year

In [9]:
netflix_titles=netflix_titles.dropna(subset=['release_year'])

#### Change release_year column to integer

In [10]:
netflix_titles.release_year=netflix_titles.release_year.astype(numpy.int64)

#### Drop rows in ratings_and_titles with non-numeric values for startYear and convert to integer

In [11]:
ratings_and_titles=ratings_and_titles[ratings_and_titles.startYear.apply(lambda x: str(x).isnumeric())]

In [12]:
ratings_and_titles.startYear=ratings_and_titles.startYear.astype(numpy.int64)

#### Convert titles to lowercase

In [13]:
netflix_titles['title']=netflix_titles['title'].str.lower()
ratings_and_titles['originalTitle']=ratings_and_titles['originalTitle'].str.lower()
ratings_and_titles['primaryTitle']=ratings_and_titles['primaryTitle'].str.lower()

### Join netflix titles with IMDb ratings on title name and release year.

In [14]:
##subset movies
netflix_titles=netflix_titles[netflix_titles.type=='Movie']

In [15]:
netflix_titles_rating=pd.merge(netflix_titles, ratings_and_titles, left_on=['title', 'release_year'], right_on=['primaryTitle', 'startYear'], how='inner')

### Sort the obtained data frame by averageRating and number of votes

In [16]:
netflix_titles_rating.sort_values(by=['averageRating', 'numVotes'], inplace=True, ascending=False)

In [17]:
netflix_titles_rating_2000=netflix_titles_rating[netflix_titles_rating.numVotes>2000]

In [18]:
netflix_titles_rating_2000.head(30)

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,averageRating,numVotes,titleType,primaryTitle,originalTitle,startYear
1894,Movie,pulp fiction,Quentin Tarantino,"John Travolta, Samuel L. Jackson, Uma Thurman,...",United States,"January 1, 2019",1994,R,154 min,"Classic Movies, Cult Movies, Dramas",This stylized crime caper weaves together stor...,8.9,1782352,movie,pulp fiction,pulp fiction,1994
1854,Movie,the lord of the rings: the return of the king,Peter Jackson,"Elijah Wood, Ian McKellen, Liv Tyler, Viggo Mo...","New Zealand, United States","January 1, 2020",2003,PG-13,201 min,"Action & Adventure, Sci-Fi & Fantasy",Aragorn is revealed as the heir to the ancient...,8.9,1605940,movie,the lord of the rings: the return of the king,the lord of the rings: the return of the king,2003
2836,Movie,schindler's list,Steven Spielberg,"Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",United States,"April 1, 2018",1993,R,195 min,"Classic Movies, Dramas",Oskar Schindler becomes an unlikely humanitari...,8.9,1184746,movie,schindler's list,schindler's list,1993
1813,Movie,inception,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...","United States, United Kingdom","January 1, 2020",2010,PG-13,148 min,"Action & Adventure, Sci-Fi & Fantasy, Thrillers","In this mind-bending sci-fi thriller, a man ru...",8.8,2006939,movie,inception,inception,2010
740,Movie,the matrix,"Lilly Wachowski, Lana Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",United States,"November 1, 2019",1999,R,136 min,"Action & Adventure, Sci-Fi & Fantasy",A computer hacker learns that what most people...,8.7,1634375,movie,the matrix,the matrix,1999
1855,Movie,the lord of the rings: the two towers,Peter Jackson,"Elijah Wood, Ian McKellen, Liv Tyler, Viggo Mo...","New Zealand, United States","January 1, 2020",2002,PG-13,179 min,"Action & Adventure, Sci-Fi & Fantasy",Frodo and Sam head to Mordor to destroy the On...,8.7,1451316,movie,the lord of the rings: the two towers,the lord of the rings: the two towers,2002
1971,Movie,be here now,Lilibet Foster,Andy Whitfield,"United States, Australia","February 28, 2017",2015,TV-MA,110 min,Documentaries,Trace the brave journey of actor Andy Whitfiel...,8.7,3157,movie,be here now,be here now,2015
1792,Movie,city of god,"Fernando Meirelles, Katia Lund","Alexandre Rodrigues, Leandro Firmino, Phellipe...","Brazil, France, Germany","January 1, 2020",2002,R,130 min,"Dramas, Independent Movies, International Movies","Growing up in a Rio de Janeiro slum, Rocket is...",8.6,686449,movie,city of god,cidade de deus,2002
2180,Movie,gol maal,Hrishikesh Mukherjee,"Amol Palekar, Bindiya Goswami, Deven Verma, Ut...",India,"December 31, 2019",1979,TV-PG,137 min,"Classic Movies, Comedies, International Movies",When circumstances at work compel Ram to prete...,8.6,17423,movie,gol maal,gol maal,1979
2398,Movie,the departed,Martin Scorsese,"Leonardo DiCaprio, Matt Damon, Jack Nicholson,...",United States,"December 1, 2019",2006,R,151 min,"Dramas, Thrillers",Two rookie Boston cops are sent deep undercove...,8.5,1161114,movie,the departed,the departed,2006


In [19]:
netflix_titles_rating_2000.dropna(subset=['director'], inplace=True)
netflix_titles_rating_2000.dropna(subset=['cast'], inplace=True)
netflix_titles_rating_2000.dropna(subset=['country'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_titles_rating_2000.dropna(subset=['director'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_titles_rating_2000.dropna(subset=['cast'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_titles_rating_2000.dropna(subset=['country'], inplace=True)


### Check for NaN values

In [20]:
netflix_titles_rating_2000.isnull().any()

type             False
title            False
director         False
cast             False
country          False
date_added       False
release_year     False
rating           False
duration         False
listed_in        False
description      False
averageRating    False
numVotes         False
titleType        False
primaryTitle     False
originalTitle    False
startYear        False
dtype: bool

In [21]:
nan_vars = netflix_titles_rating_2000.columns[netflix_titles_rating_2000.isnull().any()].tolist()
print(nan_vars)

[]


In [22]:
for variable in nan_vars:
    print(variable, sum(netflix_titles_rating_2000[variable].isnull()))

Now we see that there is no NaN values in our dataset

### Split Genres 

In [23]:
from itertools import chain

# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split(',')))

# calculate lengths of splits
lens = netflix_titles_rating_2000.head(2000)['listed_in'].str.split(',').map(len)

# create new dataframe, repeating or chaining as appropriate
res = pd.DataFrame({'title': numpy.repeat(netflix_titles_rating_2000.head(2000)['title'], lens),
                    'listed_in': chainer(netflix_titles_rating_2000.head(2000)['listed_in']),
                    })
res['listed_in']=res['listed_in'].str.strip()

print(res)

                                              title             listed_in
1894                                   pulp fiction        Classic Movies
1894                                   pulp fiction           Cult Movies
1894                                   pulp fiction                Dramas
1854  the lord of the rings: the return of the king    Action & Adventure
1854  the lord of the rings: the return of the king      Sci-Fi & Fantasy
...                                             ...                   ...
765                                      himmatwala    Action & Adventure
765                                      himmatwala              Comedies
765                                      himmatwala  International Movies
1915                 justin bieber: never say never         Documentaries
1915                 justin bieber: never say never      Music & Musicals

[3336 rows x 2 columns]


#### Show Top Genres

In [24]:
top_genres=res['listed_in'].value_counts()
top_genres

Dramas                      664
International Movies        511
Comedies                    477
Action & Adventure          338
Independent Movies          260
Thrillers                   236
Romantic Movies             152
Horror Movies               134
Sci-Fi & Fantasy            131
Children & Family Movies    116
Documentaries                62
Music & Musicals             57
Classic Movies               52
Sports Movies                48
Cult Movies                  42
LGBTQ Movies                 24
Anime Features               16
Faith & Spirituality         11
Stand-Up Comedy               5
Name: listed_in, dtype: int64

### Compare means of average ratings per rating

In [25]:
netflix_titles_rating_2000.groupby("rating").agg({"averageRating": ["mean", "std"]}).sort_values(("averageRating", "mean"), ascending=False)

Unnamed: 0_level_0,averageRating,averageRating
Unnamed: 0_level_1,mean,std
rating,Unnamed: 1_level_2,Unnamed: 2_level_2
TV-G,7.3,0.424264
R,6.542721,0.998833
G,6.527778,1.388527
NR,6.494828,0.921749
TV-PG,6.490411,1.319213
PG-13,6.426582,1.042736
TV-14,6.357732,1.189708
UR,6.28,0.944458
TV-MA,6.226836,1.014868
PG,6.197297,1.049109


# Movie ratings across different timeframes

We want to know if there is a difference between what people liked based on their average ratings on the movies.

To do this, we will group the release_year, which is the year when the movie is produced into an interval of 15 years.

We will bin the release_year into an intervals of 15 by using pandas.cut().

### Group release_year into an interval of 10 years

In [26]:
bins = [1941,1957,1973,1989,2005,2020]
labels = ['1941-1956','1957-1972','1973-1988','1989-2004','2005-2020']
netflix_titles_rating_2000['year_interval_15'] = pd.cut(netflix_titles_rating_2000['release_year'], bins=bins, labels=labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_titles_rating_2000['year_interval_15'] = pd.cut(netflix_titles_rating_2000['release_year'], bins=bins, labels=labels)


In [27]:
netflix_titles_rating_2000['year_interval_15'].value_counts()

2005-2020    1181
1989-2004     231
1973-1988      40
1957-1972      18
1941-1956       4
Name: year_interval_15, dtype: int64

After grouping the release_year, we get the average of the averageRatings per year_interval_15.

### Compare means of averageRating per year range

In [28]:
netflix_titles_rating_2000.groupby("year_interval_15").agg({"averageRating": ["mean","std"]})

Unnamed: 0_level_0,averageRating,averageRating
Unnamed: 0_level_1,mean,std
year_interval_15,Unnamed: 1_level_2,Unnamed: 2_level_2
1941-1956,7.575,0.125831
1957-1972,7.405556,0.653022
1973-1988,7.255,0.879671
1989-2004,6.654545,0.969609
2005-2020,6.291702,1.063652


We want to know if there is a difference of ratings between recent movies and older movies in span of 15 years interval.

We will get the point estimate of the group year range from 1989-2004 and 2005-2020

### Difference between year range from 1989-2004 to 2005-2020

In [29]:
6.654545 - 6.291702

0.3628429999999998

Based on our data, we see that there is 0.36 difference of ratings of movies between 1989-2004 and 2005-2020.


To see if there is a significant difference between the two grouped year intervals, we will use T-test unpaired observation.


We set up our hypotheses as follows:

$H_0$ (null hypothesis): There is no true difference between the two grouped year intervals.

$H_A$ (alternative hypothesis): There is a true difference between the two grouped year intervals.

Now, we can use a $t$-test to compare the two means from the unpaired groups. This function assumes that the null hypothesis is that the difference between the two means is 0, while the alternative hypothesis is that the diference between them is not 0. We set the `equal_var` parameter to `False` because we don't want to assume that the population has equal variances.

Now using a t-test to compare the two groups:

### Find the statistics of two groups (1989-2004 and 2005-2020)

In [30]:
from scipy.stats import ttest_ind
ttest_ind(netflix_titles_rating_2000[netflix_titles_rating_2000["year_interval_15"] == "2005-2020"]["averageRating"],
          netflix_titles_rating_2000[netflix_titles_rating_2000["year_interval_15"] == "1989-2004"]["averageRating"],
          equal_var = False)

Ttest_indResult(statistic=-5.117153927143217, pvalue=5.14440408750733e-07)

With a 95% confidence level,

the result shows that the p-value is less than 0.05. This means that we accept the null hypothesis, which is there is no significant difference between the two grouped year intervals.

However, if we look back to the count of the grouped year intervals' values used, they are not close in values because there is more movies produced and rated in the recent years than the older ones.

# Recommender Systems - Content-based Filtering on multiple factors

This Recommender System uses multiple factors to recommend other movies using content-based filtering.

First, we copy the dataframe into another to avoid disruption of data since we will be cleaning in a different way to manipulate the dataframe.

In [31]:
recom_netflix_2000 = netflix_titles_rating_2000.copy()

We define a function that cleans the data which removes the spaces in the words.

In [32]:
def clean_data(x):
        return str.lower(x.replace(" ", ""))

We set up the factors to filter by the title, director, cast, listed_in(genre), and description of the movies

In [33]:
features=['title','director','cast','listed_in','description']
recom_netflix_2000=recom_netflix_2000[features]

recom_netflix_2000.head()

Unnamed: 0,title,director,cast,listed_in,description
1894,pulp fiction,Quentin Tarantino,"John Travolta, Samuel L. Jackson, Uma Thurman,...","Classic Movies, Cult Movies, Dramas",This stylized crime caper weaves together stor...
1854,the lord of the rings: the return of the king,Peter Jackson,"Elijah Wood, Ian McKellen, Liv Tyler, Viggo Mo...","Action & Adventure, Sci-Fi & Fantasy",Aragorn is revealed as the heir to the ancient...
2836,schindler's list,Steven Spielberg,"Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...","Classic Movies, Dramas",Oskar Schindler becomes an unlikely humanitari...
1813,inception,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...","Action & Adventure, Sci-Fi & Fantasy, Thrillers","In this mind-bending sci-fi thriller, a man ru..."
740,the matrix,"Lilly Wachowski, Lana Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...","Action & Adventure, Sci-Fi & Fantasy",A computer hacker learns that what most people...


We then clean the dataframe with the factors defined using the clean_data function defined earlier.

In [34]:
for feature in features:
    recom_netflix_2000[feature] = recom_netflix_2000[feature].apply(clean_data)
    
recom_netflix_2000[feature].head()

1894    thisstylizedcrimecaperweavestogetherstoriesfea...
1854    aragornisrevealedastheheirtotheancientkingsash...
2836    oskarschindlerbecomesanunlikelyhumanitarian,sp...
1813    inthismind-bendingsci-fithriller,amanrunsanesp...
740     acomputerhackerlearnsthatwhatmostpeopleperceiv...
Name: description, dtype: object

We define a function that concatenates the data into one string and separate them into spaces

In [35]:
def create_soup(x):
    return x['title']+ ' ' + x['director'] + ' ' + x['cast'] + ' ' +x['listed_in']+' '+ x['description']

We apply the defined function to the dataframe

In [36]:
recom_netflix_2000['soup'] = recom_netflix_2000.apply(create_soup, axis=1)

recom_netflix_2000['soup'].head()

1894    pulpfiction quentintarantino johntravolta,samu...
1854    thelordoftherings:thereturnoftheking peterjack...
2836    schindler'slist stevenspielberg liamneeson,ben...
1813    inception christophernolan leonardodicaprio,jo...
740     thematrix lillywachowski,lanawachowski keanure...
Name: soup, dtype: object

We import functions to compute for the cosine similarity

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(recom_netflix_2000['soup'])

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [39]:
recom_netflix_2000=recom_netflix_2000.reset_index()
indices = pd.Series(recom_netflix_2000.index, index=recom_netflix_2000['title'])

We define a function that will return the top 10 recommended movies based on the multiple factors that we used for content-based filtering

In [40]:
def get_recommendations_new(title, cosine_sim=cosine_sim2):
    title=title.replace(' ','').lower()
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return recom_netflix_2000['title'].iloc[movie_indices]

In [41]:
get_recommendations_new('3 idiots', cosine_sim2)

55                    pk
37         rangdebasanti
343              talaash
121                sanju
779     ferrarikisawaari
1121      ekmainaurekktu
496        dildhadaknedo
1188                zero
60          dilchahtahai
545         tanuwedsmanu
Name: title, dtype: object

In [42]:
get_recommendations_new('pulp fiction', cosine_sim2)

111                thehatefuleight
210                    jackiebrown
287                    coachcarter
293                    meanstreets
304    alicedoesn'tlivehereanymore
25                      taxidriver
244                         carrie
247                       truegrit
82                catonahottinroof
411                   theinterview
Name: title, dtype: object