## Content based

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
movies = pd.read_csv('datasets/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings = pd.read_csv('datasets/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# MovieID üzerinden kişilerin yorumlarına film isimlerini ve genrelerini ekliyoruz. 
df = pd.merge(ratings, movies, how='left', on='movieId')

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [7]:
df.duplicated().sum()


0

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [9]:
print(df[df.title=="Toy Story (1995)"])

       userId  movieId  rating   timestamp             title  \
0           1        1     4.0   964982703  Toy Story (1995)   
516         5        1     4.0   847434962  Toy Story (1995)   
874         7        1     4.5  1106635946  Toy Story (1995)   
1434       15        1     2.5  1510577970  Toy Story (1995)   
1667       17        1     4.5  1305696483  Toy Story (1995)   
...       ...      ...     ...         ...               ...   
97364     606        1     2.5  1349082950  Toy Story (1995)   
98479     607        1     4.0   964744033  Toy Story (1995)   
98666     608        1     2.5  1117408267  Toy Story (1995)   
99497     609        1     3.0   847221025  Toy Story (1995)   
99534     610        1     5.0  1479542900  Toy Story (1995)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
516    Adventure|Animation|Children|Comedy|Fantasy  
874    Adventure|Animation|Children|Comedy|Fantasy  
1434   Adventure|An

In [10]:
df.groupby('title')['rating'].mean()


title
'71 (2014)                                   4.000000
'Hellboy': The Seeds of Creation (2004)      4.000000
'Round Midnight (1986)                       3.500000
'Salem's Lot (2004)                          5.000000
'Til There Was You (1997)                    4.000000
                                               ...   
eXistenZ (1999)                              3.863636
xXx (2002)                                   2.770833
xXx: State of the Union (2005)               2.000000
¡Three Amigos! (1986)                        3.134615
À nous la liberté (Freedom for Us) (1931)    1.000000
Name: rating, Length: 9719, dtype: float64

In [11]:
df.groupby('title')['rating'].mean().sort_values(ascending=False).head()


title
Gena the Crocodile (1969)              5.0
True Stories (1986)                    5.0
Cosmic Scrat-tastrophe (2015)          5.0
Love and Pigeons (1985)                5.0
Red Sorghum (Hong gao liang) (1987)    5.0
Name: rating, dtype: float64

In [12]:
df.groupby('title')['rating'].count().sort_values(ascending=False).head()


title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: rating, dtype: int64

In [13]:
genre_popularity = (df.genres.str.split('|')
                      .explode()
                      .value_counts()
                      .sort_values(ascending=False))
genre_popularity

Drama                 41928
Comedy                39053
Action                30635
Thriller              26452
Adventure             24161
Romance               18124
Sci-Fi                17243
Crime                 16681
Fantasy               11834
Children               9208
Mystery                7674
Horror                 7291
Animation              6988
War                    4859
IMAX                   4145
Musical                4138
Western                1930
Documentary            1219
Film-Noir               870
(no genres listed)       47
Name: genres, dtype: int64

In [14]:
# the function to extract titles 
def extract_title(title):
   year = title[len(title)-5:len(title)-1]
   
   # some movies do not have the info about year in the column title. So, we should take care of the case as well.
   
   if year.isnumeric():
      title_no_year = title[:len(title)-7]
      return title_no_year
   else:
      return title
# the function to extract years
def extract_year(title):
   year = title[len(title)-5:len(title)-1]
   # some movies do not have the info about year in the column title. So, we should take care of the case as well.
   if year.isnumeric():
      return int(year)
   else:
      return np.nan
# change the column name from title to title_year
movies.rename(columns={'title':'title_year'}, inplace=True) 
# remove leading and ending whitespaces in title_year
movies['title_year'] = movies['title_year'].apply(lambda x: x.strip()) 
# create the columns for title and year
movies['title'] = movies['title_year'].apply(extract_title) 
movies['year'] = movies['title_year'].apply(extract_year) 

In [15]:
movies

Unnamed: 0,movieId,title_year,genres,title,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995.0
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995.0
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life: Zero,2017.0
9739,193585,Flint (2017),Drama,Flint,2017.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs: Dead Apple,2018.0


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
tf = TfidfVectorizer(analyzer=lambda s: (c for i in range(1,4)
                                             for c in combinations(s.split('|'), r=i)))
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(9742, 783)

In [17]:
pd.DataFrame(tfidf_matrix.todense(), columns=tf.get_feature_names(), index=movies.title).sample(50, axis=1).sample(50, axis=0)

Unnamed: 0_level_0,"(Animation, Documentary, War)","(Children, Horror, Sci-Fi)","(Action, Mystery)","(Children, Crime, Drama)","(Comedy, Crime, Romance)","(Adventure, Drama, Thriller)","(Action, Mystery, Western)","(Crime, IMAX)","(Sci-Fi, Thriller)","(Comedy, Documentary, Drama)",...,"(Children, War)","(Comedy, Romance)","(Horror, Romance, Sci-Fi)","(Animation, Sci-Fi, War)","(Comedy, Thriller)","(Animation, Documentary)","(Adventure, Children, Comedy)","(Action, Drama, IMAX)","(Mystery, Sci-Fi)","(Action, Horror, Thriller)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Jennifer's Body,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242839,0.0,...,0.0,0.0,0.0,0.0,0.267058,0.0,0.0,0.0,0.0,0.0
Young Guns II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
They Live,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444603,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Secret in Their Eyes, The (El secreto de sus ojos)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fun with Dick and Jane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rocco and His Brothers (Rocco e i suoi fratelli),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blue Exorcist: The Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Beach Blanket Bingo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
World of Tomorrow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Sky Crawlers, The (Sukai kurora)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])
print('Shape:', cosine_sim_df.shape)
cosine_sim_df.sample(10, axis=1).round(2)

Shape: (9742, 9742)


title,What Ever Happened to Baby Jane?,I Am Sam,It's Complicated,Colombiana,North Pole: Open For Christmas,Laputa: Castle in the Sky (Tenkû no shiro Rapyuta),Scooby-Doo Goes Hollywood,No Mercy,"Daytrippers, The","F*ck You, Goethe 2"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Toy Story,0.00,0.00,0.03,0.03,0.30,0.36,0.46,0.00,0.01,0.08
Jumanji,0.00,0.00,0.00,0.05,0.63,0.32,0.11,0.00,0.00,0.00
Grumpier Old Men,0.00,0.00,1.00,0.00,0.00,0.00,0.07,0.00,0.27,0.40
Waiting to Exhale,0.04,0.22,0.59,0.03,0.00,0.00,0.04,0.00,0.46,0.24
Father of the Bride Part II,0.00,0.00,0.40,0.00,0.00,0.00,0.18,0.00,0.11,1.00
...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic,0.00,0.00,0.04,0.03,0.10,0.24,0.20,0.04,0.01,0.11
No Game No Life: Zero,0.00,0.00,0.07,0.00,0.15,0.13,0.31,0.00,0.02,0.17
Flint,0.17,1.00,0.00,0.12,0.00,0.00,0.00,0.00,0.10,0.00
Bungo Stray Dogs: Dead Apple,0.00,0.00,0.00,0.07,0.00,0.20,0.19,0.11,0.00,0.00


In [19]:
def movie_recommendations(i):
    
    """
    i : Movie
        
    """
    ix = cosine_sim_df.loc[:,i].to_numpy().argpartition(range(-1,-10,-1))
    closest = cosine_sim_df.columns[ix[-1:-(10+2):-1]]
    closest = closest.drop(i, errors='ignore')
    return pd.DataFrame(closest).merge(movies[['title', 'genres']]).head(10)

In [20]:
movie_recommendations('Fraktus')

Unnamed: 0,title,genres
0,Andrew Dice Clay: Dice Rules,Comedy
1,Unfinished Business,Comedy
2,Who Is Cletis Tout?,Comedy
3,Dirty Work,Comedy
4,Top Secret!,Comedy
5,Secret Ballot (Raye makhfi),Comedy
6,Dead Man on Campus,Comedy
7,Class,Comedy
8,Betsy's Wedding,Comedy
9,Down and Out in Beverly Hills,Comedy
