# LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 200

## CONTENT

In [3]:
#############################
# Content Filtering
#############################
# Recommendation System According to Movie Overviews
#############################

# Step 1: CREATING THE TF-IDF MATRIX
# Step 2: CREATING THE COSINE SIMILARITY MATRIX
# Step 3: SUGGEST FILMS THAT ARE MOST SIMILAR TO A FILM




In [17]:
#################################
# 1. CREATING THE TF-IDF MATRIX
#################################

import pandas as pd
pd.set_option('display.max_columns', 30)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [18]:
df = pd.read_csv("movies_metadata.csv", low_memory=False)  # DtypeWarning kapamak icin
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circum...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]",,8844,tt0113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- in...",17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [19]:
df.shape

(45466, 24)

In [20]:
df["overview"].head()

0    Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circum...
1    When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- in...
2    A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming t...
3    Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, ...
4    Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning 

In [23]:

#################################
# 2: tf-idf method
#################################

# TF-IDF = TF(t) * IDF(t)
# TF(t) = (Frequency of observation in a related document) / (Total number of terms in the document) (term frequency)
# IDF(t) = log_e(Total number of documents / Number of documents with t term in them) (inverse document frequency)

In [24]:
df['overview'].head()

0    Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circum...
1    When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- in...
2    A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming t...
3    Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, ...
4    Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning 

In [25]:
df['overview'] = df['overview'].fillna('')

In [26]:
df['overview']

0        Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circum...
1        When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- in...
2        A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming t...
3        Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, ...
4        Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting t

In [27]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])

In [28]:
tfidf_matrix.shape

(45466, 75827)

In [33]:
print(tfidf_matrix)

  (0, 17764)	0.13483149538639247
  (0, 4388)	0.1474882034218405
  (0, 38030)	0.10142919482788751
  (0, 21887)	0.10438761058719498
  (0, 19641)	0.13281884272823927
  (0, 48558)	0.10339358185033234
  (0, 59519)	0.13008016104455086
  (0, 12490)	0.12544427954397822
  (0, 51108)	0.13434817283119177
  (0, 29238)	0.10093917370354445
  (0, 50914)	0.09190797940163035
  (0, 39423)	0.11907123344715953
  (0, 1847)	0.140911774178889
  (0, 58571)	0.1135591886873686
  (0, 38693)	0.20627924682810617
  (0, 9874)	0.5028038686135609
  (0, 9087)	0.10635375129287977
  (0, 7491)	0.12380553184830104
  (0, 56872)	0.111248510865236
  (0, 28729)	0.13311522181618415
  (0, 39012)	0.08718689178959059
  (0, 67874)	0.14878284660693247
  (0, 3159)	0.41178365711725945
  (0, 73468)	0.4809827114790237
  (0, 38088)	0.10739705953465473
  :	:
  (45464, 26957)	0.0735096263170162
  (45464, 18919)	0.09271509240923416
  (45464, 18119)	0.07466631763708825
  (45464, 39012)	0.0682961777913538
  (45465, 16520)	0.32373307886945113


In [29]:
df['title'].shape

(45466,)

In [30]:
df['title'].head()

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [34]:
#################################
# 2. COSINE SIMILARITY MATRIX
#################################
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [35]:
cosine_sim.shape

(45466, 45466)

In [39]:
cosine_sim[0:4]

array([[1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
        0.        ],
       [0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
        0.00929411],
       [0.        , 0.04681953, 1.        , ..., 0.        , 0.01402548,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00952214,
        0.01641271]])

In [40]:
#################################
# 3. Recommend the most similar movies
#################################

df = df[~df["title"].isna()]


In [42]:
df["title"].head()

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [48]:
indices = pd.Series(df.index, index=df['title'])

In [49]:
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45461
Century of Birthing            45462
Betrayal                       45463
Satan Triumphant               45464
Queerama                       45465
Length: 45460, dtype: int64

In [50]:
indices.count()

45460

In [51]:
indices = indices[~indices.index.duplicated(keep='last')]

In [53]:
indices.shape

(42277,)

In [54]:
indices[:10]

title
Toy Story                       0
Jumanji                         1
Grumpier Old Men                2
Waiting to Exhale               3
Father of the Bride Part II     4
Tom and Huck                    7
Sudden Death                    8
GoldenEye                       9
The American President         10
Dracula: Dead and Loving It    11
dtype: int64

In [55]:
indices["Sherlock Holmes"]

35116

In [56]:
movie_index = indices["Sherlock Holmes"]

In [57]:
cosine_sim[movie_index]

array([0.        , 0.00392837, 0.00476764, ..., 0.        , 0.0067919 ,
       0.        ])

In [58]:
similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns=["score"])

In [60]:
similarity_scores.head()

Unnamed: 0,score
0,0.0
1,0.003928
2,0.004768
3,0.0
4,0.0


In [61]:
movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index

In [147]:
movie_indices

Int64Index([34737, 14821, 34750, 9743, 4434, 29706, 18258, 24665, 6432, 29154], dtype='int64')

In [148]:
df['title'].head()

0    Toy Story
1    Toy Story
2    Toy Story
3    Toy Story
4    Toy Story
Name: title, dtype: object

In [64]:
df['title'].iloc[movie_indices]

34741                           Vanished Empire
14821                         The Royal Scandal
34754                        Helen the Baby Fox
9743                The Seven-Per-Cent Solution
4434                             Without a Clue
29710           Doug Stanhope: Beer Hall Putsch
18258        Sherlock Holmes: A Game of Shadows
24667    Ultramarines: A Warhammer 40,000 Movie
6432        The Private Life of Sherlock Holmes
29156                               The Village
Name: title, dtype: object

# FUNCTIONS

In [66]:

def content_based_recommender(title, cosine_sim, dataframe):

    dataframe = dataframe[~dataframe["title"].isna()]
    indices = pd.Series(dataframe.index, index=dataframe['title'])
    indices = indices[~indices.index.duplicated(keep='last')]

    movie_index = indices[title]

    similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns=["score"])

    movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index
    return dataframe['title'].iloc[movie_indices]

In [68]:
content_based_recommender("The Godfather", cosine_sim, df)

1178      The Godfather: Part II
44036              Clown Service
1914     The Godfather: Part III
23128               Free to Play
11297           Household Saints
34721      The Brides Are Coming
10821                   Election
38036      Dr. Kildare Goes Home
17729          Short Sharp Shock
26295                Weary River
Name: title, dtype: object

In [69]:
content_based_recommender('The Dark Knight Rises', cosine_sim, df)

12481                       The Dark Knight
150                          Batman Forever
1328                         Batman Returns
15511            Batman: Under the Red Hood
585                                  Batman
21196                           Ghoulies IV
9230     Batman Beyond: Return of the Joker
18035                      Batman: Year One
19794                   Beloved Berlin Wall
3095           Batman: Mask of the Phantasm
Name: title, dtype: object

In [71]:

def calculate_cosine_sim(dataframe):
    tfidf = TfidfVectorizer(stop_words='english')
    dataframe['overview'] = dataframe['overview'].fillna('')
    tfidf_matrix = tfidf.fit_transform(dataframe['overview'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return cosine_sim

In [72]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
df = pd.read_csv("movies_metadata.csv", low_memory=False)

cosine_sim = calculate_cosine_sim(df)


In [73]:
content_based_recommender("The Godfather", cosine_sim, df)

1178      The Godfather: Part II
44036              Clown Service
1914     The Godfather: Part III
23128               Free to Play
11297           Household Saints
34721      The Brides Are Coming
10821                   Election
38036      Dr. Kildare Goes Home
17729          Short Sharp Shock
26295                Weary River
Name: title, dtype: object

# II CASE Item-Based Collaborative Filtering (Item-Item Filtering)

# Objective : User - Product Matrix
## Sparseness is a serious problem here

In [143]:
#######################################
# Item-Based Collaborative Filtering (Item-Item Filtering)
#######################################

# Step 1: Preparing the Data Set
# Step 2: Creating the User Movie Df
# Step 3: Making Item-Based Film Suggestions Based on Correlation
# Step 4: Functionalization of Transactions

###################################
# Step 1: Preparing the Data Set
###################################

In [75]:
# Developing a recommendation system over films that have similar rates/grades.

import pandas as pd
pd.set_option('display.max_columns', 20)

movie = pd.read_csv('movie.csv')


In [76]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [77]:
rating = pd.read_csv('rating.csv')

In [151]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [79]:
df = movie.merge(rating, how="left", on="movieId")

In [80]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [81]:

#################
# title
#################

df['year_movie'] = df.title.str.extract('(\(\d\d\d\d\))', expand=False)


In [82]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year_movie
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47,(1995)
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52,(1995)
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51,(1995)
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47,(1995)
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41,(1995)


In [83]:
df['year_movie'] = df.year_movie.str.extract('(\d\d\d\d)', expand=False)

In [85]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year_movie
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47,1995
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52,1995
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51,1995
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47,1995
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41,1995


In [84]:
df['title'] = df.title.str.replace('(\(\d\d\d\d\))', '')

In [86]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year_movie
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47,1995
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52,1995
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51,1995
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47,1995
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41,1995


In [87]:
df['title'] = df['title'].apply(lambda x: x.strip())

In [88]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,year_movie
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47,1995
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52,1995
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51,1995
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47,1995
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41,1995


In [91]:
df.shape

(20000797, 7)

In [90]:
#################
# genres
#################

df["genre"] = df["genres"].apply(lambda x: x.split("|")[0])
df.drop("genres", inplace=True, axis=1)
df.head()


Unnamed: 0,movieId,title,userId,rating,timestamp,year_movie,genre
0,1,Toy Story,3.0,4.0,1999-12-11 13:36:47,1995,Adventure
1,1,Toy Story,6.0,5.0,1997-03-13 17:50:52,1995,Adventure
2,1,Toy Story,8.0,4.0,1996-06-05 13:37:51,1995,Adventure
3,1,Toy Story,10.0,4.0,1999-11-25 02:44:47,1995,Adventure
4,1,Toy Story,11.0,4.5,2009-01-02 01:13:41,1995,Adventure


In [92]:
#################
# timestamp
#################

df.info()

df["timestamp"] = pd.to_datetime(df["timestamp"], format='%Y-%m-%d')
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000797 entries, 0 to 20000796
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   movieId     int64  
 1   title       object 
 2   userId      float64
 3   rating      float64
 4   timestamp   object 
 5   year_movie  object 
 6   genre       object 
dtypes: float64(2), int64(1), object(4)
memory usage: 1.2+ GB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000797 entries, 0 to 20000796
Data columns (total 7 columns):
 #   Column      Dtype         
---  ------      -----         
 0   movieId     int64         
 1   title       object        
 2   userId      float64       
 3   rating      float64       
 4   timestamp   datetime64[ns]
 5   year_movie  object        
 6   genre       object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 1.2+ GB


In [93]:

df["year"] = df["timestamp"].dt.year
df["month"] = df["timestamp"].dt.month
df["day"] = df["timestamp"].dt.day
df.head()


Unnamed: 0,movieId,title,userId,rating,timestamp,year_movie,genre,year,month,day
0,1,Toy Story,3.0,4.0,1999-12-11 13:36:47,1995,Adventure,1999.0,12.0,11.0
1,1,Toy Story,6.0,5.0,1997-03-13 17:50:52,1995,Adventure,1997.0,3.0,13.0
2,1,Toy Story,8.0,4.0,1996-06-05 13:37:51,1995,Adventure,1996.0,6.0,5.0
3,1,Toy Story,10.0,4.0,1999-11-25 02:44:47,1995,Adventure,1999.0,11.0,25.0
4,1,Toy Story,11.0,4.5,2009-01-02 01:13:41,1995,Adventure,2009.0,1.0,2.0


In [94]:

######################################
# Step 2: Creating User Movie Df
######################################

df.shape
df["title"].nunique()
a = pd.DataFrame(df["title"].value_counts())
a.head()



Unnamed: 0,title
Pulp Fiction,67310
Forrest Gump,66172
"Shawshank Redemption, The",63366
"Silence of the Lambs, The",63299
Jurassic Park,59715


In [152]:
a.count()

title    26213
dtype: int64

In [95]:
rare_movies = a[a["title"] <= 1000].index
common_movies = df[~df["title"].isin(rare_movies)]
common_movies.shape
common_movies["title"].nunique()
common_movies.head()

Unnamed: 0,movieId,title,userId,rating,timestamp,year_movie,genre,year,month,day
0,1,Toy Story,3.0,4.0,1999-12-11 13:36:47,1995,Adventure,1999.0,12.0,11.0
1,1,Toy Story,6.0,5.0,1997-03-13 17:50:52,1995,Adventure,1997.0,3.0,13.0
2,1,Toy Story,8.0,4.0,1996-06-05 13:37:51,1995,Adventure,1996.0,6.0,5.0
3,1,Toy Story,10.0,4.0,1999-11-25 02:44:47,1995,Adventure,1999.0,11.0,25.0
4,1,Toy Story,11.0,4.5,2009-01-02 01:13:41,1995,Adventure,2009.0,1.0,2.0


In [97]:
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

In [98]:
user_movie_df.shape


(138493, 3134)

In [99]:
user_movie_df.head(10)



title,"'burbs, The",(500) Days of Summer,*batteries not included,...And Justice for All,10 Things I Hate About You,"10,000 BC",101 Dalmatians,101 Dalmatians (One Hundred and One Dalmatians),102 Dalmatians,12 Angry Men,...,Zero Dark Thirty,Zero Effect,Zodiac,Zombieland,Zoolander,Zulu,[REC],eXistenZ,xXx,¡Three Amigos!
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
6.0,,,,,,,,,,,...,,,,,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,2.0
8.0,,,,,,,,,,,...,,,,,,,,,,
9.0,,,,,,,,,,,...,,,,,,,,,,
10.0,,,,,,,,,,,...,,,,,,,,,,


In [100]:
user_movie_df.columns

Index([''burbs, The', '(500) Days of Summer', '*batteries not included',
       '...And Justice for All', '10 Things I Hate About You', '10,000 BC',
       '101 Dalmatians', '101 Dalmatians (One Hundred and One Dalmatians)',
       '102 Dalmatians', '12 Angry Men',
       ...
       'Zero Dark Thirty', 'Zero Effect', 'Zodiac', 'Zombieland', 'Zoolander',
       'Zulu', '[REC]', 'eXistenZ', 'xXx', '¡Three Amigos!'],
      dtype='object', name='title', length=3134)

In [103]:
len(user_movie_df.columns)

3134

In [104]:
common_movies["title"].nunique()

3134

In [105]:

######################################
# Step 3: Making Item-Based Movie Suggestions Based on Correlation
######################################

movie = user_movie_df["Matrix, The"]

user_movie_df.corrwith(movie).sort_values(ascending=False).head(10)


title
Matrix, The                                           1.000000
Matrix Reloaded, The                                  0.516906
Matrix Revolutions, The                               0.449588
Animatrix, The                                        0.367151
Blade                                                 0.334493
Terminator 2: Judgment Day                            0.333882
Minority Report                                       0.332434
Edge of Tomorrow                                      0.326762
Mission: Impossible                                   0.320815
Lord of the Rings: The Fellowship of the Ring, The    0.318726
dtype: float64

In [109]:
######################################
 Functionalization
######################################

def create_user_movie_df():
    import pandas as pd
    movie = pd.read_csv('movie.csv')
    rating = pd.read_csv('rating.csv')
    df = movie.merge(rating, how="left", on="movieId")
    df['title'] = df.title.str.replace('(\(\d\d\d\d\))', '')
    df['title'] = df['title'].apply(lambda x: x.strip())
    a = pd.DataFrame(df["title"].value_counts())
    rare_movies = a[a["title"] <= 1000].index
    common_movies = df[~df["title"].isin(rare_movies)]
    user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
    return user_movie_df
