# LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 200

## CONTENT

In [4]:
#############################
# Content Filtering
#############################
# Recommendation System According to Movie Overviews
#############################

# Step 1: CREATING THE TF-IDF MATRIX
# Step 2: CREATING THE COSINE SIMILARITY MATRIX
# Step 3: SUGGEST FILMS THAT ARE MOST SIMILAR TO A FILM




In [5]:
#################################
# 1. CREATING THE TF-IDF MATRIX
#################################

import pandas as pd
pd.set_option('display.max_columns', 30)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [6]:
df = pd.read_csv("movies_metadata.csv", low_memory=False)  # low_memory=False: to suppress the DtypeWarning
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circum...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]",,8844,tt0113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- in...",17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [7]:
df.shape

(45466, 24)

In [8]:
df["overview"].head()

0    Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circum...
1    When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- in...
2    A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming t...
3    Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, ...
4    Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning 

In [9]:

#################################
# 2: tf-idf method
#################################

# TF-IDF = TF(t) * IDF(t)
# TF(t) = (Number of times term t occurs in a document) / (Total number of terms in the document) (term frequency)
# IDF(t) = log_e(Total number of documents / Number of documents with t term in them) (inverse document frequency)

In [10]:
df['overview'].head()

0    Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circum...
1    When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- in...
2    A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming t...
3    Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, ...
4    Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning 

In [11]:
df['overview'] = df['overview'].fillna('')

In [12]:
df['overview']

0        Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circum...
1        When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- in...
2        A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming t...
3        Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, ...
4        Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting t

In [13]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])

In [14]:
tfidf_matrix.shape

(45466, 75827)

In [16]:
print(tfidf_matrix)

  (0, 17764)	0.13483149538639247
  (0, 4388)	0.1474882034218405
  (0, 38030)	0.10142919482788751
  (0, 21887)	0.10438761058719498
  (0, 19641)	0.13281884272823927
  (0, 48558)	0.10339358185033234
  (0, 59519)	0.13008016104455086
  (0, 12490)	0.12544427954397822
  (0, 51108)	0.13434817283119177
  (0, 29238)	0.10093917370354445
  (0, 50914)	0.09190797940163035
  (0, 39423)	0.11907123344715953
  (0, 1847)	0.140911774178889
  (0, 58571)	0.1135591886873686
  (0, 38693)	0.20627924682810617
  (0, 9874)	0.5028038686135609
  (0, 9087)	0.10635375129287977
  (0, 7491)	0.12380553184830104
  (0, 56872)	0.111248510865236
  (0, 28729)	0.13311522181618415
  (0, 39012)	0.08718689178959059
  (0, 67874)	0.14878284660693247
  (0, 3159)	0.41178365711725945
  (0, 73468)	0.4809827114790237
  (0, 38088)	0.10739705953465473
  :	:
  (45464, 26957)	0.07350962631701621
  (45464, 18919)	0.09271509240923419
  (45464, 18119)	0.07466631763708827
  (45464, 39012)	0.06829617779135382
  (45465, 16520)	0.3237330788694511

In [17]:
df['title'].shape

(45466,)

In [18]:
df['title'].head()

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [19]:
type(tfidf_matrix)

scipy.sparse.csr.csr_matrix

In [25]:
#tfidf_matrix.getcol(0)

<45466x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [26]:
# Even with float32 instead of float64, the amount of memory needed is still too large
tfidf_matrix_norm = tfidf_matrix.astype(np.float32)

In [28]:
print(tfidf_matrix_norm)

  (0, 1847)	0.14091177
  (0, 3159)	0.41178367
  (0, 4388)	0.1474882
  (0, 7491)	0.12380553
  (0, 9087)	0.10635375
  (0, 9874)	0.50280386
  (0, 12490)	0.12544428
  (0, 17764)	0.13483149
  (0, 19641)	0.13281885
  (0, 21887)	0.10438761
  (0, 28729)	0.13311522
  (0, 29238)	0.10093918
  (0, 38030)	0.101429194
  (0, 38088)	0.10739706
  (0, 38693)	0.20627925
  (0, 39012)	0.087186895
  (0, 39423)	0.11907123
  (0, 48558)	0.103393584
  (0, 50914)	0.09190798
  (0, 51108)	0.13434817
  (0, 56872)	0.11124851
  (0, 58571)	0.11355919
  (0, 59519)	0.13008016
  (0, 67874)	0.14878285
  (0, 73468)	0.48098272
  :	:
  (45464, 67646)	0.11003492
  (45464, 67836)	0.06091759
  (45464, 71311)	0.14487898
  (45464, 72980)	0.05915209
  (45465, 529)	0.19370638
  (45465, 740)	0.19703211
  (45465, 3912)	0.22202763
  (45465, 4443)	0.3118343
  (45465, 7157)	0.2914932
  (45465, 11313)	0.14949019
  (45465, 15834)	0.24236871
  (45465, 16520)	0.3237331
  (45465, 17272)	0.2019167
  (45465, 17977)	0.14001141
  (45465, 22285)	

In [None]:
#################################
# 2. COSINE SIMILARITY MATRIX
#################################
# NOTE(review): tfidf_matrix has 45,466 rows, so an all-pairs result is 45,466 x 45,466.
# Assuming cosine_similarity returns a dense array (its default), that is roughly 8 GB
# in float32 and twice that in float64 — confirm the kernel has enough memory.
#cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim = cosine_similarity(tfidf_matrix_norm, tfidf_matrix_norm)

In [None]:
cosine_sim.shape

In [None]:
cosine_sim[0:4]

In [None]:
#################################
# 3. Recommend the most similar movies
#################################

df = df[~df["title"].isna()]


In [None]:
df["title"].head()

In [None]:
indices = pd.Series(df.index, index=df['title'])

In [None]:
indices

In [None]:
indices.count()

In [None]:
indices = indices[~indices.index.duplicated(keep='last')]

In [None]:
indices.shape

In [None]:
indices[:10]

In [None]:
indices["Sherlock Holmes"]

In [None]:
movie_index = indices["Sherlock Holmes"]

In [None]:
cosine_sim[movie_index]

In [None]:
similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns=["score"])

In [None]:
similarity_scores.head()

In [None]:
movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index

In [None]:
movie_indices

In [None]:
df['title'].head()

In [None]:
# movie_indices holds cosine_sim row numbers, which equal df's original index labels.
# df had its NaN-title rows dropped without a reset, so the lookup must be label-based;
# reindex keeps the ranking order and dropna discards rows that no longer exist in df.
df['title'].reindex(movie_indices).dropna()

# FUNCTIONS

In [None]:

def content_based_recommender(title, cosine_sim, dataframe):
    """Return the titles of the (up to) ten movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title to look up. If several rows share the title, the
        last one is used.
    cosine_sim : array-like of shape (n, n)
        Pairwise similarity matrix. Row ``i`` must describe the row of
        ``dataframe`` whose index label is ``i``.
    dataframe : pandas.DataFrame
        Movie table with a ``title`` column. It is not modified.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar, indexed by row label.
        The query movie itself and rows without a title are never returned.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``dataframe``.
    """
    titled = dataframe[dataframe["title"].notna()]

    # Map each title to its row label; for duplicated titles keep the last row.
    indices = pd.Series(titled.index, index=titled["title"])
    indices = indices[~indices.index.duplicated(keep="last")]
    movie_index = indices[title]

    # Position i of this Series is row i of cosine_sim, i.e. index label i.
    scores = pd.Series(cosine_sim[movie_index])

    # Exclude the query movie explicitly (it is not guaranteed to sort first, e.g. when
    # its similarity vector is all zeros) and any row whose title was missing.
    is_candidate = scores.index.isin(titled.index) & (scores.index != movie_index)
    top_labels = scores[is_candidate].sort_values(ascending=False).index[:10]

    # Label-based lookup: after the NaN filter, positions no longer match labels.
    return titled.loc[top_labels, "title"]

In [None]:
content_based_recommender("The Godfather", cosine_sim, df)

In [None]:
content_based_recommender('The Dark Knight Rises', cosine_sim, df)

In [None]:

def calculate_cosine_sim(dataframe):
    """Compute pairwise cosine similarity between the movie overviews.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Movie table with an ``overview`` text column. Missing overviews are
        treated as empty strings. The frame itself is not modified.

    Returns
    -------
    The result of ``cosine_similarity`` on the TF-IDF matrix of the overviews
    (one row and one column per row of ``dataframe``).
    """
    # Fill missing text on a local Series so the caller's column is left untouched.
    overviews = dataframe['overview'].fillna('')
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(overviews)
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
df = pd.read_csv("movies_metadata.csv", low_memory=False)

cosine_sim = calculate_cosine_sim(df)


In [None]:
content_based_recommender("The Godfather", cosine_sim, df)

# II CASE Item-Based Collaborative Filtering (Item-Item Filtering)

# Objective : User - Product Matrix
## Sparseness is a serious problem here

In [None]:
#######################################
# Item-Based Collaborative Filtering (Item-Item Filtering)
#######################################

# Step 1: Preparing the Data Set
# Step 2: Creating the User Movie Df
# Step 3: Making Item-Based Film Suggestions Based on Correlation
# Step 4: Functionalization of Transactions

###################################
# Step 1: Preparing the Data Set
###################################

In [None]:
# Build a recommendation system from movies that receive similar ratings.

import pandas as pd
pd.set_option('display.max_columns', 20)

movie = pd.read_csv('movie.csv')


In [None]:
movie.head()

In [None]:
rating = pd.read_csv('rating.csv')

In [None]:
rating.head()

In [None]:
df = movie.merge(rating, how="left", on="movieId")

In [None]:
df.head()

In [None]:

#################
# title
#################

# Pull the "(YYYY)" suffix out of the title. Raw string: "\(" and "\d" are invalid
# escape sequences in a normal string literal.
df['year_movie'] = df.title.str.extract(r'(\(\d\d\d\d\))', expand=False)


In [None]:
df.head()

In [None]:
# Keep only the four digits of the extracted "(YYYY)". Raw string avoids the invalid "\d" escape.
df['year_movie'] = df.year_movie.str.extract(r'(\d\d\d\d)', expand=False)

In [None]:
df.head()

In [None]:
# Remove the "(YYYY)" suffix from the title. regex=True is required: on pandas >= 2.0
# str.replace treats the pattern as a literal string by default and would remove nothing.
df['title'] = df.title.str.replace(r'(\(\d\d\d\d\))', '', regex=True)

In [None]:
df.head()

In [None]:
# Trim the whitespace left behind by removing the year; .str.strip() is vectorised
# and passes missing values through instead of raising.
df['title'] = df['title'].str.strip()

In [None]:
df.head()

In [None]:
df.shape

In [None]:
#################
# genres
#################

# Keep only the first listed genre of each movie, then discard the raw pipe-separated column.
df["genre"] = df["genres"].str.split("|").str[0]
df = df.drop(columns="genres")
df.head()


In [None]:
#################
# timestamp
#################

df.info()

# NOTE(review): this format assumes date-only "YYYY-MM-DD" strings in rating.csv — TODO confirm.
# If the column also carries a time of day, or holds epoch seconds, adjust `format`/`unit`.
df["timestamp"] = pd.to_datetime(df["timestamp"], format='%Y-%m-%d')
df.info()


In [None]:

df["year"] = df["timestamp"].dt.year
df["month"] = df["timestamp"].dt.month
df["day"] = df["timestamp"].dt.day
df.head()


In [None]:

######################################
# Step 2: Creating User Movie Df
######################################

df.shape
df["title"].nunique()
a = pd.DataFrame(df["title"].value_counts())
a.head()



In [None]:
a.count()

In [None]:
# Titles with 1000 ratings or fewer are treated as rare and excluded.
# The counts column is selected by position because its name depends on the pandas
# version ("count" on pandas >= 2.0, "title" before), so a["title"] can raise KeyError.
rare_movies = a[a.iloc[:, 0] <= 1000].index
common_movies = df[~df["title"].isin(rare_movies)]
common_movies.shape
common_movies["title"].nunique()
common_movies.head()

In [None]:
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

In [None]:
user_movie_df.shape


In [None]:
user_movie_df.head(10)



In [None]:
user_movie_df.columns

In [None]:
len(user_movie_df.columns)

In [None]:
common_movies["title"].nunique()

In [None]:

######################################
# Step 3: Making Item-Based Movie Suggestions Based on Correlation
######################################

# Ratings column of one reference title. Note: this rebinds `movie`, which earlier
# held the DataFrame read from movie.csv.
movie = user_movie_df["Matrix, The"]

# Correlate every title's ratings with the reference column and show the ten highest.
user_movie_df.corrwith(movie).sort_values(ascending=False).head(10)


In [None]:
######################################
# Functionalization
######################################

def create_user_movie_df(movie_path='movie.csv', rating_path='rating.csv', rare_threshold=1000):
    """Build the user x movie rating matrix from the movie and rating CSV files.

    Parameters
    ----------
    movie_path : str, default 'movie.csv'
        CSV file with at least the columns ``movieId`` and ``title``.
    rating_path : str, default 'rating.csv'
        CSV file with at least the columns ``userId``, ``movieId`` and ``rating``.
    rare_threshold : int, default 1000
        Titles that occur this many times or fewer in the merged data are dropped.

    Returns
    -------
    pandas.DataFrame
        Pivot table with one row per ``userId``, one column per remaining title
        and the rating as value (NaN where a user did not rate a title).
    """
    import pandas as pd

    movie = pd.read_csv(movie_path)
    rating = pd.read_csv(rating_path)
    df = movie.merge(rating, how="left", on="movieId")

    # Remove the "(YYYY)" suffix. regex=True is required because pandas >= 2.0 treats
    # the pattern literally by default; the raw string avoids invalid escape sequences.
    df['title'] = df['title'].str.replace(r'\(\d\d\d\d\)', '', regex=True).str.strip()

    # Work on the counts Series directly: the column name produced by wrapping
    # value_counts() in a DataFrame differs between pandas versions.
    title_counts = df["title"].value_counts()
    rare_movies = title_counts[title_counts <= rare_threshold].index
    common_movies = df[~df["title"].isin(rare_movies)]

    return common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
