In [1]:
# Gerekli kütüphanelerin çağrılması
import pandas as pd
import numpy as np
import datetime as dt

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
%matplotlib inline
import seaborn as sns

import random

In [2]:
# Verinin okunması, kopyasının yaratılması
df1 = pd.read_csv("movies.csv")
df2 = pd.read_csv("ratings.csv")

movies = df1.copy()
ratings = df2.copy()

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
# Sadece 1000 veya daha fazla izlenen filmlerin belirlenmesi
valid_movies = ratings[ratings.groupby('movieId')['movieId'].transform('size') >= 1000]

# Sadece 1000 veya daha fazla izlenen filmlerin filtrelenmesi
movies = movies[movies["movieId"].isin(valid_movies["movieId"])]
ratings = ratings[ratings["movieId"].isin(valid_movies["movieId"])]

In [6]:
# 1000'den fazla izlenen/değerlendirilen film sayısı
len(movies)

3159

In [7]:
# Bir kullanıcı ortalama kaç filmi izlemiş/değerlendirmiş ?
avg = int(ratings["userId"].value_counts().mean())
avg

128

In [8]:
# değerlendirmelerin(ratings) üzerine filmlerin(movies) movieId değeri üzerinden left joinlenmesi
user_movie_df = ratings.merge(movies, left_on = "movieId", right_on = "movieId", how = "left")

# userId'leri index'e film isimlerini ise sütunlara atayarak pivot table oluşturulması
user_movie_df = user_movie_df.pivot_table(index = ['userId'], columns = ['title'], values = ['rating'])

user_movie_df.columns = user_movie_df.columns.droplevel(0)

user_movie_df.head(15)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,2.0
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


## User Based Recommendation

In [9]:
random_user_id = 138489

# Seçilen userId üzerinden user'a ait df parçasının user_df olarak belirlenmesi

user_df = user_movie_df[user_movie_df.index == random_user_id]

user_df

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
138489,,,,,,,,,,4.5,...,,,,,,,,,,


In [10]:
# Seçilen kullanıcının izlediği film sayısının gösterilmesi
one_user_watched_movies = user_df.columns[user_df.notna().any()].tolist()

len(one_user_watched_movies)

38

In [11]:
# Oluşturulan user_movie_df tablosu üzerinden sadece seçilen kullanıcının izlediği filmleri
# ifade eden sütunların bırakılması
watched_movies_df = user_movie_df[one_user_watched_movies]

watched_movies_df.head()

title,12 Angry Men (1957),"Addams Family, The (1991)","Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),Analyze This (1999),Batman & Robin (1997),Billy Madison (1995),Blazing Saddles (1974),Casablanca (1942),Chinatown (1974),...,Seven Samurai (Shichinin no samurai) (1954),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Sleepers (1996),"Third Man, The (1949)",To Kill a Mockingbird (1962),"Usual Suspects, The (1995)",Wag the Dog (1997),Wallace & Gromit: A Close Shave (1995),Wallace & Gromit: The Wrong Trousers (1993)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,4.0,3.5,,,,3.5,,,
2,,,,3.0,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,5.0,5.0,,,,5.0,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,5.0,3.0,,,,,,,


In [12]:
# Seçilen kullanıcının izlediği filmleri diğer kullanıcıların ne kadar izlediği
# Kendisi de tamamını izledi :)

users_movie_count = watched_movies_df.T.notnull().sum()

users_movie_count = users_movie_count.sort_values(ascending = False)

users_movie_count = users_movie_count.reset_index()
users_movie_count.columns = ["userId", "MovieCount"]

users_movie_count.head(10)

Unnamed: 0,userId,MovieCount
0,88820,38
1,88604,38
2,138489,38
3,118205,38
4,8405,38
5,71975,38
6,80771,37
7,76437,37
8,83090,37
9,79159,37


In [13]:
# Bize verilen göreve göre seçili kullanıcıya öneri yapılabilmesi için baz alınacak
# diğer kullanıcıların seçili kullanıcının değerlendirdiği filmlerin en az %60'ını değerlendirmiş olması gerekiyor.

limit = len(one_user_watched_movies) * 0.60

correlated_users = users_movie_count[users_movie_count["MovieCount"] >= limit]
correlated_users

Unnamed: 0,userId,MovieCount
0,88820,38
1,88604,38
2,138489,38
3,118205,38
4,8405,38
...,...,...
1961,92770,23
1962,56753,23
1963,92493,23
1964,82578,23


In [14]:
# watched_movies_df üzerinde sadece seçili kullanıcının ve 
# onunla minimum %60 aynı filmleri izlemiş olan kullanıcıların bırakılması
main_df = watched_movies_df[watched_movies_df.index.isin(correlated_users["userId"])]

main_df.head()

title,12 Angry Men (1957),"Addams Family, The (1991)","Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),Analyze This (1999),Batman & Robin (1997),Billy Madison (1995),Blazing Saddles (1974),Casablanca (1942),Chinatown (1974),...,Seven Samurai (Shichinin no samurai) (1954),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Sleepers (1996),"Third Man, The (1949)",To Kill a Mockingbird (1962),"Usual Suspects, The (1995)",Wag the Dog (1997),Wallace & Gromit: A Close Shave (1995),Wallace & Gromit: The Wrong Trousers (1993)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
104,4.0,,,2.0,3.0,,,,4.0,4.0,...,5.0,3.0,3.0,,4.0,2.0,3.0,3.0,5.0,5.0
156,,4.0,,5.0,4.0,,2.0,,5.0,,...,,5.0,5.0,4.0,,5.0,5.0,4.0,,
208,,,4.0,3.5,,2.0,,4.5,5.0,4.5,...,5.0,4.5,4.5,,4.0,4.0,4.5,3.5,4.0,4.0
359,,,5.0,4.0,3.5,,,,5.0,4.5,...,4.0,5.0,5.0,4.0,,4.0,4.5,,4.0,4.0
394,4.5,4.0,4.0,5.0,3.0,,,4.0,4.0,5.0,...,5.0,3.0,5.0,,5.0,5.0,5.0,4.0,4.0,5.0


In [15]:
# Kullanıcılar arasındaki korelasyonların hesaplanması
corr_df = main_df.T.corr().unstack().sort_values().drop_duplicates()

corr_df = pd.DataFrame(corr_df, columns = ["Correlation"])

corr_df.index.names = ['userId1', 'userId2']

corr_df = corr_df.sort_values(by = "Correlation", ascending = False)
corr_df = corr_df.reset_index()

corr_df.head()

Unnamed: 0,userId1,userId2,Correlation
0,66827,69969,1.0
1,138325,138325,1.0
2,86749,65852,1.0
3,34632,38994,0.999084
4,34632,73348,0.998928


In [16]:
# Seçilen kullanıcılarının korelasyon değerlerinin %65'den fazla olması gerektiği söylenmişti.
selected_users = corr_df[(((corr_df["userId1"] == random_user_id) & (corr_df["Correlation"] >= 0.65))
                         | ((corr_df["userId2"] == random_user_id) & (corr_df["Correlation"] >= 0.65)))
                         & (corr_df["userId1"] != corr_df["userId2"])]

selected_users.head()

Unnamed: 0,userId1,userId2,Correlation
1805,138489,31906,0.90218
11642,138489,80141,0.852116
13107,138489,37999,0.848063
16101,138489,15616,0.840156
16781,71245,138489,0.838555


In [17]:
# Merge yapılabilmesi için sütun ismi değişikliği
selected_users.rename(columns = {"userId1" : "userId"}, inplace = True)

selected_users_ratings = selected_users.merge(ratings[["userId", "movieId", "rating"]], how = 'inner')

selected_users_ratings.head()

Unnamed: 0,userId,userId2,Correlation,movieId,rating
0,138489,31906,0.90218,29,4.0
1,138489,31906,0.90218,50,4.5
2,138489,31906,0.90218,216,2.0
3,138489,31906,0.90218,236,2.5
4,138489,31906,0.90218,318,5.0


In [18]:
# weightedRating = rating * Correlation
selected_users_ratings['weightedRating'] = selected_users_ratings["rating"] * selected_users_ratings["Correlation"]
# recommendation yapılacak dataframe'in oluşturulması
df_rec = selected_users_ratings.groupby('movieId', as_index = False).agg({"weightedRating" : "mean"})

# Seçilen kullanıcının izlemediği/değerlendirmediği filmler arasından en yüksek skoru vereceği
# düşünülen filmler başta olacak şekilde filmlerin sortlanması
df_rec = df_rec.sort_values("weightedRating", ascending = False)

df_rec["weightedRating"] = round(df_rec["weightedRating"], 1)

#Önerilen Filmler
recommended_movies_user_based = df_rec.head().reset_index(drop = True)

recommended_movies_user_based = recommended_movies_user_based.merge(movies, left_on = "movieId", 
                                                                    right_on = "movieId", how = "left")

recommended_movies_user_based

Unnamed: 0,movieId,weightedRating,title,genres
0,858,3.4,"Godfather, The (1972)",Crime|Drama
1,2019,3.4,Seven Samurai (Shichinin no samurai) (1954),Action|Adventure|Drama
2,912,3.4,Casablanca (1942),Drama|Romance
3,318,3.4,"Shawshank Redemption, The (1994)",Crime|Drama
4,1221,3.2,"Godfather: Part II, The (1974)",Crime|Drama


## Item Based Recommendation

In [19]:
# Seçili kullanıcının izlediği ve 5 puan verdiği filmlerin filtrelenmesi
top_rated_movies = ratings[(ratings["userId"] == random_user_id) & (ratings["rating"] == 5.0)]

# timestamp sütununun veri tipinin datetime'a çevrilmesi
top_rated_movies['timestamp'] = top_rated_movies['timestamp'].astype('datetime64[ns]')

# Bu filmlerin en güncel izlenmeye göre sortlanması
top_rated_movies = top_rated_movies.sort_values(by = "timestamp", ascending = False)

top_rated_movies = top_rated_movies.merge(movies, left_on = "movieId", right_on = "movieId", how = "left")

top_rated_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,138489,912,5.0,2012-11-15 14:21:30,Casablanca (1942),Drama|Romance
1,138489,2019,5.0,2012-11-15 14:21:28,Seven Samurai (Shichinin no samurai) (1954),Action|Adventure|Drama
2,138489,858,5.0,2012-11-15 14:21:18,"Godfather, The (1972)",Crime|Drama
3,138489,318,5.0,2012-11-15 14:21:15,"Shawshank Redemption, The (1994)",Crime|Drama


In [20]:
# Kullanıcının tam puan verdiği filmler arasındaki en son izlediğinin id'si
top_rated_movie_id = top_rated_movies["movieId"][0:1].values[0]

top_rated_movie_id

912

In [22]:
# Kullanıcının tam puan verdiği filmler arasındaki en son izlediğinin ismi
top_rated_movie_name = top_rated_movies[top_rated_movies["movieId"] == top_rated_movie_id]["title"].values[0]

top_rated_movie_name

'Casablanca (1942)'

In [26]:
# Film üzerinden öneri yapılmasını sağlayan fonksiyon
def item_based_recommender(movie, user_movie_df):
    movie = user_movie_df[movie]
    
    return user_movie_df.corrwith(movie).sort_values(ascending = False).head(10)


recommended_movies_item_based = item_based_recommender(top_rated_movie_name, user_movie_df)

recommended_movies_item_based=recommended_movies_item_based[1:6]

recommended_movies_item_based = pd.DataFrame(recommended_movies_item_based)

recommended_movies_item_based = recommended_movies_item_based.reset_index()

recommended_movies_item_based.rename(columns = {0 : "Expected_weightedRating"}, inplace = True)

#0-1 arası korelasyonu 0-5 arası rating ölçeğine çekmek için
recommended_movies_item_based["Expected_weightedRating"]=recommended_movies_item_based["Expected_weightedRating"]*5

recommended_movies_item_based["Expected_weightedRating"] = round(recommended_movies_item_based["Expected_weightedRating"], 1)

recommended_movies_item_based = recommended_movies_item_based.merge(movies, left_on = "title", 
                                                                    right_on = "title", how = "left")

#Item-based önerilen 5 film
recommended_movies_item_based

Unnamed: 0,title,Expected_weightedRating,movieId,genres
0,"Maltese Falcon, The (1941)",2.4,913,Film-Noir|Mystery
1,"Maltese Falcon, The (a.k.a. Dangerous Female) ...",2.4,8228,Mystery
2,Ikiru (1952),2.2,6669,Drama
3,City Lights (1931),2.1,3307,Comedy|Drama|Romance
4,Double Indemnity (1944),2.1,3435,Crime|Drama|Film-Noir
