In [1]:
import pandas as pd
import numpy as np
import warnings
import os


In [2]:
warnings.filterwarnings("ignore")

In [3]:

movies = pd.read_csv("movies.csv", header=0)
ratings = pd.read_csv("ratings_small.csv", header=0)

movies = movies.fillna("None")

In [4]:
movies.drop_duplicates(inplace=True)
movies.drop_duplicates(['movieId'], inplace=True)
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# extract movies' content
movie_profile = movies[['movieId', 'title', 'genres']]
movie_profile.rename(columns={'id': 'movieId'}, inplace=True)
movie_profile

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
# one-hot encoding the genres
all_genres = [s.split(", ") for s in movies[movies.genres.notnull()].genres]
genres = [item.strip() for l in all_genres for item in l]
unique_genres = set(genres)
for genre in unique_genres:
    movie_profile[genre] = 0

movie_profile

Unnamed: 0,movieId,title,genres,Adventure|Drama|Romance|War,Adventure|Animation|Drama|Horror,Children|Comedy|Drama|Mystery,Drama|Horror|Mystery,Children|Drama,Crime|Drama|Fantasy,Animation|Children|Musical|IMAX,...,Comedy|Fantasy,Comedy|Romance|Sci-Fi,Children,Children|Drama|Musical,Comedy|Fantasy|Thriller,Adventure|Comedy|Drama|Fantasy|Romance,Crime|Thriller|Western,Drama|Sci-Fi|Thriller,Action|Adventure|Comedy|Romance|Thriller,Action|Adventure|Animation|Comedy
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9739,193585,Flint (2017),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:

for i in range(len(movie_profile)):
    if type(movie_profile['genres'].iloc[i]) != None.__class__:
        Genres = movie_profile.iloc[i].genres.split(', ')
        for g in Genres:
            movie_profile[g].iloc[i] = 1


In [8]:

movie_profile = movie_profile.drop(columns=['title', 'genres']).set_index('movieId')
# movie_profile.sort_index(axis=0, inplace=True)


In [9]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [10]:
user_x_movie = pd.pivot_table(ratings, values='rating', index=['movieId'], columns=['userId'])
user_x_movie.sort_index(axis=0, inplace=True)

In [11]:
userIDs = user_x_movie.columns
user_profile = pd.DataFrame(columns=movie_profile.columns)
user_profile.to_csv("user_profile.csv")

In [12]:
for i in (range(len(user_x_movie.columns))):
    working_df = movie_profile.mul(user_x_movie.iloc[:, i], axis=0)
    working_df.replace(0, np.NaN, inplace=True)
    user_profile.loc[userIDs[i]] = working_df.mean(axis=0)

# apply TFIDF for similarity comparison
df = movie_profile.sum()

In [13]:
idf = (len(movies) / df).apply(np.log)  # log inverse of DF
idf# transformed the data 

Adventure|Drama|Romance|War                 9.184202
Adventure|Animation|Drama|Horror            9.184202
Children|Comedy|Drama|Mystery               9.184202
Drama|Horror|Mystery                        7.797907
Children|Drama                              6.293830
                                              ...   
Adventure|Comedy|Drama|Fantasy|Romance      9.184202
Crime|Thriller|Western                      9.184202
Drama|Sci-Fi|Thriller                       6.619252
Action|Adventure|Comedy|Romance|Thriller    8.491055
Action|Adventure|Animation|Comedy           8.491055
Length: 951, dtype: float64

In [14]:
TFIDF = movie_profile.mul(idf.values)# transformed idf
TFIDF

Unnamed: 0_level_0,Adventure|Drama|Romance|War,Adventure|Animation|Drama|Horror,Children|Comedy|Drama|Mystery,Drama|Horror|Mystery,Children|Drama,Crime|Drama|Fantasy,Animation|Children|Musical|IMAX,Comedy|Crime|Horror,Drama|Horror|Romance,Romance,...,Comedy|Fantasy,Comedy|Romance|Sci-Fi,Children,Children|Drama|Musical,Comedy|Fantasy|Thriller,Adventure|Comedy|Drama|Fantasy|Romance,Crime|Thriller|Western,Drama|Sci-Fi|Thriller,Action|Adventure|Comedy|Romance|Thriller,Action|Adventure|Animation|Comedy
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df_predict = pd.DataFrame()

for i in (range(len(user_x_movie.columns))):
    working_df = transformed_inverse_df.mul(user_profile.iloc[i], axis=1)
    df_predict[user_x_movie.columns[i]] = working_df.sum(axis=1)
df_predict.to_csv("TFIDF.csv")
df_predict

NameError: name 'transformed_inverse_df' is not defined

In [None]:
movies = pd.read_csv('movies.csv')
movies.drop_duplicates(inplace=True)
df_predict = pd.read_csv('Transformed_inverse.csv')
ratings = pd.read_csv('ratings_small.csv')
user_profile = pd.read_csv('user_profile.csv')


In [None]:

def recommender(user_no):
    # user predicted rating to all films
    user_predicted_rating = df_predict[['movieId', df_predict.columns[user_no]]]
    # combine film rating and film detail
    user_rating_film = pd.merge(user_predicted_rating, movies, left_on='movieId', right_on='movieId')
    # films already watched by user
    already_watched = ratings[ratings['userId'].isin([user_no])]['movieId']
    # recommendation without films being watched by user
    all_rec = user_rating_film[~user_rating_film.index.isin(already_watched)]
    return all_rec.sort_values(by=str(user_no), ascending=False, axis=0).iloc[0:10][['movieId', 'title']].set_index("movieId")


In [None]:
recommender(23)
# these predictions are dpne on the basis of the previous history of the user
# if he has 