In [4]:
# Import Modules
# pip install pandas
import pandas as pd 
import numpy as np 
# pip install scikit-learn
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Import the Dataset
#Load the rating data into a DataFrame:
column_names = ['User_ID', 'User_Names','Movie_ID','Rating','Timestamp']
movies_df = pd.read_csv("https://raw.githubusercontent.com/naru94/Collaborative-Filtering-Recommendation-System/main/dataset/Movie_data.csv", sep = ',', names = column_names)
print(movies_df.head())

   User_ID    User_Names  Movie_ID  Rating  Timestamp
0        0  Shawn Wilson        50       5  881250949
1        0  Shawn Wilson       172       5  881250949
2        0  Shawn Wilson       133       1  881250949
3      196  Bessie White       242       3  881250949
4      196  Bessie White       393       4  881251863


In [11]:
#Load the move information in a DataFrame:
column_names = {'item_id':'Movie_ID', 'title':'Movie_Title'}
movies_title_df = pd.read_csv("https://raw.githubusercontent.com/naru94/Collaborative-Filtering-Recommendation-System/main/dataset/Movie_Id_Titles.csv")
movies_title_df.rename(columns = {'item_id':'Movie_ID', 'title':'Movie_Title'}, inplace = True)
print(movies_title_df.head())

   Movie_ID        Movie_Title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [12]:
#Merge the DataFrames:
movies_df = pd.merge(movies_df,movies_title_df, on='Movie_ID')

#View the DataFrame:
print(movies_df.head())

   User_ID        User_Names  Movie_ID  Rating  Timestamp       Movie_Title
0        0      Shawn Wilson        50       5  881250949  Star Wars (1977)
1       22     Robert Poulin        50       5  878887765  Star Wars (1977)
2      244      Laura Krulik        50       5  880604379  Star Wars (1977)
3      298      Loren Aucoin        50       5  884125578  Star Wars (1977)
4      115  Dominick Jenkins        50       5  881172049  Star Wars (1977)


In [13]:
# Explore the dataset
print(f"\n Size of the movie_df dataset is {movies_df.shape}")


 Size of the movie_df dataset is (100003, 6)


In [14]:
movies_df.describe()

Unnamed: 0,User_ID,Movie_ID,Rating,Timestamp
count,100003.0,100003.0,100003.0,100003.0
mean,462.470876,425.520914,3.529864,883528800.0
std,266.622454,330.797791,1.125704,5343791.0
min,0.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [16]:
movies_df.groupby('User_ID')['Rating'].count().sort_values(ascending = True)

User_ID
0        3
166     20
418     20
34      20
441     20
      ... 
276    518
450    540
13     636
655    685
405    737
Name: Rating, Length: 944, dtype: int64

In [17]:
n_users = movies_df.User_ID.unique().shape[0]
n_movies = movies_df.Movie_ID.unique().shape[0]
print( str(n_users) + ' users')
print( str(n_movies) + ' movies')

944 users
1682 movies


In [18]:
#This would be a 2D array matrix to display user-movie_rating relationship
#Rows represent users by IDs, columns represent movies by IDs
ratings = np.zeros((n_users, n_movies))
for row in movies_df.itertuples():
    ratings[row[1], row[3]-1] = row[4]

# View the matrix
print(ratings)

[[0. 0. 0. ... 0. 0. 0.]
 [5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]


In [19]:
ratings.shape

(944, 1682)

In [20]:
# Explore the Interaction Matrix
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print(sparsity)

6.298179628771237


In [28]:
# Create a Similarity Matrix
rating_cosine_similarity = cosine_similarity(ratings)
rating_cosine_similarity

array([[1.        , 0.11988816, 0.11554032, ..., 0.        , 0.18180857,
        0.11890394],
       [0.11988816, 1.        , 0.16693098, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.11554032, 0.16693098, 1.        , ..., 0.16148478, 0.17226781,
        0.10579788],
       ...,
       [0.        , 0.14861694, 0.16148478, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.18180857, 0.17950788, 0.17226781, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.11890394, 0.39817474, 0.10579788, ..., 0.09511958, 0.18246466,
        1.        ]])

In [25]:
# Provide Recommendations
def movie_recommender(user_item_m, X_user, user, k=10, top_n=10):
    # Get the location of the actual user in the User-Items matrix
    # Use it to index the User similarity matrix
    user_similarities = X_user[user]
    # obtain the indices of the top k most similar users
    most_similar_users = user_item_m.index[user_similarities.argpartition(-k)[-k:]]
    # Obtain the mean ratings of those users for all movies
    rec_movies = user_item_m.loc[most_similar_users].mean(0).sort_values(ascending=False)
    # Discard already seen movies
    m_seen_movies = user_item_m.loc[user].gt(0)
    seen_movies = m_seen_movies.index[m_seen_movies].tolist()
    rec_movies = rec_movies.drop(seen_movies).head(top_n)
    # return recommendations - top similar users rated movies
    rec_movies_a=rec_movies.index.to_frame().reset_index(drop=True)
    rec_movies_a.rename(columns={rec_movies_a.columns[0]: 'Movie_ID'}, inplace=True)
    return rec_movies_a

In [26]:
# View the Provided Recommendations
#Converting the 2D array into a DataFrame as expected by the movie_recommender function
ratings_df=pd.DataFrame(ratings)

In [27]:
user_ID=12
movie_recommender(ratings_df, rating_cosine_similarity, user_ID)

Unnamed: 0,Movie_Title
0,180
1,209
2,495
3,422
4,172
5,384
6,78
7,567
8,565
9,21


In [None]:
def movie_recommender_run(user_Name):
    #Get ID from Name
    user_ID=movies_df.loc[movies_df['User_Names'] == user_Name].User_ID.values[0]
    #Call the function
    temp=movie_recommender(ratings_df, rating_cosine_similarity, user_ID)
    # Join with the movie_title_df to get the movie titles
    top_k_rec=temp.merge(movies_title_df, how='inner')
    return top_k_rec