## A7 - RECOMMENDER SYSTEMS
---

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

---  
#### Setting the necessary dataframe variables

In [26]:
df_movies = pd.read_csv(r'./ContentBasedRecommenderSystem/movies.csv')
df_ratings = pd.read_csv(r'./ContentBasedRecommenderSystem/ratings.csv')

In [27]:
# CUTTING DATAFRAME @ USER 200
df = df_ratings.loc[df_ratings['userId']<=200]
df = df.drop(['timestamp'], axis=1)

# PIVOTING ELEMENTS BY INDECES OF USERS AND MOVIES
df_new = df.groupby(['userId', 'movieId']).sum().reset_index().pivot(index='movieId', columns='userId').fillna(0)

# TAKING ONLY THE 1ST 200 ROWS AND COLUMNS
df_new = df_new.iloc[0:200 , :]
df_new

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
257,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
258,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---  
#### Similarity matrix  
Using cosine similarity function

In [28]:
similarity = pd.DataFrame(cosine_similarity(df_new), index = df_new.index , columns = df_new.columns)
similarity

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,1.000000,0.364634,0.284825,0.000000,0.279775,0.327928,0.270701,0.128765,0.195017,0.352899,...,0.078029,0.207645,0.338312,0.000000,0.054509,0.193446,0.029922,0.144217,0.121886,0.533147
2,0.364634,1.000000,0.278446,0.102937,0.277072,0.278936,0.240488,0.130496,0.000000,0.342590,...,0.257005,0.218991,0.368741,0.205874,0.000000,0.150889,0.171278,0.391489,0.000000,0.374672
3,0.284825,0.278446,1.000000,0.150278,0.391915,0.341700,0.592157,0.406427,0.288534,0.231410,...,0.346341,0.317709,0.389201,0.300557,0.000000,0.278921,0.242671,0.243856,0.240445,0.211501
4,0.000000,0.102937,0.150278,1.000000,0.239259,0.176604,0.297614,0.253546,0.000000,0.129040,...,0.384111,0.132964,0.066749,0.500000,0.000000,0.126547,0.272772,0.253546,0.000000,0.051085
5,0.279775,0.277072,0.391915,0.239259,1.000000,0.256166,0.539392,0.404422,0.000000,0.275293,...,0.229755,0.234621,0.226247,0.299074,0.107000,0.262406,0.241474,0.151658,0.239259,0.157366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,0.193446,0.150889,0.278921,0.126547,0.262406,0.134093,0.281211,0.484849,0.000000,0.300284,...,0.194433,0.496376,0.374480,0.253095,0.301833,1.000000,0.322174,0.171123,0.539935,0.100203
257,0.029922,0.171278,0.242671,0.272772,0.241474,0.226412,0.266273,0.276642,0.000000,0.272201,...,0.419099,0.413467,0.242764,0.545545,0.000000,0.322174,1.000000,0.276642,0.327327,0.075247
258,0.144217,0.391489,0.243856,0.253546,0.151658,0.104481,0.181101,0.257143,0.000000,0.232658,...,0.389559,0.235989,0.157957,0.507093,0.000000,0.171123,0.276642,1.000000,0.000000,0.159747
259,0.121886,0.000000,0.240445,0.000000,0.239259,0.000000,0.238091,0.676123,0.000000,0.240874,...,0.000000,0.199447,0.249197,0.000000,0.000000,0.539935,0.327327,0.000000,1.000000,0.000000


---  
#### Finding top 10 movies

In [29]:
def movieMatching(matchID, number):
    # GETTING NAME AND ID OF MOVIE TO MATCH
    matchMovie = df_movies.loc[df_movies['movieId'] == matchID].title
    matchMovie = matchMovie.to_string(index=False)
    print('Top ', number, ' similar movies to : ', matchMovie, ', with ID : ', matchID)
    
    # LOCATING IDs OF MATCHING MOVIES
    movieList = np.array(similarity.loc[[matchID]])
    movieList = pd.DataFrame(movieList, index=[matchID], columns=similarity.index)
    movieList_sorted = movieList.sort_values(by=matchID, axis=1, ascending=False)
    # movieIDs = movieList_sorted.iloc[0,1:11].index.tolist()
    movieIDs = movieList_sorted.iloc[0,1:(number+1)].index.tolist()
    
    # GETTING NAMES OF SELECTED IDs
    movieNames = np.array([])
    for id in movieIDs:
        getName = df_movies.loc[df_movies['movieId'] == id].title
        np.append(movieNames, getName)
        # PRINT [ 'ID' : 'MOVIE NAME' ]
        print(id, ' : ', getName.to_string(index=False))

In [30]:
movieMatching(1, 10)

Top  10  similar movies to :  Toy Story (1995) , with ID :  1
260  :  Star Wars: Episode IV - A New Hope (1977)
110  :  Braveheart (1995)
150  :  Apollo 13 (1995)
50  :  Usual Suspects, The (1995)
104  :  Happy Gilmore (1996)
165  :  Die Hard: With a Vengeance (1995)
47  :  Seven (a.k.a. Se7en) (1995)
34  :  Babe (1995)
208  :  Waterworld (1995)
153  :  Batman Forever (1995)


In [31]:
movieMatching(4, 10)

Top  10  similar movies to :  Waiting to Exhale (1995) , with ID :  4
113  :  Before and After (1996)
181  :  Mighty Morphin Power Rangers: The Movie (1995)
189  :  Reckless (1995)
179  :  Mad Love (1995)
201  :  Three Wishes (1995)
27  :  Now and Then (1995)
55  :  Georgia (1995)
254  :  Jefferson in Paris (1995)
214  :  Before the Rain (Pred dozhdot) (1994)
219  :  Cure, The (1995)


---  
#### BONUS : Recommend 3 movies

In [32]:
userID = 200
# LOCATING RATINGS OF SPECIFIED USER
user = df.loc[df['userId']==userID]
user = user.drop(columns=['userId'])

# LOCATING MOVIES WITH THE HIGHEST RATINGS @ rating = 5
rating5 = user.loc[user['rating']==5]
movieIDs = np.array(rating5['movieId'])
id = int(rating5.iloc[[0]].movieId)
movieMatching(id, 3)

Top  3  similar movies to :  Clueless (1995) , with ID :  39
11  :  American President, The (1995)
231  :  Dumb & Dumber (Dumb and Dumber) (1994)
185  :  Net, The (1995)
