In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import operator

In [2]:
output = pd.read_csv('output.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
output.head()

Unnamed: 0,movieId,title,imdbId,tmdbId,genres,tag,actors,director,document
0,2,Jumanji (1995),113497,8844,Adventure|Children|Fantasy,Robin Williams|time travel|fantasy|based on ch...,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Joe Johnston,Jumanji (1995)|Adventure|Children|Fantasy|Robi...
1,4,Waiting to Exhale (1995),114885,31357,Comedy|Drama|Romance,based on novel or book|chick flick|divorce|int...,Whitney Houston|Angela Bassett|Loretta Devine|...,Forest Whitaker,Waiting to Exhale (1995)|Comedy|Drama|Romance|...
2,5,Father of the Bride Part II (1995),113041,11862,Comedy,aging|baby|confidence|contraception|daughter|g...,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Charles Shyer,Father of the Bride Part II (1995)|Comedy|agin...
3,6,Heat (1995),113277,949,Action|Crime|Thriller,imdb top 250|great acting|realistic action|sus...,Al Pacino|Robert De Niro|Val Kilmer|Jon Voight...,Michael Mann,Heat (1995)|Action|Crime|Thriller|imdb top 250...
4,7,Sabrina (1995),114319,11860,Comedy|Romance,remake|chauffeur|fusion|long island|millionair...,Harrison Ford|Julia Ormond|Greg Kinnear|Angie ...,Sydney Pollack,Sabrina (1995)|Comedy|Romance|remake|chauffeur...


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,306,3.5
1,1,307,5.0
2,1,899,3.5
3,1,1088,4.0
4,1,1175,3.5


In [5]:
# null = 0
ratings.rating.replace({0 : np.nan}, regex = True, inplace = True)

In [6]:
# rating ratio
ratings.groupby("rating").count().iloc[:, :1] / ratings.count().userId * 100

Unnamed: 0_level_0,userId
rating,Unnamed: 1_level_1
0.5,1.666193
1.0,3.352163
1.5,1.7054
2.0,7.02086
2.5,5.315897
3.0,20.376186
3.5,12.879272
4.0,26.196847
4.5,8.258469
5.0,13.228713


In [7]:
# output + ratings
output_ratings = ratings.merge(output, left_on = 'movieId', right_on = 'movieId')

In [8]:
output_ratings.head()

Unnamed: 0,userId,movieId,rating,title,imdbId,tmdbId,genres,tag,actors,director,document
0,1,306,3.5,Three Colors: Red (Trois couleurs: Rouge) (1994),111495,110,Drama,atmospheric|enigmatic|gentle|lyrical|meditativ...,Jean-Louis Trintignant|Jean-Pierre Lorit|Samue...,,Three Colors: Red (Trois couleurs: Rouge) (199...
1,7,306,5.0,Three Colors: Red (Trois couleurs: Rouge) (1994),111495,110,Drama,atmospheric|enigmatic|gentle|lyrical|meditativ...,Jean-Louis Trintignant|Jean-Pierre Lorit|Samue...,,Three Colors: Red (Trois couleurs: Rouge) (199...
2,25,306,4.0,Three Colors: Red (Trois couleurs: Rouge) (1994),111495,110,Drama,atmospheric|enigmatic|gentle|lyrical|meditativ...,Jean-Louis Trintignant|Jean-Pierre Lorit|Samue...,,Three Colors: Red (Trois couleurs: Rouge) (199...
3,47,306,4.0,Three Colors: Red (Trois couleurs: Rouge) (1994),111495,110,Drama,atmospheric|enigmatic|gentle|lyrical|meditativ...,Jean-Louis Trintignant|Jean-Pierre Lorit|Samue...,,Three Colors: Red (Trois couleurs: Rouge) (199...
4,79,306,5.0,Three Colors: Red (Trois couleurs: Rouge) (1994),111495,110,Drama,atmospheric|enigmatic|gentle|lyrical|meditativ...,Jean-Louis Trintignant|Jean-Pierre Lorit|Samue...,,Three Colors: Red (Trois couleurs: Rouge) (199...


In [9]:
match_df = output_ratings[['movieId', 'title']]
match_df.drop_duplicates(inplace = True)
match_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  match_df.drop_duplicates(inplace = True)


Unnamed: 0,movieId,title
0,306,Three Colors: Red (Trois couleurs: Rouge) (1994)
7058,307,Three Colors: Blue (Trois couleurs: Bleu) (1993)
13674,899,Singin' in the Rain (1952)
24569,1088,Dirty Dancing (1987)
36504,1175,Delicatessen (1991)
...,...,...
20807545,200192,Den frusna leoparden (1986)
20807546,200194,Tough Luck (2004)
20807547,139970,I Don't Speak English (1995)
20807548,200726,The Graduates (1995)


In [10]:
pivot_input = output_ratings[['userId', 'movieId', 'rating']]
pivot_input = pivot_input[pivot_input.userId <= 8500]  # processing error with file size

In [11]:
pivot_input.tail()

Unnamed: 0,userId,movieId,rating
20639671,8477,176035,4.0
20639674,8477,191165,1.0
20639676,8477,191679,4.5
20639685,8477,194578,5.0
20639688,8482,163034,4.0


In [12]:
pivot = pivot_input.pivot_table(index = ['userId'], columns = ['movieId'], values = 'rating')

In [13]:
pivot.head()

movieId,2,4,5,6,7,8,9,10,11,12,...,208112,208507,208513,208515,208715,208737,208787,208793,208939,209163
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [14]:
norm_pivot = pivot.apply(lambda x : (x - np.min(x)) / (np.max(x) - np.min(x)), axis = 1)

In [15]:
norm_pivot.head()

movieId,2,4,5,6,7,8,9,10,11,12,...,208112,208507,208513,208515,208715,208737,208787,208793,208939,209163
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [16]:
norm_pivot.fillna(0, inplace = True)
norm_pivot = norm_pivot.T
norm_pivot = norm_pivot.loc[:, (norm_pivot != 0).any(axis = 0)]

In [17]:
norm_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,8491,8492,8493,8494,8495,8496,8497,8498,8499,8500
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.888889,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
sparse_pivot = csr_matrix(norm_pivot.values)

In [19]:
item_similarity = cosine_similarity(sparse_pivot)
user_similarity = cosine_similarity(sparse_pivot.T)

In [20]:
is_df = pd.DataFrame(item_similarity, index = norm_pivot.index, columns = norm_pivot.index)
us_df = pd.DataFrame(user_similarity, index = norm_pivot.columns, columns = norm_pivot.columns)

In [21]:
is_df.head()
us_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,8491,8492,8493,8494,8495,8496,8497,8498,8499,8500
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.040693,0.058,0.039635,0.0,0.0,0.105518,0.0,0.031042,0.0,...,0.078888,0.0,0.0,0.036695,0.055848,0.055312,0.0,0.0,0.110479,0.028847
2,0.040693,1.0,0.161724,0.171517,0.08639,0.070749,0.045163,0.11256,0.077503,0.031694,...,0.096239,0.085351,0.063082,0.102089,0.165176,0.167843,0.18961,0.019215,0.243892,0.090544
3,0.058,0.161724,1.0,0.344305,0.022045,0.10832,0.020829,0.0527,0.044316,0.089073,...,0.248533,0.017495,0.0,0.153022,0.287029,0.217997,0.106188,0.0,0.204895,0.136192
4,0.039635,0.171517,0.344305,1.0,0.027393,0.049469,0.0,0.071802,0.040792,0.045846,...,0.207916,0.0,0.0,0.125498,0.251155,0.199524,0.069658,0.0,0.180906,0.104327
5,0.0,0.08639,0.022045,0.027393,1.0,0.082188,0.0254,0.205259,0.144108,0.104719,...,0.050742,0.032001,0.119737,0.026499,0.085142,0.053993,0.066014,0.085102,0.054486,0.0


In [22]:
def find_id(title):
    id = int(match_df[match_df['title'] == title].movieId)
    return id

def find_title(id):
    title = match_df[match_df['movieId'] == id].title.values[0]
    return title

def similar_5movies(title):
    movie = find_id(title)
    num = 1
    print(f'Similar 5 movies to \'{title}\' :\n')
    top_five = is_df[movie].sort_values(ascending = False)[1:6] #[0] = movie
    for item, score in top_five.items():
        title = find_title(item)
        print(f'No.{num} : \'{title}\' (Similarity score : {score})')
        num += 1
        
def similar_5users(user):
    if user not in norm_pivot.columns:
        return('No data of user {}'.format(user))
    print('Most 5 similar users : \n')
    top_five = us_df.sort_values(by = user, ascending = False).loc[:, user][1:6]
    for user, similarity in top_five.items():
        print(f'UserId : {user} => Similarity : {similarity}')

In [23]:
similar_5movies('Dirty Dancing (1987)')

Similar 5 movies to 'Dirty Dancing (1987)' :

No.1 : 'Grease (1978)' (Similarity score : 0.48330423685922)
No.2 : 'Top Gun (1986)' (Similarity score : 0.3895388027336958)
No.3 : 'Pretty Woman (1990)' (Similarity score : 0.3860216042998241)
No.4 : 'Sound of Music, The (1965)' (Similarity score : 0.3706440916650196)
No.5 : 'When Harry Met Sally... (1989)' (Similarity score : 0.3665522296845649)


In [24]:
similar_5users(1)

Most 5 similar users : 

UserId : 4505 => Similarity : 0.23621711672473655
UserId : 6183 => Similarity : 0.21903834230671998
UserId : 5087 => Similarity : 0.20951381980929978
UserId : 4787 => Similarity : 0.2093168377195409
UserId : 6720 => Similarity : 0.19996875732231195


In [53]:
def recom_5movie(user):
    similar_30users = us_df.sort_values(by = user, ascending = False).index[1:31]
    movie_list = []
    recom = {}
    num = 0
    
    for i in similar_30users:
        movies = norm_pivot.loc[:, i][(norm_pivot.loc[:, user] == 0)].sort_values(ascending = False).index[:5]
        movie_list.append(movies.tolist())
    
    for i in range(len(movie_list)):
        for j in movie_list[i]:
            if j in recom:
                num += 1
            else:
                num = 1
            recom[j] = num
    five_movies = sorted(recom.items(), key = lambda x:x[1], reverse = True)[:5]
    for i in five_movies:
        print('The movie \'{}\' (No. of recommenders : {})'.format(find_title(int(i[0])), i[1]))

In [54]:
recom_5movie(5197)

The movie 'Fight Club (1999)' (No. of recommenders : 6)
The movie 'Lord of the Rings: The Fellowship of the Ring, The (2001)' (No. of recommenders : 5)
The movie 'American Beauty (1999)' (No. of recommenders : 5)
The movie 'Schindler's List (1993)' (No. of recommenders : 5)
The movie 'Lord of the Rings: The Two Towers, The (2002)' (No. of recommenders : 4)
