In [2]:
# Import libraries
import pandas as pd
import numpy as np

In [3]:
# Import datasets
movies = pd.read_csv("movielens/movies.csv")
ratings = pd.read_csv("movielens/ratings.csv")
tags = pd.read_csv("movielens/tags.csv")

# Exploring the data

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


# Normalization of the data

In [7]:
Mean = ratings.groupby(
    by = "userId",
    as_index = "False",
)['rating'].mean()

Mean.head()

userId
1    3.742857
2    4.000000
3    4.122995
4    3.571429
5    4.272727
Name: rating, dtype: float64

In [8]:
rating_avg = pd.merge(
    ratings,
    Mean,
    on = "userId"
)

rating_avg.head()

Unnamed: 0,userId,movieId,rating_x,timestamp,rating_y
0,1,2,3.5,1112486027,3.742857
1,1,29,3.5,1112484676,3.742857
2,1,32,3.5,1112484819,3.742857
3,1,47,3.5,1112484727,3.742857
4,1,50,3.5,1112484580,3.742857


In [9]:
rating_avg['adg_rating'] = (rating_avg['rating_x'] - rating_avg['rating_y'])
rating_avg.head()            

Unnamed: 0,userId,movieId,rating_x,timestamp,rating_y,adg_rating
0,1,2,3.5,1112486027,3.742857,-0.242857
1,1,29,3.5,1112484676,3.742857,-0.242857
2,1,32,3.5,1112484819,3.742857,-0.242857
3,1,47,3.5,1112484727,3.742857,-0.242857
4,1,50,3.5,1112484580,3.742857,-0.242857


# Cleaning the data

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

final = pd.pivot_table(
    rating_avg,
    values = 'adg_rating',
    index = 'userId',
    columns = 'movieId'
)

final.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,-0.242857,,,,,,,,,...,,,,,,,,,,
2,,,0.0,,,,,,,,...,,,,,,,,,,
3,-0.122995,,,,,,,,,,...,,,,,,,,,,
4,,,,,,-0.571429,,,,0.428571,...,,,,,,,,,,
5,,-1.272727,,,,,,,,,...,,,,,,,,,,


There are a lots of NaN values because every user has not seen all the movies. This matrix is call `sparse matrix`.
Replacing this NaN values: 

In [11]:
# By movie average
final_movie = final.fillna(final.mean(axis = 0))

# Replacing NaN by user Average
final_user = final.apply(lambda row: row.fillna(row.mean()), axis = 1)

final_movie.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.354987,-0.242857,-0.356159,-0.504011,-0.374364,0.288623,-0.145776,-0.097414,-0.440951,-0.118551,...,0.040724,-0.162266,-0.250513,-1.986486,-1.597955,-0.375697,0.557331,0.124595,-1.135011,0.402045
2,0.354987,-0.269639,0.0,-0.504011,-0.374364,0.288623,-0.145776,-0.097414,-0.440951,-0.118551,...,0.040724,-0.162266,-0.250513,-1.986486,-1.597955,-0.375697,0.557331,0.124595,-1.135011,0.402045
3,-0.122995,-0.269639,-0.356159,-0.504011,-0.374364,0.288623,-0.145776,-0.097414,-0.440951,-0.118551,...,0.040724,-0.162266,-0.250513,-1.986486,-1.597955,-0.375697,0.557331,0.124595,-1.135011,0.402045
4,0.354987,-0.269639,-0.356159,-0.504011,-0.374364,-0.571429,-0.145776,-0.097414,-0.440951,0.428571,...,0.040724,-0.162266,-0.250513,-1.986486,-1.597955,-0.375697,0.557331,0.124595,-1.135011,0.402045
5,0.354987,-1.272727,-0.356159,-0.504011,-0.374364,0.288623,-0.145776,-0.097414,-0.440951,-0.118551,...,0.040724,-0.162266,-0.250513,-1.986486,-1.597955,-0.375697,0.557331,0.124595,-1.135011,0.402045


In [12]:
# calculating the similarity between the users

cosine = cosine_similarity(final_movie)

np.fill_diagonal(cosine, 0)

In [13]:
similarity_with_movie = pd.DataFrame(
    cosine,
    index = final_movie.index
)

similarity_with_movie.columns = final_user.index

similarity_with_movie.head()

userId,1,2,3,4,5,6,7,8,9,10,...,7111,7112,7113,7114,7115,7116,7117,7118,7119,7120
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.993645,0.989068,0.995953,0.993566,0.995469,0.985177,0.993894,0.994673,0.996406,...,0.992366,0.996391,0.990803,0.996072,0.994978,0.989353,0.987931,0.995011,0.995373,0.996025
2,0.993645,0.0,0.988834,0.995416,0.99324,0.995189,0.985937,0.994043,0.994434,0.995516,...,0.991986,0.996015,0.990378,0.995575,0.994238,0.989081,0.986762,0.995841,0.995259,0.995537
3,0.989068,0.988834,0.0,0.990749,0.987702,0.990023,0.980075,0.988536,0.989736,0.990406,...,0.987065,0.991159,0.984923,0.990694,0.988997,0.982924,0.982128,0.98976,0.99034,0.990764
4,0.995953,0.995416,0.990749,0.0,0.99519,0.996885,0.986879,0.995924,0.99636,0.997529,...,0.993738,0.997971,0.993146,0.997498,0.996241,0.99038,0.989021,0.996667,0.996561,0.997543
5,0.993566,0.99324,0.987702,0.99519,0.0,0.995004,0.985308,0.993087,0.994096,0.995203,...,0.991692,0.995676,0.990518,0.995262,0.995181,0.987669,0.986575,0.994067,0.994213,0.995319


In [14]:
# user similarity on replacing NAN by user avg
b = cosine_similarity(final_user)
np.fill_diagonal(b, 0 )
similarity_with_user = pd.DataFrame(b,index=final_user.index)
similarity_with_user.columns=final_user.index
similarity_with_user.head()

userId,1,2,3,4,5,6,7,8,9,10,...,7111,7112,7113,7114,7115,7116,7117,7118,7119,7120
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.025079,0.052611,0.031839,0.057268,-0.004678,0.03186,-0.007859,0.0008064088,0.071578,...,0.013262,0.01755313,-0.008575,0.06611135,-0.000905,0.101994,0.044777,-0.04135166,-0.013052,0.01836
2,0.025079,0.0,0.064919,0.025317,0.052187,0.035336,0.088485,0.039048,1.665899e-32,0.013856,...,0.011279,-2.8178280000000003e-33,0.04682,0.01170373,0.0014,0.057578,0.006336,0.1377168,0.034293,0.006262
3,0.052611,0.064919,0.0,0.0153,-0.008025,-0.024011,0.01259,0.012737,0.02177168,0.026654,...,0.017999,0.01751704,-0.01558,0.05513281,-0.012029,0.008905,0.03682,0.01587831,0.045976,0.011999
4,0.031839,0.025317,0.0153,0.0,0.044112,-0.050693,0.02121,0.158575,0.01485647,-0.025345,...,-0.016181,0.01071634,0.201465,-1.4566660000000002e-29,0.01406,0.018712,-0.003757,2.203416e-29,-0.127582,0.032342
5,0.057268,0.052187,-0.008025,0.044112,0.0,0.050837,0.080559,-0.004609,6.018011e-30,0.015948,...,0.028127,-1.1088790000000001e-29,0.047999,0.01796047,0.152094,0.005639,0.042318,-0.07484938,-0.042648,0.01892


In [15]:
def find_n_neighbours(df,n):
    order = np.argsort(df.values, axis=1)[:, :n]
    df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False)
           .iloc[:n].index, 
          index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
    return df

In [16]:
sim_user_30_u = find_n_neighbours(similarity_with_user,30)
sim_user_30_u.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3083,1385,1596,179,211,1273,194,83,6310,5554,...,6541,734,2452,1962,4079,5854,5616,1687,2207,4336
2,5295,6062,213,5974,1004,3056,4612,1941,6944,5126,...,3554,6112,3880,3702,1174,4297,265,6994,1576,5493
3,3786,2994,554,1120,1811,5433,4074,3746,3847,1969,...,5781,1232,5092,2289,3484,2099,1896,6478,5041,2207
4,4609,3120,4115,126,3843,2703,2145,257,6183,6918,...,6154,6165,1214,1368,3566,3271,6177,1043,1995,3886
5,1607,1438,6521,4876,5271,4278,117,4801,6221,3476,...,1891,3214,1064,5012,3753,2499,2493,4727,1699,4580


In [22]:
def get_user_similar_movies( user1, user2 ):
    common_movies = rating_avg[rating_avg.userId == user1].merge(
    rating_avg[rating_avg.userId == user2],
    on = "movieId",
    how = "inner" )
    return common_movies.merge( movies, on = 'movieId' )

In [24]:
a = get_user_similar_movies(370,371)
a = a.loc[ : , ['rating_x_x','rating_x_y','title']]
a.head()

Unnamed: 0,rating_x_x,rating_x_y,title
0,5.0,4.5,Three Colors: Blue (Trois couleurs: Bleu) (1993)
1,4.5,4.0,Sleeper (1973)
2,3.5,2.5,Sphere (1998)


In [None]:
def user_item_score(user, item):
    a = sim_user_30_m[sim_user_30_m.index == user].values
    b = a.squeeze().tolist()
    c = final_movie.loc[:, item]
    d = c[c.index.isin(b)]
    f = d[d.notnull()]
    avg_user = mean.loc[mean['userId'] == user, 'rating'].values[0]
    index = f.index.values.squeeze().tolist()
    corr = similarity_with_movie.loc[user,index]
    fin = pd.concat([f, corr], axis=1)
    fin.columns = []