In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import pearsonr
import pickle
from random import shuffle

In [2]:
# Load the tag data from 'tags.csv' and preview its first rows.
tags_df = pd.read_csv('tags.csv')
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [3]:
# Read the movie catalogue keyed by movieId so a title/genre lookup is a
# plain `.loc[movie_id]`.
movies_df = pd.read_csv('movies.csv', index_col='movieId')
# Largest movie id present in the catalogue (kept as a plain Python int).
max_movie_id = int(movies_df.index.max())
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
# Load the individual user ratings from 'ratings.csv' and preview the first rows.
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Movie-by-user rating matrix: one row per movieId, one column per userId,
# NaN where the user has not rated the movie. If a (movie, user) pair occurs
# more than once the highest rating is kept.
# The aggregation is given by name ('max') rather than as the NumPy callable:
# recent pandas versions emit a FutureWarning when handed `np.max` here.
movie_user_table = pd.pivot_table(
    ratings_df,
    index='movieId',
    columns='userId',
    values='rating',
    aggfunc='max',
)
movie_user_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [6]:
# Transposed view of the rating matrix: one row per user, one column per movie.
movie_user_table.T

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [7]:
def find_common(table, user_1, user_2):
    """Return the row positions at which both users have a value.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with one column per user; NaN marks a missing value.
    user_1, user_2 : hashable
        Column labels of the two users to compare.

    Returns
    -------
    numpy.ndarray
        Zero-based integer positions (suitable for ``.iloc``, not index
        labels) of the rows where neither column is NaN.
    """
    both_present = table[user_1].notna() & table[user_2].notna()
    return np.nonzero(both_present.to_numpy())[0]

In [8]:
# Number of distinct users = number of columns in the movie-by-user table.
no_of_users = movie_user_table.shape[1]
no_of_users

610

In [9]:
%%time
# Pairwise Pearson similarity between users, computed over the movies both
# users have rated. Pairs with no usable overlap keep the initial value 0.
# The loop addresses columns as i+1 / j+1, i.e. it assumes the user ids are
# exactly 1..no_of_users.
# `dtype=float` replaces the `np.float` alias, which was removed in NumPy 1.24.
res = np.zeros((no_of_users, no_of_users), dtype=float)
for i in range(no_of_users):
    # The diagonal is -1 so a user sorts last among their own neighbours.
    res[i, i] = -1
    # Pearson correlation is symmetric: compute the upper triangle only and
    # mirror each value into the lower triangle.
    for j in range(i + 1, no_of_users):
        common = find_common(movie_user_table, i+1, j+1)
        # pearsonr needs at least two paired observations; current SciPy
        # raises ValueError on shorter input.
        if len(common) < 2:
            continue
        sim = pearsonr(movie_user_table[i+1].iloc[common],
                       movie_user_table[j+1].iloc[common])[0]
        # The correlation is NaN when either user's common ratings are
        # constant; such pairs stay at 0.
        if not pd.isna(sim):
            res[i, j] = res[j, i] = sim
similarity_df = pd.DataFrame(res, columns=np.arange(1, no_of_users+1),
             index=np.arange(1, no_of_users+1))
# Cache the matrix on disk so the slow loop does not have to be re-run.
with open('pearson.dat', 'wb') as f:
    pickle.dump(similarity_df, f)

  r = r_num / r_den


Wall time: 3min 39s


In [10]:
# Reload the cached similarity matrix from 'pearson.dat'.
# NOTE: pickle.load can execute arbitrary code, so only load a file that this
# notebook itself produced.
with open('pearson.dat', 'rb') as f:
    similarity_df = pickle.load(f)

In [11]:
# Display the user-by-user similarity matrix loaded from 'pearson.dat'.
similarity_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,-1.000000,0.000000,0.079819,0.207983,0.268749,-2.916358e-01,-0.118773,0.469668,0.918559,-0.037987,...,0.091574,0.000000,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.268070,-0.175412,-0.032086
2,0.000000,-1.000000,0.000000,0.000000,0.000000,0.000000e+00,-0.991241,0.000000,0.000000,0.037796,...,-0.387347,0.000000,-1.000000,0.000000,0.000000,0.583333,0.000000,-0.125000,0.000000,0.623288
3,0.079819,0.000000,-1.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.433200,0.000000,0.000000,-0.791334,-0.333333,-0.395092,0.000000,0.569562
4,0.207983,0.000000,0.000000,-1.000000,-0.336525,1.484982e-01,0.542861,0.117851,0.000000,0.485794,...,-0.222113,0.396641,0.090090,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.277350,-0.043786
5,0.268749,0.000000,0.000000,-0.336525,-1.000000,4.316590e-02,0.158114,0.028347,0.000000,-0.777714,...,0.000000,0.153303,0.234743,0.067791,-0.364156,0.244321,0.231080,-0.020546,0.384111,0.040582
6,-0.291636,0.000000,0.000000,0.148498,0.043166,-1.000000e+00,-0.126595,-0.200062,0.000000,0.957427,...,-0.292770,-0.027495,-0.112141,-0.090297,0.100735,-0.049192,0.255639,0.125428,0.193649,0.115580
7,-0.118773,-0.991241,0.000000,0.542861,0.158114,-1.265950e-01,-1.000000,0.220416,0.925000,-0.156764,...,-0.280496,0.110432,0.343649,0.560968,0.416186,0.137771,0.402792,0.008081,0.420288,0.341233
8,0.469668,0.000000,0.000000,0.117851,0.028347,-2.000621e-01,0.220416,-1.000000,0.000000,-1.000000,...,1.000000,0.190902,0.249252,0.534375,-0.217405,0.253582,0.251280,0.434423,0.141860,0.167931
9,0.918559,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.925000,0.000000,-1.000000,0.000000,...,0.577350,0.000000,0.376715,0.000000,0.158777,0.572700,0.000000,0.336625,0.000000,0.615638
10,-0.037987,0.037796,0.000000,0.485794,-0.777714,9.574271e-01,-0.156764,-1.000000,0.000000,-1.000000,...,-0.397276,-0.944911,-0.424631,-1.000000,0.268684,-0.382955,-0.241121,-0.571043,0.000000,-0.205081


In [12]:
def get_recommendation(table, user, no_of_users=5, similarity=None):
    """Predict ratings for movies `user` has not rated, using similar users.

    Each of the `no_of_users` most similar users contributes their rating of
    a candidate movie weighted by their similarity to `user`; the prediction
    is the weighted sum divided by the sum of the weights.

    Parameters
    ----------
    table : pandas.DataFrame
        Ratings with one row per movie and one column per user; NaN marks a
        movie the user has not rated.
    user : hashable
        Column label of the user to recommend for.
    no_of_users : int, default 5
        Number of most-similar users to use as neighbours.
    similarity : pandas.DataFrame, optional
        User-by-user similarity matrix whose column `user` holds the
        similarity of every other user to `user`. When omitted, the
        notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series
        Predicted rating per candidate movie, indexed by the labels of
        ``table.index`` and sorted from highest to lowest. Movies whose
        prediction is NaN (e.g. no neighbour rated them, or the neighbour
        similarities sum to zero together with the weighted ratings) are
        dropped.
    """
    if similarity is None:
        similarity = similarity_df
    neighbours = similarity[user].sort_values(ascending=False).iloc[:no_of_users]

    # Accumulators are indexed by the table's own movie ids rather than by a
    # hard-coded id range, so any ratings table works.
    weighted_sum = pd.Series(0.0, index=table.index)
    weight_sum = pd.Series(0.0, index=table.index)

    unrated = table[user].isna()
    for neighbour, sim in neighbours.items():
        # Candidates: movies the target user has not rated but the neighbour has.
        candidates = unrated & table[neighbour].notna()
        # Direct .loc updates on each Series avoid chained-indexing
        # assignment, which does not write through under copy-on-write.
        weighted_sum.loc[candidates] += table.loc[candidates, neighbour] * sim
        weight_sum.loc[candidates] += sim

    # 0/0 for untouched movies yields NaN, which dropna() removes.
    predicted = weighted_sum / weight_sum
    return predicted.dropna().sort_values(ascending=False)

In [13]:
def show_recommendations(user_id, top_n=10, show_hist=True, 
                         show_rec=True, randomize=True):
    """Print a user's rating history and/or a list of recommended movies.

    Recommendations come from ``get_recommendation(movie_user_table, user_id)``.
    The best ``top_n * 10`` predictions form a candidate pool; when
    `randomize` is true the pool is shuffled before the first `top_n`
    entries are taken. The selection is printed by predicted rating, highest
    first.

    Parameters
    ----------
    user_id : int
        User whose history / recommendations are printed.
    top_n : int, default 10
        Number of recommended movies to print.
    show_hist : bool, default True
        Print every movie the user rated (from ``ratings_df``), best first.
    show_rec : bool, default True
        Print the recommended movies.
    randomize : bool, default True
        Shuffle the candidate pool before selecting `top_n` entries.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    separator = '-' * 50

    candidates = list(
        get_recommendation(movie_user_table, user_id)[:top_n*10].items())
    if randomize:
        shuffle(candidates)
    chosen = sorted(candidates[:top_n], key=lambda pair: pair[1], reverse=True)

    if show_hist:
        print(separator, end='')
        print("\nUser Movie History:\n")
        history = (ratings_df[ratings_df.userId == user_id]
                   .sort_values(by='rating', ascending=False))
        for movie_id, rating in zip(history['movieId'], history['rating']):
            movie = movies_df.loc[movie_id]
            print("Title: {}\nGenre: {}\nUser Rating: {}\n"
                  .format(movie.title, movie.genres, rating))

    if show_rec:
        print(separator, end='')
        print("\nRecommended Movies:\n")
        for movie_id, predicted in chosen:
            movie = movies_df.loc[movie_id]
            print("Title: {}\nGenre: {}\nPredicted Rating: {}\n"
                  .format(movie.title, movie.genres, predicted))


In [14]:
# Print the recommendations for user 600 only (no rating history), without
# shuffling the candidate pool.
show_recommendations(600, show_hist=False, randomize=False)

--------------------------------------------------
Recommended Movies:

Title: Chasing Liberty (2004)
Genre: Comedy|Romance
Predicted Rating: 5.0

Title: Inglourious Basterds (2009)
Genre: Action|Drama|War
Predicted Rating: 5.0

Title: Footloose (1984)
Genre: Drama
Predicted Rating: 5.0

Title: Snatch (2000)
Genre: Comedy|Crime|Thriller
Predicted Rating: 5.0

Title: In the Mood For Love (Fa yeung nin wa) (2000)
Genre: Drama|Romance
Predicted Rating: 5.0

Title: Blade Runner (1982)
Genre: Action|Sci-Fi|Thriller
Predicted Rating: 5.0

Title: Raise Your Voice (2004)
Genre: Romance
Predicted Rating: 5.0

Title: Memories of Murder (Salinui chueok) (2003)
Genre: Crime|Drama|Mystery|Thriller
Predicted Rating: 5.0

Title: Ice Princess (2005)
Genre: Children|Comedy|Drama
Predicted Rating: 5.0

Title: Goal! The Dream Begins (Goal!) (2005)
Genre: Drama
Predicted Rating: 5.0



In [140]:
def get_recommendation2(user_based, person, similarity=None, no_of_users=5):
    """Rank the items `person` has not rated using the most similar users.

    Dictionary-based variant of the weighted-average recommender: for every
    item rated by one of the `no_of_users` most similar users but not by
    `person`, the score is sum(rating * similarity) / sum(similarity).

    Parameters
    ----------
    user_based : pandas.DataFrame
        Ratings with one row per item and one column per user; NaN marks
        "not rated".
    person : hashable
        Column label of the user to recommend for.
    similarity : pandas.DataFrame, optional
        User-by-user similarity matrix. When omitted, the notebook-level
        ``similarity_df`` is used, so the function no longer depends on an
        ad-hoc global.
    no_of_users : int, default 5
        Number of most-similar users to use as neighbours.

    Returns
    -------
    list of (score, item) tuples
        Sorted from highest to lowest score (ties broken by item id,
        descending). Items whose neighbour similarities sum to zero are
        skipped because their score is undefined.
    """
    if similarity is None:
        similarity = similarity_df
    sim_to_user = similarity[person].sort_values(ascending=False).iloc[:no_of_users]

    # Computed once up front: the set of items the target user already rated.
    already_rated = set(user_based[person].dropna().index)

    totals = {}
    sim_sums = {}
    for other, sim in sim_to_user.items():
        for item, rating in user_based[other].dropna().items():
            if item in already_rated:
                continue
            totals[item] = totals.get(item, 0) + rating * sim
            sim_sums[item] = sim_sums.get(item, 0) + sim

    rankings = [(total / sim_sums[item], item)
                for item, total in totals.items()
                if sim_sums[item] != 0]
    rankings.sort(reverse=True)
    return rankings

In [151]:
# NOTE(review): `user_based` is not defined in any cell shown in this notebook;
# presumably it is the same movie-by-user table as `movie_user_table` — confirm
# before re-running on a fresh kernel.
get_recommendation2(user_based, 2)

[(5.0, 98491),
 (5.0, 45499),
 (5.0, 40629),
 (5.0, 30816),
 (5.0, 27801),
 (5.0, 8636),
 (5.0, 6711),
 (5.0, 6618),
 (5.0, 6539),
 (5.0, 6333),
 (5.0, 5349),
 (5.0, 4446),
 (5.0, 3996),
 (5.0, 3793),
 (5.0, 2943),
 (5.0, 2916),
 (5.0, 2872),
 (5.0, 2580),
 (5.0, 2431),
 (5.0, 2357),
 (5.0, 2273),
 (5.0, 2193),
 (5.0, 2161),
 (5.0, 2105),
 (5.0, 2028),
 (5.0, 1923),
 (5.0, 1917),
 (5.0, 1719),
 (5.0, 1678),
 (5.0, 1673),
 (5.0, 1653),
 (5.0, 1639),
 (5.0, 1617),
 (5.0, 1545),
 (5.0, 1446),
 (5.0, 1372),
 (5.0, 1356),
 (5.0, 1280),
 (5.0, 1265),
 (5.0, 1249),
 (5.0, 1245),
 (5.0, 1210),
 (5.0, 1097),
 (5.0, 1060),
 (5.0, 994),
 (5.0, 919),
 (5.0, 914),
 (5.0, 858),
 (5.0, 838),
 (5.0, 613),
 (5.0, 593),
 (5.0, 590),
 (5.0, 562),
 (5.0, 552),
 (5.0, 527),
 (5.0, 509),
 (5.0, 480),
 (5.0, 457),
 (5.0, 446),
 (5.0, 434),
 (5.0, 428),
 (5.0, 380),
 (5.0, 367),
 (5.0, 337),
 (5.0, 329),
 (5.0, 316),
 (5.0, 307),
 (5.0, 296),
 (5.0, 265),
 (5.0, 260),
 (5.0, 39),
 (5.0, 36),
 (5.0, 28),
 (5.0

In [8]:
def sim_distance(prefs,person1,person2):
    """Distance-based similarity of two users over their commonly rated rows.

    The score is ``1 / (1 + sum of squared rating differences)``, taken over
    the rows where both `person1` and `person2` have a value in `prefs`.
    Returns 0 when the two users share no rated row.
    """
    shared = find_common(prefs, person1, person2)
    if not len(shared):
        return 0

    gap = prefs[person1].iloc[shared] - prefs[person2].iloc[shared]
    squared_total = sum(np.power(gap, 2))
    return 1 / (1 + squared_total)

In [9]:
def pearson_for_movies(prefs, person1, person2):
    """Pearson correlation of two users over the rows both have rated.

    Parameters
    ----------
    prefs : pandas.DataFrame
        Ratings with one column per user; NaN marks "not rated".
    person1, person2 : hashable
        Column labels of the two users to compare.

    Returns
    -------
    float
        The correlation coefficient, or 0 when it is undefined: the users
        share no rated row, or at least one of them gave the same rating to
        every shared row (zero variance).
    """
    common = find_common(prefs, person1, person2)
    n = len(common)
    if n == 0:
        return 0

    x1 = prefs[person1].iloc[common]
    x2 = prefs[person2].iloc[common]

    # Vectorised sums instead of Python-level sum() over the Series.
    sum1, sum2 = x1.sum(), x2.sum()
    sum1_sq, sum2_sq = (x1 ** 2).sum(), (x2 ** 2).sum()
    p_sum = (x1 * x2).sum()

    num = p_sum - (sum1 * sum2 / n)
    var_product = (sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n)

    # Mathematically var_product >= 0, but rounding can leave it a hair below
    # zero, which would turn the square root (and the score) into NaN.
    # `not (> 0)` covers zero, negative and NaN in one test.
    if not var_product > 0:
        return 0

    return num / np.sqrt(var_product)


In [89]:
%%time
# Same user-by-user similarity matrix as the scipy-based cell, but computed
# with the hand-written `pearson_for_movies`.
# NOTE(review): `user_based` is not defined in any cell shown in this notebook;
# presumably it is the movie-by-user rating table — confirm before re-running.
# `dtype=float` replaces the `np.float` alias, which was removed in NumPy 1.24.
res = np.zeros((no_of_users, no_of_users), dtype=float)
for i in range(no_of_users):
    # The diagonal is -1 so a user sorts last among their own neighbours.
    res[i, i] = -1
    # The score is symmetric: compute the upper triangle and mirror it.
    for j in range(i + 1, no_of_users):
        res[i, j] = res[j, i] = pearson_for_movies(user_based, i+1, j+1)
df1 = pd.DataFrame(res, columns=np.arange(1, no_of_users+1),
             index=np.arange(1, no_of_users+1))
# Cache the matrix on disk so the slow loop does not have to be re-run.
with open('custom_pearson.dat', 'wb') as f:
    pickle.dump(df1, f)

Wall time: 4min 27s


In [98]:
# Reload the cached similarity matrix from 'custom_pearson.dat'.
# NOTE: pickle.load can execute arbitrary code, so only load a file that this
# notebook itself produced.
with open('custom_pearson.dat', 'rb') as f:
    df1 = pickle.load(f)

In [99]:
# Display the similarity matrix loaded from 'custom_pearson.dat'.
df1

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,-1.000000,0.000000,0.079819,0.207983,0.268749,-0.291636,-0.118773,0.469668,0.918559,-0.037987,...,0.091574,0.000000,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.268070,-0.175412,-0.032086
2,0.000000,-1.000000,0.000000,0.000000,0.000000,0.000000,-0.991241,0.000000,0.000000,0.037796,...,-0.387347,0.000000,-1.000000,0.000000,0.000000,0.583333,0.000000,-0.125000,0.000000,0.623288
3,0.079819,0.000000,-1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.433200,0.000000,0.000000,-0.791334,-0.333333,-0.395092,0.000000,0.569562
4,0.207983,0.000000,0.000000,-1.000000,-0.336525,0.148498,0.542861,0.117851,0.000000,0.485794,...,-0.222113,0.396641,0.090090,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.277350,-0.043786
5,0.268749,0.000000,0.000000,-0.336525,-1.000000,0.043166,0.158114,0.028347,0.000000,-0.777714,...,0.000000,0.153303,0.234743,0.067791,-0.364156,0.244321,0.231080,-0.020546,0.384111,0.040582
6,-0.291636,0.000000,0.000000,0.148498,0.043166,-1.000000,-0.126595,-0.200062,0.000000,0.957427,...,-0.292770,-0.027495,-0.112141,-0.090297,0.100735,-0.049192,0.255639,0.125428,0.193649,0.115580
7,-0.118773,-0.991241,0.000000,0.542861,0.158114,-0.126595,-1.000000,0.220416,0.925000,-0.156764,...,-0.280496,0.110432,0.343649,0.560968,0.416186,0.137771,0.402792,0.008081,0.420288,0.341233
8,0.469668,0.000000,0.000000,0.117851,0.028347,-0.200062,0.220416,-1.000000,0.000000,-1.000000,...,1.000000,0.190902,0.249252,0.534375,-0.217405,0.253582,0.251280,0.434423,0.141860,0.167931
9,0.918559,0.000000,0.000000,0.000000,0.000000,0.000000,0.925000,0.000000,-1.000000,0.000000,...,0.577350,0.000000,0.376715,0.000000,0.158777,0.572700,0.000000,0.336625,0.000000,0.615638
10,-0.037987,0.037796,0.000000,0.485794,-0.777714,0.957427,-0.156764,-1.000000,0.000000,-1.000000,...,-0.397276,-0.944911,-0.424631,-1.000000,0.268684,-0.382955,-0.241121,-0.571043,0.000000,-0.205081
