In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import pickle
from random import shuffle

In [2]:
# Load the movie catalogue (movieId, title, genres).
movies_df = pd.read_csv('movies.csv')
# Capture the largest movie id while 'movieId' is still a regular column.
max_movie_id = movies_df['movieId'].max()
# Key the catalogue by movieId so titles/genres can be looked up with .loc[movie_id].
# A new frame is assigned instead of mutating in place, so the step reads as plain lineage.
movies_df = movies_df.set_index('movieId')
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [3]:
# Catalogue dimensions as (rows, columns).
movies_df.shape

(9742, 2)

In [4]:
# Load the raw ratings: one row per (userId, movieId) pair with its rating and timestamp.
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Number of ratings and columns as (rows, columns).
ratings_df.shape

(100836, 4)

In [6]:
# Build the movie x user rating matrix: rows are movieIds, columns are userIds, and a
# cell is NaN when that user has not rated that movie.
# 'max' keeps the highest rating should a (movie, user) pair appear more than once; the
# string form is used because passing the np.max callable is deprecated in recent pandas.
movie_user_table = pd.pivot_table(ratings_df, index='movieId', columns='userId',
                                  values='rating', aggfunc='max')
movie_user_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [7]:
# Transposed view: one row per userId, one column per movieId.
movie_user_table.T

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [8]:
def get_table_size(columns, cell_size=8):
    """Print the estimated size of a square ``len(columns)`` x ``len(columns)`` table.

    Parameters
    ----------
    columns : sized collection
        Labels of the table; only its length is used.
    cell_size : int, default 8
        Bytes per cell (8 matches a float64 value).

    The size is printed in the largest unit that keeps the number at or above 1,
    capped at the last unit of the scale so very large tables never index past it.
    """
    byte_scale = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
    size = len(columns) * len(columns) * cell_size
    unit = 0
    # Stop at the last unit: stepping past it would make byte_scale[unit] raise IndexError.
    while unit < len(byte_scale) - 1 and size >= 1024:
        size /= 1024
        unit += 1
    print("Estimated Table Size: {:0.02f} {}".format(size, byte_scale[unit]))

In [9]:
def find_common(table, item1, item2):
    """Return the positional row indices where both columns hold a value.

    ``item1`` and ``item2`` are column labels of ``table``. The result is a
    NumPy integer array of row positions (suitable for ``.iloc``) at which
    neither column is NaN.
    """
    both_present = table[item1].notna() & table[item2].notna()
    return np.flatnonzero(both_present.to_numpy())

In [10]:
def find_column_similarity(table, file_name='pearson.dat'):
    """Compute the pairwise Pearson similarity between the columns of ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Numeric table whose columns are the entities to compare; NaN marks a
        missing observation.
    file_name : str, default 'pearson.dat'
        Path the resulting DataFrame is pickled to.

    Returns
    -------
    pandas.DataFrame
        Symmetric matrix indexed and labelled by ``table.columns``. The diagonal
        is -1 so a column never ranks as its own most similar neighbour. A pair
        gets 0.0 when it shares fewer than two rows or when the correlation is
        undefined (NaN, e.g. constant input).
    """
    labels = table.columns
    size = len(labels)
    # Extract values and the "has a value" mask once instead of once per pair.
    values = table.to_numpy(dtype=float)
    present = ~np.isnan(values)

    # Fill a plain array by position: chained DataFrame assignment (df[col][row] = x)
    # is not guaranteed to write through, and positions make the mirroring below
    # independent of how the column labels are ordered.
    similarity = np.zeros((size, size), dtype=float)
    np.fill_diagonal(similarity, -1.0)
    for i in range(size):
        for j in range(i + 1, size):
            common = present[:, i] & present[:, j]
            # pearsonr needs at least two paired observations.
            if np.count_nonzero(common) < 2:
                continue
            sim = pearsonr(values[common, i], values[common, j])[0]
            if not pd.isna(sim):
                similarity[i, j] = similarity[j, i] = sim
        print("{}/{}\r".format(i + 1, size), end='')

    similarity_df = pd.DataFrame(similarity, index=labels, columns=labels)
    with open(file_name, 'wb') as f:
        pickle.dump(similarity_df, f)
    return similarity_df

## User-Based Recommendation

In [11]:
# The user-user similarity matrix has one cell per pair of columns (userIds) of movie_user_table.
get_table_size(movie_user_table.columns)

Estimated Table Size: 2.84 MB


In [12]:
%%time
# Compute the user-user Pearson matrix (the columns of movie_user_table are userIds)
# and pickle it to 'user_similarity.dat'; the returned frame is shown as the cell output.
find_column_similarity(movie_user_table, 'user_similarity.dat')

  r = r_num / r_den
  x = np.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0


Wall time: 2min 58s


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.000000,0.000000,0.079819,0.207983,0.268749,-2.916358e-01,-0.118773,0.469668,0.918559,-0.037987,...,0.091574,0.000000,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.268070,-0.175412,-0.032086
2,0.000000,-1.000000,0.000000,0.000000,0.000000,0.000000e+00,-0.991241,0.000000,0.000000,0.037796,...,-0.387347,0.000000,-1.000000,0.000000,0.000000,0.583333,0.000000,-0.125000,0.000000,0.623288
3,0.079819,0.000000,-1.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.433200,0.000000,0.000000,-0.791334,-0.333333,-0.395092,0.000000,0.569562
4,0.207983,0.000000,0.000000,-1.000000,-0.336525,1.484982e-01,0.542861,0.117851,0.000000,0.485794,...,-0.222113,0.396641,0.090090,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.277350,-0.043786
5,0.268749,0.000000,0.000000,-0.336525,-1.000000,4.316590e-02,0.158114,0.028347,0.000000,-0.777714,...,0.000000,0.153303,0.234743,0.067791,-0.364156,0.244321,0.231080,-0.020546,0.384111,0.040582
6,-0.291636,0.000000,0.000000,0.148498,0.043166,-1.000000e+00,-0.126595,-0.200062,0.000000,0.957427,...,-0.292770,-0.027495,-0.112141,-0.090297,0.100735,-0.049192,0.255639,0.125428,0.193649,0.115580
7,-0.118773,-0.991241,0.000000,0.542861,0.158114,-1.265950e-01,-1.000000,0.220416,0.925000,-0.156764,...,-0.280496,0.110432,0.343649,0.560968,0.416186,0.137771,0.402792,0.008081,0.420288,0.341233
8,0.469668,0.000000,0.000000,0.117851,0.028347,-2.000621e-01,0.220416,-1.000000,0.000000,-1.000000,...,1.000000,0.190902,0.249252,0.534375,-0.217405,0.253582,0.251280,0.434423,0.141860,0.167931
9,0.918559,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.925000,0.000000,-1.000000,0.000000,...,0.577350,0.000000,0.376715,0.000000,0.158777,0.572700,0.000000,0.336625,0.000000,0.615638
10,-0.037987,0.037796,0.000000,0.485794,-0.777714,9.574271e-01,-0.156764,-1.000000,0.000000,-1.000000,...,-0.397276,-0.944911,-0.424631,-1.000000,0.268684,-0.382955,-0.241121,-0.571043,0.000000,-0.205081


In [13]:
# Reload the user-user similarity matrix written by find_column_similarity.
# NOTE: pickle.load can execute arbitrary code from the file; only load files
# produced by this notebook.
with open('user_similarity.dat', 'rb') as f:
    user_similarity_df = pickle.load(f)

In [14]:
# Inspect the user-user similarity matrix loaded from disk.
user_similarity_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.000000,0.000000,0.079819,0.207983,0.268749,-2.916358e-01,-0.118773,0.469668,0.918559,-0.037987,...,0.091574,0.000000,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.268070,-0.175412,-0.032086
2,0.000000,-1.000000,0.000000,0.000000,0.000000,0.000000e+00,-0.991241,0.000000,0.000000,0.037796,...,-0.387347,0.000000,-1.000000,0.000000,0.000000,0.583333,0.000000,-0.125000,0.000000,0.623288
3,0.079819,0.000000,-1.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.433200,0.000000,0.000000,-0.791334,-0.333333,-0.395092,0.000000,0.569562
4,0.207983,0.000000,0.000000,-1.000000,-0.336525,1.484982e-01,0.542861,0.117851,0.000000,0.485794,...,-0.222113,0.396641,0.090090,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.277350,-0.043786
5,0.268749,0.000000,0.000000,-0.336525,-1.000000,4.316590e-02,0.158114,0.028347,0.000000,-0.777714,...,0.000000,0.153303,0.234743,0.067791,-0.364156,0.244321,0.231080,-0.020546,0.384111,0.040582
6,-0.291636,0.000000,0.000000,0.148498,0.043166,-1.000000e+00,-0.126595,-0.200062,0.000000,0.957427,...,-0.292770,-0.027495,-0.112141,-0.090297,0.100735,-0.049192,0.255639,0.125428,0.193649,0.115580
7,-0.118773,-0.991241,0.000000,0.542861,0.158114,-1.265950e-01,-1.000000,0.220416,0.925000,-0.156764,...,-0.280496,0.110432,0.343649,0.560968,0.416186,0.137771,0.402792,0.008081,0.420288,0.341233
8,0.469668,0.000000,0.000000,0.117851,0.028347,-2.000621e-01,0.220416,-1.000000,0.000000,-1.000000,...,1.000000,0.190902,0.249252,0.534375,-0.217405,0.253582,0.251280,0.434423,0.141860,0.167931
9,0.918559,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.925000,0.000000,-1.000000,0.000000,...,0.577350,0.000000,0.376715,0.000000,0.158777,0.572700,0.000000,0.336625,0.000000,0.615638
10,-0.037987,0.037796,0.000000,0.485794,-0.777714,9.574271e-01,-0.156764,-1.000000,0.000000,-1.000000,...,-0.397276,-0.944911,-0.424631,-1.000000,0.268684,-0.382955,-0.241121,-0.571043,0.000000,-0.205081


In [15]:
def get_recommendation(table, similarity_table, user, no_of_users=5):
    """Predict ratings for the items ``user`` has not rated yet.

    Parameters
    ----------
    table : pandas.DataFrame
        Rating matrix with items as rows and users as columns; NaN means "not rated".
    similarity_table : pandas.DataFrame
        User-user similarity matrix; ``similarity_table[user]`` holds the
        similarity of every other user to ``user``.
    user : label
        Column label of the user to recommend for.
    no_of_users : int, default 5
        Number of most similar users whose ratings are combined.

    Returns
    -------
    pandas.Series
        Predicted rating per item (indexed by the labels of ``table.index``),
        sorted from highest to lowest. Each prediction is the similarity-weighted
        average of the neighbours' ratings; items no neighbour contributed a
        non-zero weight to are dropped.
    """
    neighbours = similarity_table[user].sort_values(ascending=False).iloc[:no_of_users]
    unseen = table[user].isna()

    # Accumulators are aligned with the rating matrix itself, so the function no
    # longer depends on a global maximum id or a hard-coded index range.
    weighted_ratings = pd.Series(0.0, index=table.index)
    weight_totals = pd.Series(0.0, index=table.index)
    for sim_user, sim in neighbours.items():
        # Only items the target user has not rated but this neighbour has.
        mask = unseen & table[sim_user].notna()
        weighted_ratings = weighted_ratings + (table[sim_user] * sim).where(mask, 0.0)
        weight_totals = weight_totals + mask.astype(float) * sim

    # 0/0 yields NaN for untouched items, which dropna() removes.
    # NOTE: neighbours with negative similarity are not filtered out.
    return (weighted_ratings / weight_totals).dropna().sort_values(ascending=False)

In [16]:
def show_recommendations(user_id, table=movie_user_table, 
                         similarity_table=user_similarity_df,
                         top_n=10, show_hist=False, 
                         show_rec=False, randomize=True):
    """Print a user's rating history and/or recommended movies.

    Parameters
    ----------
    user_id : label
        User to report on.
    table, similarity_table : pandas.DataFrame
        Rating matrix and similarity matrix passed on to ``get_recommendation``.
        The defaults are bound when this function is defined, so re-run this
        cell after rebuilding either frame.
    top_n : int, default 10
        Number of recommendations to print.
    show_hist : bool, default False
        Print every movie the user rated, highest rating first.
    show_rec : bool, default False
        Print recommendations.
    randomize : bool, default True
        Pick the ``top_n`` movies at random from the ``top_n * 10`` best
        predictions instead of always taking the very top; the random module
        is not seeded here, so output varies between runs.

    Reads the module-level ``ratings_df`` and ``movies_df``. Returns None.
    """
    if show_hist:
        print('-' * 50)
        print("User Movie History:\n")
        user_movies = ratings_df[ratings_df.userId == user_id]
        user_movies = user_movies.sort_values(by='rating', ascending=False)
        # Iterate the two columns directly: movie ids stay integers and each
        # catalogue row is looked up once.
        for movie_id, rating in zip(user_movies['movieId'], user_movies['rating']):
            movie = movies_df.loc[movie_id]
            print("Title: {}\nGenre: {}\nUser Rating: {}\n"
                  .format(movie['title'], movie['genres'], rating))

    if show_rec:
        rec_movies = get_recommendation(table, similarity_table, user_id).iloc[:top_n * 10]
        rec_movies_list = list(rec_movies.items())
        if randomize:
            shuffle(rec_movies_list)
        rec_movies_list = rec_movies_list[:top_n]
        rec_movies_list.sort(key=lambda pair: pair[1], reverse=True)
        print('-' * 50)
        print("Recommended Movies:\n")
        for movie_id, rating in rec_movies_list:
            movie = movies_df.loc[movie_id]
            print("Title: {}\nGenre: {}\nPredicted Rating: {}\n"
                  .format(movie['title'], movie['genres'], rating))


In [17]:
# Print user 10's rating history, highest-rated movies first.
show_recommendations(user_id=10, show_hist=True)

--------------------------------------------------
User Movie History:

Title: The Intern (2015)
Genre: Comedy
User Rating: 5.0

Title: First Daughter (2004)
Genre: Comedy|Romance
User Rating: 5.0

Title: Skyfall (2012)
Genre: Action|Adventure|Thriller|IMAX
User Rating: 5.0

Title: Dark Knight Rises, The (2012)
Genre: Action|Adventure|Crime|IMAX
User Rating: 5.0

Title: Troy (2004)
Genre: Action|Adventure|Drama|War
User Rating: 5.0

Title: King's Speech, The (2010)
Genre: Drama
User Rating: 5.0

Title: Notebook, The (2004)
Genre: Drama|Romance
User Rating: 5.0

Title: Despicable Me (2010)
Genre: Animation|Children|Comedy|Crime
User Rating: 5.0

Title: Education, An (2009)
Genre: Drama|Romance
User Rating: 5.0

Title: Batman Begins (2005)
Genre: Action|Crime|IMAX
User Rating: 5.0

Title: Spectre (2015)
Genre: Action|Adventure|Crime
User Rating: 5.0

Title: Casino Royale (2006)
Genre: Action|Adventure|Thriller
User Rating: 5.0

Title: Holiday, The (2006)
Genre: Comedy|Romance
User Rating

In [18]:
# Print recommendations for user 10; with the default randomize=True the list is a
# random sample of the best predictions, so it changes between runs.
show_recommendations(user_id=10, show_rec=True)

--------------------------------------------------
Recommended Movies:

Title: Lawrence of Arabia (1962)
Genre: Adventure|Drama|War
Predicted Rating: 5.0

Title: Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Genre: Mystery|Sci-Fi|Thriller
Predicted Rating: 5.0

Title: Shining, The (1980)
Genre: Horror
Predicted Rating: 4.0

Title: Moonstruck (1987)
Genre: Comedy|Romance
Predicted Rating: 4.0

Title: Die Hard (1988)
Genre: Action|Crime|Thriller
Predicted Rating: 4.0

Title: 101 Dalmatians (One Hundred and One Dalmatians) (1961)
Genre: Adventure|Animation|Children
Predicted Rating: 4.0

Title: Congo (1995)
Genre: Action|Adventure|Mystery|Sci-Fi
Predicted Rating: 4.0

Title: Outbreak (1995)
Genre: Action|Drama|Sci-Fi|Thriller
Predicted Rating: 4.0

Title: City Slickers II: The Legend of Curly's Gold (1994)
Genre: Adventure|Comedy|Western
Predicted Rating: 4.0

Title: Ghost (1990)
Genre: Comedy|Drama|Fantasy|Romance|Thriller
Predicted Rating: 3.0



## Item-Based Recommendation

In [19]:
# The item-item similarity matrix has one cell per pair of movieIds (columns of the transpose).
get_table_size(movie_user_table.T.columns)

Estimated Table Size: 721.41 MB


In [None]:
%%time
# Item-item variant: transposing puts movieIds in the columns, so similarities are
# computed between movies and pickled to 'item_similarity.dat'.
# NOTE(review): this cell shows no execution count or output here — presumably it was
# run in an earlier session; confirm before relying on the pickle loaded below.
find_column_similarity(movie_user_table.T, 'item_similarity.dat')

In [20]:
# Reload the item-item similarity matrix from 'item_similarity.dat'.
# NOTE: pickle.load can execute arbitrary code from the file; only load files
# produced by this notebook.
with open('item_similarity.dat', 'rb') as f:
    item_similarity_df = pickle.load(f)

In [21]:
# Inspect the item-item similarity matrix loaded from disk.
item_similarity_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.000000,0.330978,0.487109,1.000000,0.310971,0.106465,0.208402,0.968246,0.095913,-0.021409,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.330978,-1.000000,0.419564,0.000000,0.562791,0.163510,0.430261,0.415227,0.277350,0.016626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.487109,0.419564,-1.000000,0.000000,0.602266,0.345069,0.554088,0.333333,0.458591,-0.050276,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.000000,0.000000,0.000000,-1.000000,0.654654,0.000000,0.203653,0.000000,0.000000,0.870388,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.310971,0.562791,0.602266,0.654654,-1.000000,0.291302,0.609119,0.555556,0.319173,0.218263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.106465,0.163510,0.345069,0.000000,0.291302,-1.000000,-0.123897,-0.801784,-0.172891,0.420222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.208402,0.430261,0.554088,0.203653,0.609119,-0.123897,-1.000000,0.816497,0.745356,0.186891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.968246,0.415227,0.333333,0.000000,0.555556,-0.801784,0.816497,-1.000000,0.000000,0.557086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.095913,0.277350,0.458591,0.000000,0.319173,-0.172891,0.745356,0.000000,-1.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,-0.021409,0.016626,-0.050276,0.870388,0.218263,0.420222,0.186891,0.557086,0.000000,-1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dummy Code

In [9]:
%%time
res = np.zeros((no_of_users, no_of_users), dtype=np.float)
for i in range(no_of_users):
    for j in range(no_of_users):
        if i == j:
            res[i][j] = -1
        elif i > j:
            res[i][j] = res[j][i]
        else:
            common = find_common(movie_user_table, i+1, j+1)
            if len(common) > 0:
                sim = pearsonr(movie_user_table[i+1].iloc[common], 
                               movie_user_table[j+1].iloc[common])[0]
                if not pd.isna(sim):
                    res[i][j] = sim
similarity_df = pd.DataFrame(res, columns=np.arange(1, no_of_users+1), 
             index=np.arange(1, no_of_users+1))
with open('pearson.dat', 'wb') as f:
    pickle.dump(similarity_df, f)

  r = r_num / r_den


Wall time: 3min 39s


In [140]:
def get_recommendation2(user_based, person, similarity_table=None):
    """Rank the items ``person`` has not rated by similarity-weighted predicted rating.

    Parameters
    ----------
    user_based : pandas.DataFrame
        Rating matrix with items as rows and users as columns; NaN means "not rated".
    person : label
        Column label of the user to recommend for.
    similarity_table : pandas.DataFrame, optional
        User-user similarity matrix. When omitted, the module-level ``df`` is
        used, as the original implementation did (NOTE(review): ``df`` is not
        defined in any visible cell — confirm it exists or pass the matrix).

    Returns
    -------
    list of (float, label)
        ``(predicted_rating, item)`` pairs sorted from highest to lowest, built
        from the five most similar users. Items whose similarity weights sum to
        zero have no defined prediction and are left out.
    """
    if similarity_table is None:
        similarity_table = df
    sim_to_user = similarity_table[person].sort_values(ascending=False).iloc[:5]

    # Items the target user already rated; computed once rather than per inner iteration.
    already_rated = set(user_based[person].dropna().index)

    totals = {}
    sim_sums = {}
    for other, sim in sim_to_user.items():
        for item, rating in user_based[other].dropna().items():
            if item in already_rated:
                continue
            totals[item] = totals.get(item, 0) + rating * sim
            sim_sums[item] = sim_sums.get(item, 0) + sim

    rankings = [(total / sim_sums[item], item)
                for item, total in totals.items() if sim_sums[item] != 0]
    rankings.sort(reverse=True)
    return rankings

In [151]:
# Ranked (predicted rating, item) pairs for user 2.
# NOTE(review): `user_based` is not defined in any visible cell — presumably the
# movie x user rating matrix; confirm.
get_recommendation2(user_based, 2)

[(5.0, 98491),
 (5.0, 45499),
 (5.0, 40629),
 (5.0, 30816),
 (5.0, 27801),
 (5.0, 8636),
 (5.0, 6711),
 (5.0, 6618),
 (5.0, 6539),
 (5.0, 6333),
 (5.0, 5349),
 (5.0, 4446),
 (5.0, 3996),
 (5.0, 3793),
 (5.0, 2943),
 (5.0, 2916),
 (5.0, 2872),
 (5.0, 2580),
 (5.0, 2431),
 (5.0, 2357),
 (5.0, 2273),
 (5.0, 2193),
 (5.0, 2161),
 (5.0, 2105),
 (5.0, 2028),
 (5.0, 1923),
 (5.0, 1917),
 (5.0, 1719),
 (5.0, 1678),
 (5.0, 1673),
 (5.0, 1653),
 (5.0, 1639),
 (5.0, 1617),
 (5.0, 1545),
 (5.0, 1446),
 (5.0, 1372),
 (5.0, 1356),
 (5.0, 1280),
 (5.0, 1265),
 (5.0, 1249),
 (5.0, 1245),
 (5.0, 1210),
 (5.0, 1097),
 (5.0, 1060),
 (5.0, 994),
 (5.0, 919),
 (5.0, 914),
 (5.0, 858),
 (5.0, 838),
 (5.0, 613),
 (5.0, 593),
 (5.0, 590),
 (5.0, 562),
 (5.0, 552),
 (5.0, 527),
 (5.0, 509),
 (5.0, 480),
 (5.0, 457),
 (5.0, 446),
 (5.0, 434),
 (5.0, 428),
 (5.0, 380),
 (5.0, 367),
 (5.0, 337),
 (5.0, 329),
 (5.0, 316),
 (5.0, 307),
 (5.0, 296),
 (5.0, 265),
 (5.0, 260),
 (5.0, 39),
 (5.0, 36),
 (5.0, 28),
 (5.0

In [8]:
def sim_distance(prefs,person1,person2):
    """Distance-based similarity between two columns of ``prefs``.

    Only rows where both ``person1`` and ``person2`` have a value are compared.
    Returns 0 when there are no such rows, otherwise ``1 / (1 + s)`` where ``s``
    is the sum of squared differences over the shared rows (1 means identical).
    """
    shared = find_common(prefs, person1, person2)
    if len(shared) == 0:
        return 0

    differences = prefs[person1].iloc[shared] - prefs[person2].iloc[shared]
    squared_total = sum(differences ** 2)
    return 1 / (1 + squared_total)

In [9]:
def custom_pearsonr(x1, x2):
    """Pearson correlation of two equal-length numeric arrays, from running sums.

    Returns 0 when the denominator is exactly zero (for instance when one input
    is constant); otherwise returns the correlation coefficient.
    """
    n = len(x1)

    total_1, total_2 = np.sum(x1), np.sum(x2)
    squares_1 = np.sum(x1 ** 2)
    squares_2 = np.sum(x2 ** 2)
    cross_products = np.sum(x1 * x2)

    # Numerator: n-scaled covariance; denominator: product of n-scaled standard deviations.
    numerator = cross_products - (total_1 * total_2 / n)
    spread_1 = squares_1 - total_1 ** 2 / n
    spread_2 = squares_2 - total_2 ** 2 / n
    denominator = (spread_1 * spread_2) ** 0.5

    if denominator == 0:
        return 0
    return numerator / denominator

In [89]:
%%time
res = np.zeros((no_of_users, no_of_users), dtype=np.float)
for i in range(no_of_users):
    for j in range(no_of_users):
        if i == j:
            res[i][j] = -1
        elif i > j:
            res[i][j] = res[j][i]
        else:
            res[i][j] = pearson_for_movies(user_based, i+1, j+1)
df1 = pd.DataFrame(res, columns=np.arange(1, no_of_users+1), 
             index=np.arange(1, no_of_users+1))
with open('custom_pearson.dat', 'wb') as f:
    pickle.dump(df1, f)

Wall time: 4min 27s


In [98]:
# Reload the matrix pickled to 'custom_pearson.dat'.
# NOTE: pickle.load can execute arbitrary code from the file; only load files
# produced by this notebook.
with open('custom_pearson.dat', 'rb') as f:
    df1 = pickle.load(f)

In [99]:
# Inspect the reloaded matrix.
df1

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,-1.000000,0.000000,0.079819,0.207983,0.268749,-0.291636,-0.118773,0.469668,0.918559,-0.037987,...,0.091574,0.000000,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.268070,-0.175412,-0.032086
2,0.000000,-1.000000,0.000000,0.000000,0.000000,0.000000,-0.991241,0.000000,0.000000,0.037796,...,-0.387347,0.000000,-1.000000,0.000000,0.000000,0.583333,0.000000,-0.125000,0.000000,0.623288
3,0.079819,0.000000,-1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.433200,0.000000,0.000000,-0.791334,-0.333333,-0.395092,0.000000,0.569562
4,0.207983,0.000000,0.000000,-1.000000,-0.336525,0.148498,0.542861,0.117851,0.000000,0.485794,...,-0.222113,0.396641,0.090090,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.277350,-0.043786
5,0.268749,0.000000,0.000000,-0.336525,-1.000000,0.043166,0.158114,0.028347,0.000000,-0.777714,...,0.000000,0.153303,0.234743,0.067791,-0.364156,0.244321,0.231080,-0.020546,0.384111,0.040582
6,-0.291636,0.000000,0.000000,0.148498,0.043166,-1.000000,-0.126595,-0.200062,0.000000,0.957427,...,-0.292770,-0.027495,-0.112141,-0.090297,0.100735,-0.049192,0.255639,0.125428,0.193649,0.115580
7,-0.118773,-0.991241,0.000000,0.542861,0.158114,-0.126595,-1.000000,0.220416,0.925000,-0.156764,...,-0.280496,0.110432,0.343649,0.560968,0.416186,0.137771,0.402792,0.008081,0.420288,0.341233
8,0.469668,0.000000,0.000000,0.117851,0.028347,-0.200062,0.220416,-1.000000,0.000000,-1.000000,...,1.000000,0.190902,0.249252,0.534375,-0.217405,0.253582,0.251280,0.434423,0.141860,0.167931
9,0.918559,0.000000,0.000000,0.000000,0.000000,0.000000,0.925000,0.000000,-1.000000,0.000000,...,0.577350,0.000000,0.376715,0.000000,0.158777,0.572700,0.000000,0.336625,0.000000,0.615638
10,-0.037987,0.037796,0.000000,0.485794,-0.777714,0.957427,-0.156764,-1.000000,0.000000,-1.000000,...,-0.397276,-0.944911,-0.424631,-1.000000,0.268684,-0.382955,-0.241121,-0.571043,0.000000,-0.205081
