In [1]:
import matplotlib.pylab as plt

%matplotlib inline 
plt.style.use('seaborn-whitegrid')
plt.rc('text', usetex=True)
plt.rc('font', family='times')
plt.rc('xtick', labelsize=10) 
plt.rc('ytick', labelsize=10) 
plt.rc('font', size=12) 

In [2]:
import pandas as pd
import numpy as np
import math
import matplotlib.pylab as plt
from math import isnan


# Load Data set
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../data/ml-100k/u.user', sep='|', names=u_cols, encoding='iso-8859-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('../data/ml-100k/u.data', sep='\t', names=r_cols,  encoding='iso-8859-1')

# the movies file contains columns indicating the movie's genres
# let's only load the first three columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date']
movies = pd.read_csv('../data/ml-100k/u.item', sep='|', names=m_cols, usecols=range(3), encoding='iso-8859-1')

# Construccion del DataFrame
data = pd.merge(pd.merge(ratings, users), movies)
data = data[['user_id','title', 'movie_id','rating','release_date','sex','age']]


print("The movielens database has\n"
    +"    " + str(data.shape[0]) + " ratings\n"
    +"      ", data.user_id.nunique(),"users\n"
    +"     ", data.movie_id.nunique(), "movies.")

print(data.head())

The movielens database has
    100000 ratings
       943 users
      1682 movies.
   user_id         title  movie_id  rating release_date sex  age
0      196  Kolya (1996)       242       3  24-Jan-1997   M   49
1      305  Kolya (1996)       242       5  24-Jan-1997   M   23
2        6  Kolya (1996)       242       4  24-Jan-1997   M   42
3      234  Kolya (1996)       242       4  24-Jan-1997   M   60
4       63  Kolya (1996)       242       3  24-Jan-1997   M   31


In [3]:
from multiprocessing import Pool

def par_similarity_calc(sim_obj, user_list):
    for person1 in user_list:
        sim.setdefault(person1, {})
        a = data_train[data_train['user_id']==person1][['movie_id']]
        data_reduced = pd.merge(data_train, a, on='movie_id')
        for person2 in allUsers:
            # no es comparem am nosalres mateixos => we don’t compare ourselves to ourselves
            if person1 == person2: 
                continue
            self.sim.setdefault(person2, {})
            if(person1 in self.sim[person2] ):
                continue # since is a simetric matrix
            sim=self.sim_method(data_reduced, person1, person2, self.min_common_items)
            #print person1,person2,sim
            if(sim<0):
                self.sim[person1][person2] = 0
                self.sim[person2][person1] = 0
            else:
                self.sim[person1][person2] = sim
                self.sim[person2][person1] = sim

    
class CollaborativeFiltering3:
    """ Collaborative filtering using a custom sim(u,u'). """
    
    def __init__(self,DataFrame, similarity=SimPearson,min_common_items=10,max_sim_users=10):
        """ Constructor """
        self.sim_method=similarity# Gets recommendations for a person by using a weighted average
        self.df=DataFrame
        self.sim = pd.DataFrame(np.sum([0]), columns=data_train.user_id.unique(), \
                                index=data_train.user_id.unique())
        self.min_common_items=min_common_items
        self.max_sim_users=max_sim_users

    def learn(self):
        """ Prepare data structures for estimation. Similarity matrix for users """
        allUsers = set(self.df['user_id'])
        self.sim = {}
        pool = Pool(processes=12)
        pool.map(par_similarity_calc, allUsers)
        for person1 in allUsers:
            self.sim.setdefault(person1, {})
            a = data_train[data_train['user_id']==person1][['movie_id']]
            data_reduced = pd.merge(data_train, a, on='movie_id')
            for person2 in allUsers:
                # no es comparem am nosalres mateixos => we don’t compare ourselves to ourselves
                if person1 == person2: 
                    continue
                self.sim.setdefault(person2, {})
                if(person1 in self.sim[person2] ):
                    continue # since is a simetric matrix
                sim=self.sim_method(data_reduced, person1, person2, self.min_common_items)
                #print person1,person2,sim
                if(sim<0):
                    self.sim[person1][person2] = 0
                    self.sim[person2][person1] = 0
                else:
                    self.sim[person1][person2] = sim
                    self.sim[person2][person1] = sim
                
        self.mean_ratings = data_train[['user_id','movie_id','rating']] \
                .groupby('user_id')['rating'] \
                .mean()
                
                
    def estimate(self, user_id, movie_id):
        
        totals={}
        movie_users=self.df[self.df['movie_id'] ==movie_id]
        rating_num=0.0
        rating_den=0.0
        allUsers=set(movie_users['user_id'])
        listOrdered=sorted([(self.sim[user_id][other],other) for other in allUsers if user_id!=other],reverse=True)
        
        for item in range(min(len(listOrdered),self.max_sim_users)):
            other=listOrdered[item][1]
            rating_num += self.sim[user_id][other] * (float(movie_users[movie_users['user_id']==other]['rating']-self.mean_ratings[other]))
            rating_den += self.sim[user_id][other]
        if rating_den==0: 
            if self.df.rating[self.df['movie_id']==movie_id].mean()>0:
                # return the mean movie rating if there is no similar for the computation
                return self.df.rating[self.df['movie_id']==movie_id].mean()
            else:
                # else return mean user rating 
                return self.df.rating[self.df['user_id']==user_id].mean()
        return self.mean_ratings[user_id]+rating_num/rating_den

NameError: name 'SimPearson' is not defined