In [1]:
import numpy as np
import pandas as pd

# set some print options
np.set_printoptions(precision=4)
np.set_printoptions(threshold=5)
np.set_printoptions(suppress=True)
pd.set_option('precision', 3, 'notebook_repr_html', True, )

# init random gen
np.random.seed(2)

## Import Data

In [2]:
users = pd.read_table('data/ml-1m/users.dat',sep='::',header = None,
                      names = ['user_id','gender','age','occupation','zip'],engine='python')

ratings = pd.read_table('data/ml-1m/ratings.dat',sep='::',header = None,
                      names = ['user_id','movie_id','rating','timestamp'],engine='python')

movies = pd.read_table('data/ml-1m/movies.dat',sep='::',header = None,
                      names = ['movie_id','title','genres'],engine='python')

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
#Merge

movielens = pd.merge(pd.merge(ratings,users),movies)
movielens.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


## Training and Testing

1. Use a smaller dataset for testing purposes
2. Split data into train and test datasets

In [4]:
# Subset
df = movielens.ix[np.random.choice(movielens.index,size=10000,replace=False)]
print (movielens.shape)
print (df.shape)

(1000209, 10)
(10000, 10)


In [5]:
#Make sure that there are at least two ratings per user
twousers = pd.value_counts(df.user_id, sort=False) > 1
twousers2 = twousers[twousers].index

df1 = df.select(lambda x: df.loc[x,'user_id'] in twousers2)

In [6]:
df1['genres'] = df1.genres.apply(lambda x : x.split('|'))

In [8]:
def assign_test_set(df):
    sampled_ids = np.random.choice(df.index,size = np.int64(np.ceil(df.index.size*0.2)), replace = False)
    
    df.ix[sampled_ids, 'for_testing'] = True
    
    return df

In [9]:
df1['for_testing'] = False
grouped = df1.groupby('user_id',group_keys=False).apply(assign_test_set)

In [10]:
movielens_train = df1[grouped['for_testing']==False]
movielens_test = df1[grouped['for_testing']==True]
print (movielens_train.shape)
print (movielens_test.shape)
print (df1.shape)

(5801, 11)
(2641, 11)
(8442, 11)


In [11]:
#Store the data
movielens_train.to_csv('data/my_generated_movielens_train.csv')
movielens_test.to_csv('data/my_generated_movielens_test.csv')

## Evaluation Criterion

### RMSE

In [18]:
def rmse(y_pred,y_true):
    """ Compute Mean Squared Error """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

### Evaluation

In [23]:
def evaluate(predictor, evaluator=rmse, similarityfunc = pearson):
    """ Evaluates the predictor using the evaluation function """
    
    ids_to_estimate = zip(movielens_test.user_id,movielens_test.movie_id)
    estimated = np.array([predictor(u,i,similarityfunc) for u,i in ids_to_estimate])
    true = movielens_test.rating.values
    
    return evaluator(estimated,true)

In [None]:
def predictor(user_id,movie_id):
    """ Single Value Prediction using the optimized prediction algorithm """
    
    return 2

In [None]:
evaluate(predictor)

### Simple Recommendation Engine - Ratings' Mean

In [None]:
def content_mean(user_id,movie_id):
    """ Takes mean of every movie rating over the target user """
    
    x = movielens_train.user_id == user_id
    return movielens_train.loc[x,'rating'].mean()

print ("RMSE score for contect mean method = {}".format(evaluate(content_mean)))

In [None]:
df1.head(1)

## Mean + Gender based filtering

In [None]:
user_info = users.set_index('user_id')


def collab_gender(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_gender = ratings_by_others.pivot_table('rating', index='movie_id', columns='gender')
    user_gender = user_info.ix[user_id, 'gender']
    if user_gender in means_by_gender.columns: 
        return means_by_gender.ix[movie_id, user_gender]
    else:
        return means_by_gender.ix[movie_id].mean()

print ('RMSE for collab_gender: {}'.format(evaluate(collab_gender)))

### Class with an implicit functionsfor aggregation

In [None]:
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_gender = movielens_train.pivot_table('rating', index='movie_id', columns='gender')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_gender.index: 
            return 3.0
        
        user_gender = user_info.ix[user_id, 'gender']
        if ~np.isnan(self.means_by_gender.ix[movie_id, user_gender]):
            return self.means_by_gender.ix[movie_id, user_gender]
        else:
            return self.means_by_gender.ix[movie_id].mean()

reco = CollabGenderReco()
reco.learn()
print ('RMSE for CollabGenderReco: {}'.format(evaluate(collab_gender)))

## Mean + Genre based filtering

In [None]:
def listmatcher(list1,list2):
    """ Compares two lists are created fraction depending on their common elements """
    l = []
    for i in list1:
        for j in list2:
                if i == j:
                    l.append(i)
    
    return len(l)*1.0/len(list1)

def collab_genre(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    user_condition2 = movielens_train.user_id == user_id
    
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    user_genres = movielens_train.loc[user_condition2 & movie_condition, 'genres'] # User's genres
    
    means_by_gender = ratings_by_others.pivot_table('rating', index='movie_id', columns='gender')
    user_gender = user_info.ix[user_id, 'gender']
    if user_gender in means_by_gender.columns: 
        return means_by_gender.ix[movie_id, user_gender]
    else:
        return means_by_gender.ix[movie_id].mean()

print ('RMSE for collab_gender: {}'.format(evaluate(collab_gender)))

In [None]:
user_id = 1527
movie_id = 2443

In [None]:
def listmatcher(list1,list2):
    """ Compares two lists are created fraction depending on their common elements """
    l = []
    for i in list1:
        for j in list2:
                if i == j:
                    l.append(i)
    
    return len(l)*1.0/len(list1)

In [None]:
user_condition = movielens_train.user_id != user_id
movie_condition = movielens_train.movie_id == movie_id
user_condition2 = movielens_train.user_id == user_id

In [None]:
ratings_by_others = movielens_train.loc[user_condition]

In [None]:
user_genres = movielens_train.loc[user_condition2 & movie_condition, 'genres'].iloc[0]

In [None]:
# ratings_by_others['GenreScore'] = 0.0
# for i in range(len(ratings_by_others)):
#     ratings_by_others['GenreScore'].iloc[i] = listmatcher(user_genres,ratings_by_others['genres'].iloc[i])

In [None]:
# ratings_by_others

## Custom Similarity Functions

** Custom similarity functions can be used to measure 'similarity' betweeen two persons for collaborative recommender systems **
* Euclidean Similarity
$$ sim(x,y) = \frac{1}{1 + \sqrt{\sum (x - y)^2}}$$

In [12]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

- Cosine similarity

$$ sim(x,y) = \frac{(x . y)}{\sqrt{(x . x) (y . y)}} $$

In [13]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

- Pearson correlation

$$ sim(x,y) = \frac{(x - \bar x).(y - \bar y)}{\sqrt{(x - \bar x).(x - \bar x) * (y - \bar y)(y - \bar y)}} $$

In [14]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

- Jaccard similarity

$$ sim(x,y) = \frac{(x . y)}{(x . x) + (y . y) - (x . y)} $$

In [15]:
def jaccard(s1, s2):
    dotp = np.sum(s1 * s2)
    return dotp / (np.sum(s1 ** 2) + np.sum(s2 ** 2) - dotp)

def binjaccard(s1, s2):
    dotp = (s1.index & s2.index).size
    return dotp / (s1.sum() + s2.sum() - dotp)

In [24]:
class CollabReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = df1.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id, similarityfunc = pearson):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: similarityfunc(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabReco()
reco.learn()
print ('RMSE for Pearson - CollabReco: {}'.format(evaluate(reco.estimate)))
print ('RMSE for Euclidean - CollabReco: {}'.format(evaluate(reco.estimate,similarityfunc=euclidean)))
print ('RMSE for Cosine - CollabReco: {}'.format(evaluate(reco.estimate,similarityfunc=cosine)))



RMSE for Pearson - CollabReco: 1.08629365954674
RMSE for Euclidean - CollabReco: 1.0197094450339468
RMSE for Cosine - CollabReco: 1.1464737191831096
