# Personalized Page Rank

Members: Peter Weber, Huang Chen

### Data import

In [102]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import cProfile
from IPython.core.debugger import set_trace

In [2]:
training_data = pd.read_csv('./data/training.csv', sep=',')

urm = pd.pivot_table(training_data[['user_id','item_id','rating']],columns='item_id',index='user_id',values='rating')
urm.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,-9.912859,,,-8.459001,-9.753888,,,,...,,,,,,,,,,
1,,,5.489137,4.388006,,,,,8.833517,,...,,,0.245176,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,6.463533,,,...,,,,,,,,,,
4,8.866705,,-4.101858,,0.853917,,,3.894698,,,...,,,,,,,2.869197,,1.381025,


In [3]:
training_data.head()

Unnamed: 0,user_id,item_id,rating
0,13291,98,-0.670408
1,19559,8,1.436404
2,32928,50,1.711739
3,34459,29,-10.0
4,68339,19,4.27797


In [4]:
target_user_items = pd.read_csv('./data/target_user_items.csv')
target_user_items.shape

target_user_items.head()

target_users = target_user_items.user_id.values
train_users = training_data.user_id.values

missing_users_bool = np.array((1-np.in1d(target_users, train_users)), dtype = bool)
missing_users = target_users[missing_users_bool]
missing_users

array([72797, 63003, 63003, 67871, 67871, 48845, 54070, 70413, 70413,
       63176, 63176, 64052, 66872, 57741, 57741, 56801, 53899, 62624,
       57485, 53066, 63480, 66606, 66606, 49873, 58710])

In [5]:
sample_submission = pd.read_csv('./data/submision_sample.csv')
sample_submission.head()

Unnamed: 0,id,rating
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


### Helper funcitons

In [98]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

def index2user(index, UI):
    user_index = np.arange(len(UI.index))
    users = dict(zip(user_index,UI.index))
    return(users[index])

def user2index(user, UI):
    user_index = np.arange(len(UI.index))
    user_index2id = dict(zip(UI.index, user_index))
    return(user_index2id[user])

def index2item(index, UI):
    item_index = np.arange(len(UI.columns))
    items = dict(zip(item_index,UI.columns))
    return(items[index])

def item2index(item, UI):
    item_index = np.arange(len(UI.columns))
    item_index2id = dict(zip(UI.columns, item_index))
    return(item_index2id[item])

#index2item(99, urm)
#item2index(100, urm)
#index2user(73338, urm)
#user2index(73420, urm)
user2index(0,urm)

0

### Construct Matrices

In [80]:
# Now, we construct the matrix S which we later use to build the restart vector.
def construct_pagerank_matrix_weighted(UI):
    """
    input
        - UI: User-Item matrix
    output
        - UI and UI_T, both column stochastic, and with values between 0 and 1
    """
    ## Normalize to values between 0 and 1
    UI_fill = UI.fillna(UI.min())    
    UI_norm = UI_fill + abs(UI.min())
    
    ## Make matrix column stochastic
    col_sums = UI_norm.sum(axis = 0)
    row_sums = UI_norm.sum(axis = 1)
    UI_col_stoch = UI_norm/col_sums
    UI_T_col_stoch = (UI_norm.T)/row_sums
    
    return(UI_col_stoch.fillna(0), UI_T_col_stoch.fillna(0))

# We contrust the matrix A.
def construct_pagerank_matrix_binary(UI):
    """
    input
        - UI: User-Item matrix
    output
        - UI and UI_T, both column stochastic, and having all equal values per column
    """
    UI_binary = UI.notnull().astype('int')
    
    ## Make matrix column stochastic
    col_sums = UI_binary.sum(axis = 0)
    row_sums = UI_binary.sum(axis = 1)

    UI_col_stoch = UI_binary/col_sums
    UI_T_col_stoch = (UI_binary.T)/row_sums
    
    return(UI_col_stoch.fillna(0), UI_T_col_stoch.fillna(0))

### Power Method

In [83]:
# With the given matrix A,At and the restart vector from the matrix S, 
# we use Power Method to compute similarity between users
def solve_power_method_uu(A, A_T, restart_vec, alpha, tol, max_iter = 1000, print_conv = False):
    """
    There are two systems of equations, the first is the upper right from lectures
    and the second is the lower left from lectures
    Solve them separately and restart in the item step
    
    Inputs
        - A: pagerank matrix, column stochastic 
        - A_T: its inverse, column stochastic
        - restart vec: user individual restart vector
        
    Outputs
        - x_u_p1: user_vector of most similar users
        - x_i_p1: item_vector of most liked items for user defined by restart vector
    """
    #set_trace()
    
    iter_ = 0
    users, items = A.shape
    
    ## TODO: user initialization vector from user
    x_u = np.random.random((users,1))
    x_u = x_u/np.linalg.norm(x_u)
    
    while iter_ <= max_iter:
        iter_ += 1
        ## computation upper right
        x_u_p1 = A.dot((1-alpha) * A_T.dot(x_u) + alpha * restart_vec)
        x_u_p1 = x_u_p1/np.linalg.norm(x_u_p1)
        
        ## calculate iter error
        error = np.linalg.norm(x_u - x_u_p1)
        
        ## convergence criteria
        if error < tol:
            if print_conv:
                print("Convergence!")
            break
            
        if iter_ == max_iter:
            print("PM did not converge!!!")
         
        ## set intial values in loop
        x_u = x_u_p1
        
    return(x_u_p1)

### Solve personalized pagerank

In [147]:
# Method that construct the necesary matrices and then use them to calculate users similarities vector for each user.
def solve_PPR_uu(UI, user_ids, alpha, tol, missing_users):
    """
    inputs:
        - UI: user-item matrix
        - user_ids: all user_ids of set that we want to predict
        - alpha: weight param in power method
        - tol: tolerance
        - missing_users: users that exist in target set but not in training set
        
    outputs:
        - user_similarity: user similiraity matrix for all users in target/prediction set
    """    
        
        
    num_users, num_items = UI.shape
    
    ## Pagerang matrix
    A, A_T = construct_pagerank_matrix_binary(UI)  
    A, A_T = A.values.copy(), A_T.values.copy()
    
    ## Personalization matrix for restart vector
    _, E_T = construct_pagerank_matrix_weighted(UI)
    E_T = E_T.values.copy()
    
    user_similarity = {}
    
    counter = 0
    
    existing_users = set(user_ids) - set(missing_users)
    
    for user in existing_users:
        counter += 1
                    
        if float(counter % 100) == 0.0:
            print(counter)
        index = user2index(user, UI)
        
        ## restart vector
        restart_vec = E_T[:,index]
        restart_vec = restart_vec.reshape(-1 ,1)
        
        x_u = solve_power_method_uu(A, A_T, restart_vec, alpha, tol)
        user_similarity[user] = x_u.flatten()
    return(user_similarity)

#user_ids = urm.index[0:-1:10000]
#user_sim = solve_PPR_uu(urm, user_ids, 0.5, 1e-6)

In [26]:
def get_k_most_similar_users_dict(user_id, user_sim, UI, k, return_indices = True):
    """
    Inputs:
        - user_id: id of user in UI
        - UU: User-user similarity matrix, output of solve_PPR_uu
        - UI: user-item matrix
        - k: number of most similar users
    Outputs:
        - user_array: array of similarity scores for user with user_idx
        - sorted_users: user_ids of k most similar users
    """
    #user_idx = user2index(user_id, UI)
    user_sim_array = user_sim[user_id]
    sorted_indices = user_sim_array.argsort()[-k:][::-1]
    if return_indices:
        return(user_sim_array, np.array(sorted_indices))
    else:
        sorted_users = [index2user(u_idx, UI) for u_idx in sorted_indices]
        return(user_sim_array, np.array(sorted_users))
    
#get_k_most_similar_users_dict(10000, user_sim, urm, k = 100)

In [151]:
def predict_dict(user_id, item_id, user_sim, UI, k, missing_users):
    """
    This prediction function uses standard prediction equation such as Eq. 2.2
    on page 52 in Aggarwal, Recommender Systems
    
    inputs:
        - user_id, item_id
        - user_sim: output of solve_PPR_uu
        - UI: user-item matrix from dataset
        - k: most similar users
        - missing_users: user that exist in the target set, but not in the training set
        
    output:
        - rating for given user_id, item_id
    """
    
    if user_id in missing_users:
        return(pd.Series(0))
    
    similarities, user_indices = get_k_most_similar_users_dict(user_id, user_sim, UI, k)
    user_idx = user2index(user_id, UI)
    item_idx = item2index(item_id, UI)
    
    rating_num = 0.0
    rating_den = 0.0
    
    user_mean = UI[UI.index == user_id].mean(skipna = True, axis = 1)

    for other_idx in user_indices:
        if other_idx == user_idx: continue
        other_mean = UI.iloc[other_idx, :].mean(skipna = True)
        other_rating = UI.iloc[other_idx, item_idx]
        if not pd.isnull(other_rating):
            rating_num += similarities[other_idx] * (other_rating - other_mean)
        rating_den += similarities[other_idx]
    
    if rating_den == 0.0 or rating_num == 0.0:
        prediction = user_mean
    else:
        prediction = user_mean + rating_num/rating_den
    return(prediction)


#cProfile.run('predict(1, 1, user_user, urm)')
#predict_dict(user_id = 10000, item_id = 1, user_sim = user_sim, UI = urm, k = 100, missing_users = missing_users)

### Train/Test split

In [125]:
_, test = train_test_split(training_data, test_size = 0.0005, random_state = 4)

In [126]:
test.shape

(475, 3)

In [127]:
def set_test_ratings_to_na(UI, test):
    UI_ = UI.copy()
    counter = 0
    for user_id, item_id in zip(test.user_id.values, test.item_id.values):
        counter += 1
        user_idx = user2index(user_id, UI)
        item_idx = item2index(item_id, UI)
        UI_.iloc[user_idx, item_idx] = np.NaN
        if float(counter % 100.0) == 0.0:
            print(counter)
    return(UI_)    

train = set_test_ratings_to_na(urm, test)
train.head()

100
200
300
400


item_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,-9.912859,,,-8.459001,-9.753888,,,,...,,,,,,,,,,
1,,,5.489137,4.388006,,,,,8.833517,,...,,,0.245176,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,6.463533,,,...,,,,,,,,,,
4,8.866705,,-4.101858,,0.853917,,,3.894698,,,...,,,,,,,2.869197,,1.381025,


## Training

In [132]:
user_user = solve_PPR_uu(train, user_ids = test.user_id, alpha = 0.1, tol = 1e-8)

100
200
300
400


## Testing

In [133]:
def predict_ratings_on_test_set(test, user_user_sim, k):
    """
    inputs:
        - test: test_set or set to predict on
        - user_user_sim: output of solve_PPR_uu
        
    outputs:
        rating_pred: vector of ratings for test set
    
    """
    rating_pred = []
    counter = 0
    for user_id, item_id in zip(test.user_id, test.item_id):
        counter += 1
        rating_pred.append(predict_dict(user_id, item_id, user_user_sim, train, k, missing_users).max(skipna=True))
        if float(counter % 200) == 0.0:
            print(counter)
    return(rating_pred)

In [136]:
rating_pred_ = predict_ratings_on_test_set(test, user_user, k = 50)

200
400


In [137]:
test.rating
compute_rmse(rating_pred_, test.rating)

4.4993274909401872

## Prediction for submission

In [None]:
user_user_submission = solve_PPR_uu(urm, user_ids = target_user_items.user_id, alpha = 0.2, tol = 1e-6, 
                                    missing_users = missing_users)

In [152]:
rating_submission = predict_ratings_on_test_set(target_user_items, user_user_submission, k = 50)

200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
10000
10200
10400
10600
10800
11000
11200
11400
11600
11800
12000
12200
12400
12600
12800
13000
13200
13400
13600
13800
14000
14200
14400
14600
14800
15000
15200
15400
15600
15800
16000
16200
16400
16600
16800
17000
17200
17400
17600
17800
18000
18200
18400
18600
18800
19000
19200
19400
19600
19800
20000
20200
20400
20600
20800
21000
21200
21400
21600
21800
22000
22200
22400
22600
22800
23000
23200
23400
23600
23800
24000
24200
24400
24600
24800
25000
25200
25400
25600
25800
26000
26200
26400
26600
26800
27000
27200
27400
27600
27800
28000
28200
28400
28600
28800
29000
29200
29400
29600
29800
30000
30200
30400
30600
30800
31000
31200
31400
31600
31800
32000
32200
32400
32600
32800
33000
33200
33400
33600
33800
34000
34200
34400
34600
34800
35000
352

In [153]:
sample_submission.tail()

Unnamed: 0,id,rating
49995,49995,0
49996,49996,0
49997,49997,0
49998,49998,0
49999,49999,0


In [154]:
submission = pd.DataFrame({
    'id': np.arange(sample_submission.shape[0]),
    'rating': rating_submission
})

submission.tail()

Unnamed: 0,id,rating
49995,49995,1.111051
49996,49996,0.256974
49997,49997,3.883351
49998,49998,1.377812
49999,49999,-0.074726


In [155]:
submission.to_csv('submission.csv', index = False)

### This gave our Kaggle result of 4.69

# Item-item based CF

**We use this as base line. Item-item CF gave us 4.80 on Kaggle! **

In [46]:
# Use the movielens dataset with 100,000 ratings
%autosave 150
%matplotlib inline
import operator
import pandas as pd
import numpy as np
import math
import random
import matplotlib.pyplot as plt; plt.rcdefaults()
import cProfile
from timeit import default_timer as timer

Autosaving every 150 seconds


In [37]:
# Number of items
N = 10
# Number of nearest neighbors
NN = N

In [22]:
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean

def pearson_similarity_items(df, item1, item2, min_common_users=1):
    """
    Returns a Pearson correlation score for item1 and item2
    """    
     # GET USERS OF ITEM1
    users_item1 = df[df['item_id'] == item1]
    # GET USERS OF ITEM2
    users_item2 = df[df['item_id'] == item2]
    
    # FIND SHARED USERS
    users_common = pd.merge(users_item1, users_item2, on = 'user_id')
    if len(users_common)==0:
        return 0    
    if len(users_common)<min_common_users:
        return 0    
    corr=pearsonr(users_common['rating_x'],users_common['rating_y'])[0]
    if np.isnan(corr):
        return 0
    return corr

def cos_similarity_items(df, item1, item2, min_common_users=1):
    """
    Returns a cosine similarity score for item1 and item2
    """        
    # GET USERS OF ITEM1
    users_item1 = df[df['item_id'] == item1]
    
    # GET USERS OF ITEM2
    users_item2 = df[df['item_id'] == item2]
    
    # FIND SHARED USERS
    users_common = pd.merge(users_item1, users_item2, on = 'user_id')
    if len(users_common)==0:
        return 0    
    if(len(users_common)<min_common_users):
        return 0  

    num = users_common['rating_x'].dot(users_common['rating_y'])
    den = np.sqrt(users_common['rating_x'].dot(users_common['rating_x'])*\
                  users_common['rating_y'].dot(users_common['rating_y']))
    cos_sim = num/den
    if(np.isnan(cos_sim)):
        return 0
    return cos_sim


def adjcos_similarity_items(df_, item1, item2, min_common_users=1):
    """
    Returns an adjusted cosine similarity score for item1 and item2
    """
    df = df_.copy()
    user_means = df.groupby(['user_id'], axis=0)['rating'].transform('mean')
    df['rating'] = df['rating'] - user_means
    
    # GET USERS OF ITEM1
    users_item1 = df[df['item_id'] == item1]
    
    # GET USERS OF ITEM2
    users_item2 = df[df['item_id'] == item2]
    
    # FIND SHARED USERS
    u_common = pd.merge(users_item1, users_item2, on = 'user_id')
    if len(u_common)==0:
        return 0    
    if(len(u_common)<min_common_users):
        return 0 
    
    num = u_common['rating_x'].dot(u_common['rating_y'])
    den = np.sqrt(u_common['rating_x'].dot(u_common['rating_x'])*u_common['rating_y'].dot(u_common['rating_y']))
    adjcos = num/den
    if(np.isnan(adjcos)):
        return 0
    return adjcos

In [55]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

def evaluate(estimate_f,data_train,data_test):
    """ RMSE-based predictive performance """
    ids_to_estimate = zip(data_test.user_id, data_test.item_id)
    estimated = np.array([estimate_f(u,i) if u in data_train.user_id else 0 for (u,i) in ids_to_estimate ])
    real = data_test.rating.values
    return compute_rmse(estimated, real)

def evaluate_k(estimate_f,data_train,data_test,k):
    """ RMSE-based predictive performance. Takes the number k of nearest neighbors as input """
    ids_to_estimate = zip(data_test.user_id, data_test.item_id)
    estimated = np.array([estimate_f(u,i,k) if u in data_train.user_id else 0 for (u,i) in ids_to_estimate ])
    real = data_test.rating.values
    return compute_rmse(estimated, real)

In [143]:
class CollaborativeFiltering:
    """ Collaborative filtering using a custom sim(i,i'). """
    
    def __init__(self,df, similarity=adjcos_similarity_items):
        """ Constructor """
        self.sim_method=similarity# Gets recommendations for a person by using a weighted average
        self.df=df
        self.sim={}   
        
    def get_sim(self):
        """ Return similarity for debugging reasons """ 
        return self.sim    
        
    def train(self):
        """ Prepare data structures for estimation. Similarity matrix for items """
        all_items = set(self.df['item_id'])
        for item1 in all_items:
            self.sim.setdefault(item1, {})
            a=data_train[data_train['item_id']==item1][['user_id']]
            data_reduced=pd.merge(data_train,a,on='user_id')
            for item2 in all_items:
                
                if item1==item2: continue
                self.sim.setdefault(item2, {})
                if(item1 in self.sim[item2]):continue
                sim=self.sim_method(data_reduced,item1,item2)
                if(sim<0):
                    self.sim[item1][item2]=0
                    self.sim[item2][item1]=0
                else:
                    self.sim[item1][item2]=sim
                    self.sim[item2][item1]=sim
        
    def get_most_similar_items(self, item_id, k):
        sorted_sim_of_item = sorted(self.sim[item_id].items(), key=operator.itemgetter(1), reverse = True)
        most_similar_items = [sorted_sim_of_item[i][0] for i in range(k-1)]
        return(most_similar_items)
            
    def predict_k(self, user_id, item_id, k):
        
        # Extract k most similar items
        most_similar_items = set(self.get_most_similar_items(item_id, k))
        
        totals={}
        user_items=self.df[self.df['user_id'] == user_id]
        rating_num=0.0
        rating_den=0.0
        all_items=set(user_items['item_id'])
        
        # Intersection of k most similar items with items that have been commonly rated
        intersect_items = most_similar_items & all_items
        
        for other in intersect_items:
            if item_id==other: continue 
            rating_num += self.sim[item_id][other] * float(user_items[user_items['item_id']==other]['rating'])
            rating_den += self.sim[item_id][other]

        if rating_den==0: 
            if self.df.rating[self.df['user_id']==user_id].mean()>0:
                # return the mean user rating if there is no similar for the computation
                return self.df.rating[self.df['user_id']==user_id].mean()
            else:
                # else return mean item rating 
                return self.df.rating[self.df['item_id']==item_id].mean()
        
        return rating_num/rating_den

    
    def predict(self, user_id, item_id):
        
        totals={}
        user_items=self.df[self.df['user_id'] == user_id]
        rating_num=0.0
        rating_den=0.0
        all_items=set(user_items['item_id'])
        
        for other in all_items:
            if item_id==other: continue 
            rating_num += self.sim[item_id][other] * float(user_items[user_items['item_id']==other]['rating'])
            rating_den += self.sim[item_id][other]

        if rating_den==0: 
            if self.df.rating[self.df['user_id']==user_id].mean()>0:
                # return the mean user rating if there is no similar for the computation
                return self.df.rating[self.df['user_id']==user_id].mean()
            else:
                # else return mean item rating 
                return self.df.rating[self.df['item_id']==item_id].mean()
        
        return rating_num/rating_den

In [185]:
np.random.seed(111)

def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.01)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

In [152]:
### Entire data set
data = pd.read_csv('./data/training.csv', sep=',')
data['for_testing'] = False
grouped = data.groupby('user_id', group_keys=False).apply(assign_to_set)
data_train = data[grouped.for_testing == False]
data_test = data[grouped.for_testing == True]

print("Training data_set has "+ str(data_train.shape[0]) +" ratings")
print("Test data set has "+ str(data_test.shape[0]) +" ratings")
print("The dataset has ", data.item_id.nunique(), " items")

print(data.head())

Training data_set has 876661 ratings
Test data set has 73339 ratings
The dataset has  100  items
   user_id  item_id     rating  for_testing
0    13291       98  -0.670408        False
1    19559        8   1.436404        False
2    32928       50   1.711739        False
3    34459       29 -10.000000        False
4    68339       19   4.277970        False


In [170]:
data_train.head()

Unnamed: 0,user_id,item_id,rating,for_testing
1,19559,8,1.436404,False
2,32928,50,1.711739,False
3,34459,29,-10.0,False
4,68339,19,4.27797,False
5,7685,11,7.441743,False


In [None]:
data_test.head()

In [None]:
def get_user_mean(user_id, df_train):
    user = df_train[df_train["user_id"] == user_id] 
    mean = user.rating.mean()
    return(mean)
    
    
def construct_user_mean_dict(df_train):
    user_ids = df_train.user_id.unique()
    
    counter = 0
    user_means = {}
    for user in user_ids:
        counter += 1
        user_means[user] = get_user_mean(user, df_train)
        if float(counter % 1000) == 0.0:
            print(counter)
    return(user_means)

def predict_mean_ratings_on_test_set(df_train):
    user_means = construct_user_mean_dict(df_train)

    mean_ratings = []

    for user in data_test.user_id:
        try:
            mean_ratings.append(user_means[user])
        except:
            mean_ratings.append(0.0)
    return(mean_ratings)
        
compute_rmse(predict_mean_ratings_on_test_set(data_train), data_test.rating)

In [198]:
reco_item_adjcos = CollaborativeFiltering(data, similarity=adjcos_similarity_items)
reco_item_adjcos.train()

In [199]:
reco_item_adjcos.predict(19559,8)

7.9631772074794087

## Predict test set

In [183]:
def predict_ratings_on_test_set_item_item_class(test):
    rating_pred = []
    counter = 0
    for user_id, item_id in zip(test.user_id, test.item_id):
        counter += 1
        rating_pred.append(reco_item_adjcos.predict(user_id, item_id))
        if float(counter % 1000) == 0.0:
            print(counter)
    return(rating_pred)

def correct_for_NaNs(rating_pred):
    rating_pred_ = pd.Series(test.rating_pred)
    print(rating_pred_.isnull().sum(), "NaNs encountered!")
    
    rating_pred_[rating_pred_.isnull()] = 0
    
    return(rating_pred_)

In [187]:
#test_ratings = predict_ratings_on_test_set_item_item_class(data_test)
#test_ratings

In [188]:
compute_rmse(data_test.rating, test_ratings)

5.0680789900118981

In [200]:
submission_ratings = predict_ratings_on_test_set_item_item_class(target_user_items)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000


In [None]:
sample_submission.tail()

submission = pd.DataFrame({
    'id': np.arange(sample_submission.shape[0]),
    'rating': submission_ratings
})

submission.tail()

submission.to_csv('submission.csv', index = False)