In [10]:
import numpy as np, time, sys
from numba import jit
import os

import numpy as np
from scipy import sparse

class DataLoader:
    """Namespace for helpers that load rating data from disk."""

    def __init__(self):
        pass

    @staticmethod
    def create_review_matrix(file_path):
        """Build a sparse rating matrix from a comma-separated text file.

        Each non-blank line must start with three integer fields:
        ``row_id,col_id,rating``. Any further fields on the line are ignored.
        The two ids are treated as 1-based and shifted down by one to become
        0-based matrix indices.

        Parameters
        ----------
        file_path : str
            Path of the text file to read.

        Returns
        -------
        scipy.sparse.csc_matrix
            Float matrix with ``rating`` stored at ``(row_id - 1, col_id - 1)``.

        Raises
        ------
        ValueError
            If one of the first three fields of a non-blank line is not an
            integer.
        """
        # The context manager closes the file deterministically; the original
        # bare open() leaked the handle. Whitespace-only lines (for example a
        # trailing empty line) are skipped instead of crashing int('').
        with open(file_path) as handle:
            rows = [[int(tok) for tok in line.split(',')[:3]]
                    for line in handle if line.strip()]
        data = np.array(rows)

        # Convert the 1-based ids stored in the file to 0-based indices.
        ij = data[:, :2]
        ij -= 1
        values = data[:, 2]
        review_matrix = sparse.csc_matrix((values, ij.T)).astype(float)
        return review_matrix

def get_user_ratings(user_id, review_matrix):
    """Return the strictly positive entries of row ``user_id`` as a 1-D array.

    The row is densified first, so the result keeps the column order of the
    matrix and drops every entry that is not greater than zero.
    """
    dense_row = review_matrix[user_id].toarray().ravel()
    rated_mask = dense_row > 0
    return dense_row[rated_mask]

def get_item_ratings(item_id, review_matrix):
    """Return the strictly positive entries of column ``item_id`` as a 1-D array.

    The column is densified first, so the result keeps the row order of the
    matrix and drops every entry that is not greater than zero.
    """
    dense_column = review_matrix[:, item_id].toarray().ravel()
    rated_positions = np.flatnonzero(dense_column > 0)
    return dense_column[rated_positions]

def create_user_feature_matrix(review_matrix, NUM_FEATURES, FEATURE_INIT_VALUE):
    """Allocate the user feature matrix, filled with ``FEATURE_INIT_VALUE``.

    The result has one row per feature and one column per row of
    ``review_matrix`` (shape ``(NUM_FEATURES, review_matrix.shape[0])``) and
    uses NumPy's default float dtype.
    """
    shape = (NUM_FEATURES, review_matrix.shape[0])
    features = np.empty(shape)
    features[...] = FEATURE_INIT_VALUE
    return features

def create_item_feature_matrix(review_matrix, NUM_FEATURES, FEATURE_INIT_VALUE):
    """Allocate the item feature matrix, filled with ``FEATURE_INIT_VALUE``.

    The result has one row per feature and one column per column of
    ``review_matrix`` (shape ``(NUM_FEATURES, review_matrix.shape[1])``) and
    uses NumPy's default float dtype.
    """
    shape = (NUM_FEATURES, review_matrix.shape[1])
    features = np.empty(shape)
    features[...] = FEATURE_INIT_VALUE
    return features

@jit(nopython=True)
def predict_rating(user_id, item_id, user_feature_matrix, item_feature_matrix):
    """Predict one rating from the current feature matrices.

    The prediction is 1 plus the sum, over every feature row, of the user's
    feature value times the item's feature value. The result is clamped to
    the closed range [1, 10].
    """
    num_features = user_feature_matrix.shape[0]
    estimate = 1.
    for idx in range(num_features):
        estimate += user_feature_matrix[idx, user_id] * item_feature_matrix[idx, item_id]

    # Keep the prediction inside the allowed rating range.
    if estimate > 10:
        return 10
    if estimate < 1:
        return 1
    return estimate

@jit(nopython=True)
def sgd_inner(feature, A_row, A_col, A_data, user_feature_matrix, item_feature_matrix, NUM_FEATURES):
    """Run one stochastic-gradient pass over all ratings for a single feature.

    For every rating triple ``(A_row[k], A_col[k], A_data[k])`` the prediction
    error is computed with ``predict_rating`` and row ``feature`` of both
    feature matrices is updated in place. ``NUM_FEATURES`` is accepted but not
    used by the body.

    Returns the sum of squared prediction errors, each measured before the
    corresponding update is applied.
    """
    shrinkage = 0.40      # weight of the "- shrinkage * value" term in each update
    step_size = 0.0001    # multiplier applied to every update
    total_squared_error = 0
    for idx in range(len(A_data)):
        row = A_row[idx]
        col = A_col[idx]
        residual = A_data[idx] - predict_rating(row, col, user_feature_matrix, item_feature_matrix)

        total_squared_error += residual ** 2

        # Read both values before writing so that each update is computed
        # from the other side's pre-update value.
        user_value = user_feature_matrix[feature, row]
        item_value = item_feature_matrix[feature, col]
        user_feature_matrix[feature, row] += step_size * (residual * item_value - shrinkage * user_value)
        item_feature_matrix[feature, col] += step_size * (residual * user_value - shrinkage * item_value)

    return total_squared_error

def calculate_features(A_row, A_col, A_data, user_feature_matrix, item_feature_matrix, NUM_FEATURES):
    """Train the feature matrices one feature at a time with repeated SGD passes.

    For each feature index in ``range(NUM_FEATURES)``, ``sgd_inner`` is called
    at least ``MIN_ITERATIONS`` times and then for as long as the RMSE of a
    pass beats the previous pass by more than ``MIN_IMPROVEMENT``. Both
    feature matrices are updated in place by ``sgd_inner``. Progress is
    printed after every feature.

    Parameters
    ----------
    A_row, A_col, A_data : sequences of equal length
        Row index, column index and rating value of every known rating.
    user_feature_matrix, item_feature_matrix : arrays
        Feature matrices passed through to ``sgd_inner``.
    NUM_FEATURES : int
        Number of features to train.

    Returns
    -------
    float
        RMSE of the most recent SGD pass (0 if ``NUM_FEATURES`` is 0).
    """
    MIN_IMPROVEMENT = 0.0001
    MIN_ITERATIONS = 100
    # Not reset per feature: the first pass of a new feature is compared
    # against the last pass of the previous feature.
    rmse = 0
    last_rmse = 0
    num_ratings = len(A_data)
    print(num_ratings)
    # `range` works on Python 2 and 3; `xrange` is a NameError on Python 3.
    for feature in range(NUM_FEATURES):
        # Named `iteration` so the builtin `iter` is not shadowed.
        iteration = 0
        while (iteration < MIN_ITERATIONS) or (rmse < last_rmse - MIN_IMPROVEMENT):
            last_rmse = rmse
            squared_error = sgd_inner(feature, A_row, A_col, A_data,
                                      user_feature_matrix, item_feature_matrix,
                                      NUM_FEATURES)
            rmse = (squared_error / num_ratings) ** 0.5
            iteration += 1
        print ('Squared error = %f' % squared_error)
        print ('RMSE = %f' % rmse)
        print ('Feature = %d' % feature)
    # Return the RMSE of the final pass (the value printed above), not the
    # stale value from the pass before it.
    return rmse


# --- Training configuration -------------------------------------------------
# NOTE(review): LAMBDA is not referenced by any code visible in this notebook.
LAMBDA = 0.00
# Initial value written into every cell of both feature matrices.
FEATURE_INIT_VALUE = 0.1
# Number of feature rows in each feature matrix.
NUM_FEATURES = 20

# Header-less CSV of integer triples read by DataLoader.create_review_matrix.
file_path = 'data/new_int_train_no_header_plus1.csv'

# Sparse rating matrix built from the file.
A = DataLoader.create_review_matrix(file_path)

user_feature_matrix = create_user_feature_matrix(A, NUM_FEATURES, FEATURE_INIT_VALUE)
item_feature_matrix = create_item_feature_matrix(A, NUM_FEATURES, FEATURE_INIT_VALUE)

# NOTE(review): `users` and `item` are not used by any code visible here.
users, item = A.nonzero()
# COO format exposes parallel .row / .col / .data arrays, which is what
# calculate_features consumes.
A = A.tocoo()

# Trains both feature matrices in place and prints progress per feature.
rmse = calculate_features(A.row, A.col, A.data, user_feature_matrix, item_feature_matrix, NUM_FEATURES )
print('rmse', rmse)

6103246
Squared error = 21998854.356444
RMSE = 1.898539
Feature = 0
Squared error = 22083282.562901
RMSE = 1.902179
Feature = 1
Squared error = 22061244.124484
RMSE = 1.901230
Feature = 2
Squared error = 22040371.973131
RMSE = 1.900330
Feature = 3
Squared error = 22020585.737158
RMSE = 1.899477
Feature = 4
Squared error = 22001812.805751
RMSE = 1.898667
Feature = 5
Squared error = 21983985.838911
RMSE = 1.897898
Feature = 6
Squared error = 21967043.064170
RMSE = 1.897166
Feature = 7
Squared error = 21950929.151474
RMSE = 1.896470
Feature = 8
Squared error = 21935592.328850
RMSE = 1.895808
Feature = 9
Squared error = 21920984.029196
RMSE = 1.895176
Feature = 10
Squared error = 21907060.290272
RMSE = 1.894574
Feature = 11
Squared error = 21893780.446140
RMSE = 1.894000
Feature = 12
Squared error = 21881106.987641
RMSE = 1.893452
Feature = 13
Squared error = 21869004.067446
RMSE = 1.892928
Feature = 14
Squared error = 21857438.567979
RMSE = 1.892427
Feature = 15
Squared error = 21846380.9

In [8]:
# len() of a 2-D array is its first dimension; item_feature_matrix is built as
# (NUM_FEATURES, num_items), so this shows the number of features.
len(item_feature_matrix)

20

In [13]:
# NOTE(review): this import sits mid-notebook; moving it to the first import
# cell would make the notebook's dependencies visible up front.
import pandas as pd

# Load the training table from CSV into `dat`.
dat = pd.read_csv('data/new_train.csv')

In [14]:
# Preview the first rows of `dat`.
dat.head()

Unnamed: 0,item,user_id,rating
0,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A079789010EVSPIBCSWFO,6
1,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A13U02TNYRFNOI,6
2,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1QYORNO0GY308,5
3,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1SUU1QIRDZXJC,1
4,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A2ELH6CUC5Y8J4,3


In [27]:
# Rebinds `dat` to the table from new_int_train.csv; the frame loaded from
# new_train.csv is no longer reachable under this name.
dat = pd.read_csv('data/new_int_train.csv')

In [29]:
# Shift both id columns up by one. DataLoader.create_review_matrix subtracts
# one from the first two fields of every line, so the ids it reads must be
# 1-based.
# NOTE(review): this cell is not idempotent. Each re-run adds another 1 to
# both columns, so re-run the read_csv cell for `dat` first.
dat['user_int'] = dat['user_int'] + 1
dat['item_int'] = dat['item_int'] + 1
dat.head()

Unnamed: 0,user_int,item_int,rating
0,14,1,6
1,303,1,6
2,1955,1,5
3,2104,1,1
4,3644,1,3


In [30]:
# Write `dat` without a header row and without the index column. The target
# path is the same one the training cell assigns to `file_path`.
dat.to_csv('data/new_int_train_no_header_plus1.csv', header = None, index = False)

In [19]:
# NOTE(review): the file name suggests this CSV has no header row. If so,
# read_csv's default header inference uses the first data row as column
# names; pass header=None to keep that row as data. Confirm against the file.
# `df` is not used by any other cell visible in this notebook.
df = pd.read_csv('data/sample_train_int_no_header_plus1.csv')

In [28]:
# NOTE(review): execution count 28 falls between the read_csv cell (27) and
# the +1 shift cell (29), so this preview presumably shows `dat` before the
# shift even though the cell appears after it. Restart and Run All would not
# reproduce this output in this position.
dat.head()

Unnamed: 0,user_int,item_int,rating
0,13,0,6
1,302,0,6
2,1954,0,5
3,2103,0,1
4,3643,0,3
