In [1]:
%pylab

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
import numpy as np

In [3]:
train_data = pd.read_csv("data/train_data.csv")
train_data.shape

(14654, 5)

In [4]:
train_matrix = pd.pivot_table(train_data, values='quantity', 
                              index=['customer_id'], 
                              columns=['item_id'],
                             aggfunc=np.sum,
                             fill_value=0)   #change train from dataframe to matrix form
# row is customer, column is item and then the value is aggregated sum of quantity
train_matrix.shape

(4889, 3365)

In [5]:
train_matrix.columns

Index([u'1210D1354197520001', u'1210D1360397080400', u'1210D1360397080500',
       u'1210D1360497090600', u'1210D1379044670990', u'1210D1859044850650',
       u'1210D1859417340201', u'1210D1859494380800', u'1210D1859845550500',
       u'1210D1860015980200',
       ...
       u'U1Y06591470250', u'U1Y06591470700', u'U1Y08102210001',
       u'U1Y09632700003', u'U1Y09632700200', u'U1Y09845540210',
       u'U1Y22612330752', u'U1Y23481760990', u'U1Y25045530210',
       u'U1Y29591490990'],
      dtype='object', name=u'item_id', length=3365)

In [5]:
train = train_matrix.values   # change from pandas dataframe to numpy ndarray
np.count_nonzero(train)

14197

In [6]:
P = np.zeros((4889, 3365))   # P is the preference matrix

In [7]:
R = train_matrix    # keep the naming coordinate with paper "cf for implicit feedback"

In [8]:
P[train > 0] = 1
np.count_nonzero(P)     # p_ui values are derived by binarizaing the r_ui values

14197

In [9]:
alpha = 40
C = np.ones((4889, 3365))     # C matrix is the confidence matrix 
C += alpha * R   # c_ui = 1 + alpha * r_ui

In [10]:
np.unique(C, return_counts=True)    # return the unique value in C and count also

(array([   1.,   41.,   81.,  121.,  161.,  201.]),
 array([16437288,    13761,      419,       14,        2,        1]))

In [11]:
X = np.random.uniform(0, 1, size=(4889, 3)) # initialize latent factors bet
#ween 0 and 1, first we let the dimension of latent factors to be 3
Y = np.random.uniform(0, 1, size=(3365, 3)) 

In [12]:
C_u = np.zeros((3365, 3365))   # construct the diagonal matrix
for index in xrange(3365): 
    C_u[index][index] = C[9][index]
np.unique(C_u, return_counts=True)

(array([  0.,   1.,  41.]), array([11319860,     3356,        9]))

In [13]:
I_u = np.zeros((3365, 3365))   # identity matrix
np.fill_diagonal(I_u, 1)

In [14]:
np.unique(C_u - I_u, return_counts=True)

(array([  0.,  40.]), array([11323216,        9]))

In [15]:
Y_T_times_Y = np.dot(Y.transpose(), Y)
Y_T_times_Y.shape

(3, 3)

In [16]:
I_3 = np.zeros((3, 3))
np.fill_diagonal(I_3, 1)

In [17]:
from numpy.linalg import inv
first_part = inv(np.dot(np.dot(Y.transpose(), C_u - I_u), Y) + 2 * I_3)

In [18]:
second_part = np.dot(np.dot(Y.transpose(), C_u), P[9])

In [19]:
x_9 = np.dot(first_part, second_part)

In [20]:
x_9

array([ 0.56029361,  0.5916096 ,  1.06838685])

In [72]:
X[9]

array([ 0.4011029 ,  0.20649245,  0.83249256])

In [73]:
np.dot(x_9, Y[9])

1.0408194367112722

In [21]:
X[9] = x_9

In [22]:
X[9]

array([ 0.56029361,  0.5916096 ,  1.06838685])

In [27]:
np.dot(X, Y.transpose())[9][9]

1.8185309827621685

In [28]:
np.dot(X[9].transpose(), Y[9])

1.8185309827621685

In [33]:
P[:,0].shape

(4889,)

In [35]:
np.power(X, 2)

array([[ 0.1499916 ,  0.88185053,  0.95358553],
       [ 0.17831645,  0.06503   ,  0.01558804],
       [ 0.01165189,  0.80102577,  0.44786766],
       ..., 
       [ 0.20566497,  0.81565136,  0.49844689],
       [ 0.03514966,  0.24249871,  0.70672438],
       [ 0.03740752,  0.02938357,  0.0729263 ]])

In [12]:
np.savetxt("data/X.csv", X, delimiter=",")   # save the computed X to file to keep the current round result

In [13]:
X_c = np.loadtxt("data/X.csv", delimiter=",")

In [14]:
X_c.shape

(4889, 3)

In [15]:
X_c == X

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       ..., 
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]], dtype=bool)

In [16]:
np.unique(X_c == X)

array([ True], dtype=bool)