### Evaluating common datasets across NFM and surpriselib

**Objective:** To evaluate & contrast prediction accuracy between builtin Surpiselib algos and NFM methhods on various datasets.

**The workflow of the notebook is as:**
* Understanding format in which data is served to methods in either implementation.
    
* Modify inbuilt Surpirse datasets- Jester, ml-100k in compliance & to run with NFM implementation.
* Trim frappe dataset in complaince with surpriselib `Reader` class for evaluating results.

In [8]:
from surprise import Reader
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
import os

#### loading explicit u.data file out of primary dataset directory;
#To ascertain if there's any internal linking of u_id & i_id and interaction between its various other files during training.

file_path = os.path.expanduser('C:\\Users\\might\\Desktop\\jupyter notebooks\\u.data')


rdr = Reader(line_format='user item rating', sep ='\t', rating_scale=(1,5))

data =Dataset.load_from_file(file_path, rdr)
trainset,testset= train_test_split(data, test_size=0.2)

In [28]:
data.raw_ratings

[('196', '242', 3.0, None),
 ('186', '302', 3.0, None),
 ('22', '377', 1.0, None),
 ('244', '51', 2.0, None),
 ('166', '346', 1.0, None),
 ('298', '474', 4.0, None),
 ('115', '265', 2.0, None),
 ('253', '465', 5.0, None),
 ('305', '451', 3.0, None),
 ('6', '86', 3.0, None),
 ('62', '257', 2.0, None),
 ('286', '1014', 5.0, None),
 ('200', '222', 5.0, None),
 ('210', '40', 3.0, None),
 ('224', '29', 3.0, None),
 ('303', '785', 3.0, None),
 ('122', '387', 5.0, None),
 ('194', '274', 2.0, None),
 ('291', '1042', 4.0, None),
 ('234', '1184', 2.0, None),
 ('119', '392', 4.0, None),
 ('167', '486', 4.0, None),
 ('299', '144', 4.0, None),
 ('291', '118', 2.0, None),
 ('308', '1', 4.0, None),
 ('95', '546', 2.0, None),
 ('38', '95', 5.0, None),
 ('102', '768', 2.0, None),
 ('63', '277', 4.0, None),
 ('160', '234', 5.0, None),
 ('50', '246', 3.0, None),
 ('301', '98', 4.0, None),
 ('225', '193', 4.0, None),
 ('290', '88', 4.0, None),
 ('97', '194', 3.0, None),
 ('157', '274', 4.0, None),
 ('181'

In [12]:
algo =SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22ae1492748>

In [14]:
testset[0]

('426', '526', 4.0)

In [16]:
algo.predict(testset[0][0],testset[0][1], r_ui= testset[0][2], verbose=True)

user: 426        item: 526        r_ui = 4.00   est = 3.64   {'was_impossible': False}


Prediction(uid='426', iid='526', r_ui=4.0, est=3.6444892082845555, details={'was_impossible': False})

In [27]:
#Now for any proxy test values
algo.predict('856900','19631600', r_ui= 6, verbose=True)

user: 856900     item: 19631600   r_ui = 6.00   est = 3.53   {'was_impossible': False}


Prediction(uid='856900', iid='19631600', r_ui=6, est=3.5314875, details={'was_impossible': False})

In [13]:
cross_validate(algo, data, measures = ['rmse'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9344  0.9296  0.9306  0.9428  0.9381  0.9351  0.0049  
Fit time          3.74    3.69    3.77    3.68    3.80    3.73    0.05    
Test time         0.22    0.13    0.10    0.11    0.14    0.14    0.04    


{'test_rmse': array([0.9344457 , 0.92956255, 0.93055358, 0.94282514, 0.93814235]),
 'fit_time': (3.7379872798919678,
  3.685133934020996,
  3.770923614501953,
  3.682131052017212,
  3.7968103885650635),
 'test_time': (0.21743035316467285,
  0.1336367130279541,
  0.10375761985778809,
  0.1077430248260498,
  0.13862967491149902)}

* ### NFM load data methods

In [1]:
from neural_factorization_machine import LoadData as DATA

In [2]:
import os#loads Train_data, Test_data & Val_data from dataset
data= DATA.LoadData(os.path.join(os.getcwd(),'neural_factorization_machine\\data'), '\\frappe', 'square_loss')

# of training: 202027
# of validation: 57722
# of test: 28860


In [25]:
len(data.Train_data['Y'])

202027

In [3]:
train, val, test= data.construct_data('square_loss')

# of training: 202027
# of validation: 57722
# of test: 28860


In [4]:
train['X']

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [559, 4451, 48, 58, 20, 5, 6, 80, 434, 435],
 [531, 3651, 38, 12, 4, 5, 6, 7, 54, 153],
 [82, 3489, 18, 3, 4, 5, 71, 80, 8, 120],
 [198, 1504, 18, 30, 20, 5, 6, 72, 135, 136],
 [600, 3777, 18, 3, 4, 5, 6, 26, 27, 602],
 [123, 79, 48, 25, 20, 5, 6, 7, 8, 9],
 [179, 2989, 48, 12, 4, 5, 6, 7, 54, 9],
 [441, 2321, 48, 19, 20, 5, 6, 26, 35, 99],
 [2614, 1083, 139, 25, 20, 5, 6, 26, 35, 99],
 [144, 3563, 38, 42, 20, 13, 6, 7, 146, 147],
 [1287, 729, 2, 19, 20, 5, 6, 26, 887, 1289],
 [313, 4636, 38, 19, 20, 5, 6, 26, 465, 466],
 [622, 3602, 2, 19, 20, 5, 6, 26, 35, 522],
 [5304, 749, 48, 19, 20, 5, 6, 80, 8, 94],
 [803, 668, 112, 25, 20, 5, 6, 45, 66, 9],
 [200, 448, 2, 19, 20, 5, 6, 26, 202, 203],
 [362, 442, 2, 12, 4, 5, 6, 45, 54, 9],
 [879, 1840, 2, 42, 20, 5, 6, 26, 54, 153],
 [362, 3682, 112, 58, 20, 5, 6, 72, 54, 9],
 [142, 4114, 38, 30, 20, 5, 6, 45, 8, 9],
 [1401, 5168, 38, 3, 4, 5, 6, 26, 54, 153],
 [330, 378, 2, 30, 20, 5, 6, 7, 54, 586],
 [1371, 4

* ##### Running LoadData.map_features and LoadData.read_features methods

In [6]:
frappe_path = 'neural_factorization_machine\\data\\frappe'
os.listdir(frappe_path)

['frappe.test.libfm',
 'frappe.train.libfm',
 'frappe.validation.libfm',
 'README.txt']

In [7]:
frappe_trainpath= os.path.join(frappe_path,'frappe.train.libfm')
frappe_testpath=os.path.join(frappe_path,'frappe.test.libfm')
frappe_valpath=os.path.join(frappe_path,'frappe.validation.libfm')

In [29]:
features = {}

f= open(frappe_testpath)#passing just the test_data to read_features() method
line =f.readline()
i = len(features)

while line:
    items =line.strip().split(' ')
    for item in items[1:]:
        if item not in features:
            features[ item ] = i
            i = i + 1
    line = f.readline()
f.close()

In [15]:
#list of items in one record from data file
line.strip().split(' ')#first index holds label y

['-1',
 '451:1',
 '4149:1',
 '5041:1',
 '5046:1',
 '5053:1',
 '5055:1',
 '5058:1',
 '5060:1',
 '5069:1',
 '5149:1']

In [23]:
features

{'451:1': 0,
 '4149:1': 1,
 '5041:1': 2,
 '5046:1': 3,
 '5053:1': 4,
 '5055:1': 5,
 '5058:1': 6,
 '5060:1': 7,
 '5069:1': 8,
 '5149:1': 9,
 '91:1': 10,
 '3503:1': 11,
 '5047:1': 12,
 '5056:1': 13,
 '5065:1': 14,
 '5095:1': 15,
 '168:1': 16,
 '983:1': 17,
 '5040:1': 18,
 '5050:1': 19,
 '5054:1': 20,
 '5207:1': 21,
 '620:1': 22,
 '1743:1': 23,
 '5045:1': 24,
 '5051:1': 25,
 '5061:1': 26,
 '5073:1': 27,
 '46:1': 28,
 '2692:1': 29,
 '5049:1': 30,
 '5086:1': 31,
 '5211:1': 32,
 '576:1': 33,
 '4933:1': 34,
 '5075:1': 35,
 '71:1': 36,
 '966:1': 37,
 '5043:1': 38,
 '5172:1': 39,
 '43:1': 40,
 '974:1': 41,
 '5048:1': 42,
 '5252:1': 43,
 '2928:1': 44,
 '5062:1': 45,
 '14:1': 46,
 '2396:1': 47,
 '5039:1': 48,
 '5076:1': 49,
 '107:1': 50,
 '4380:1': 51,
 '80:1': 52,
 '2662:1': 53,
 '5070:1': 54,
 '5243:1': 55,
 '190:1': 56,
 '1093:1': 57,
 '5052:1': 58,
 '5105:1': 59,
 '131:1': 60,
 '1432:1': 61,
 '5099:1': 62,
 '5215:1': 63,
 '116:1': 64,
 '986:1': 65,
 '5074:1': 66,
 '92:1': 67,
 '4253:1': 68,
 

* #### Encoding labels to unique features in dataset file
    * Here in feat dict keys are features & values are encoded labels; {feature:label}

In [31]:
def map_features(): # map the feature entries in all files, kept in self.features dictionary
    features = {}
    read_features(features, frappe_trainpath)
    read_features(features, frappe_testpath)
    read_features(features, frappe_valpath)
    #print("features_M:", len(self.features))
    return len(features), features

In [32]:
def read_features(features, file): # read_features() file
    f = open(file)
    line = f.readline()
    i = len(features)
    while line:
        items = line.strip().split(' ')
        for item in items[1:]:
            if item not in features:#adds unique features to dictionary
                features[ item ] = i
                i = i + 1
        line = f.readline()
    f.close()

In [33]:
features_M, feats = map_features()

In [None]:
feat#this dictionary holds one hot encoding label for each unique feature;
#is used to map one-hot label corresponding to every feature later during creation of one-hot vector representation(X_) of each instance

In [103]:
print('number of unique features: ', features_M, '\n\nfeatures and encoded label: ', feats)

number of unique features:  5382 

features and encoded label:  {'451:1': 0, '4149:1': 1, '5041:1': 2, '5046:1': 3, '5053:1': 4, '5055:1': 5, '5058:1': 6, '5060:1': 7, '5069:1': 8, '5149:1': 9, '91:1': 10, '3503:1': 11, '5047:1': 12, '5056:1': 13, '5065:1': 14, '5095:1': 15, '168:1': 16, '983:1': 17, '5040:1': 18, '5050:1': 19, '5054:1': 20, '5207:1': 21, '620:1': 22, '1743:1': 23, '5045:1': 24, '5051:1': 25, '5061:1': 26, '5073:1': 27, '46:1': 28, '2692:1': 29, '5049:1': 30, '5086:1': 31, '5211:1': 32, '576:1': 33, '4933:1': 34, '5075:1': 35, '71:1': 36, '966:1': 37, '5043:1': 38, '5172:1': 39, '43:1': 40, '974:1': 41, '5048:1': 42, '5252:1': 43, '2928:1': 44, '5062:1': 45, '14:1': 46, '2396:1': 47, '5039:1': 48, '5076:1': 49, '107:1': 50, '4380:1': 51, '80:1': 52, '2662:1': 53, '5070:1': 54, '5243:1': 55, '190:1': 56, '1093:1': 57, '5052:1': 58, '5105:1': 59, '131:1': 60, '1432:1': 61, '5099:1': 62, '5215:1': 63, '116:1': 64, '986:1': 65, '5074:1': 66, '92:1': 67, '4253:1': 68, '16:1

* #### LoadData.construct_data and read_data methods to load file and cosntruct dataset

In [37]:
#Following generates X_ & Y_for_logloss, which later outputs data.Train_data
f = open(frappe_trainpath)
X_ = []
Y_ = []
Y_for_logloss = []

In [39]:
line = f.readline()
while line:
    items = line.strip().split(' ')
    Y_.append( 1.0*float(items[0]) )

    if float(items[0]) > 0:# > 0 as 1; others as 0
        v = 1.0
    else:
        v = 0.0
    Y_for_logloss.append( v )

    X_.append([feats[item] for item in items[1:]])#X_ holds the one hot vector representation of each training sample
    line = f.readline()
f.close()

In [100]:
print('Y_: ', Y_[:10],'\n\nY_for_logloss: ',Y_for_logloss[:10])
#Target value of 1 in Y_for_logloss means the app is used under the cotext

Y_:  [-1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0] 

Y_for_logloss:  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]


In [104]:
len(X_)

202027

In [42]:
X_#feature vector

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 2, 12, 4, 13, 6, 14, 15, 9],
 [16, 17, 18, 19, 20, 5, 6, 7, 8, 21],
 [22, 23, 24, 25, 20, 5, 6, 26, 27, 9],
 [28, 29, 18, 30, 20, 5, 6, 7, 31, 32],
 [33, 34, 2, 30, 20, 13, 6, 26, 35, 9],
 [36, 37, 38, 30, 20, 5, 6, 26, 8, 39],
 [40, 41, 18, 42, 20, 5, 6, 7, 8, 43],
 [16, 44, 18, 25, 20, 5, 6, 45, 8, 9],
 [46, 47, 48, 12, 4, 5, 6, 26, 49, 9],
 [50, 51, 18, 3, 4, 5, 6, 26, 8, 9],
 [52, 53, 2, 12, 4, 5, 6, 26, 54, 55],
 [56, 57, 48, 58, 20, 5, 6, 26, 59, 9],
 [60, 61, 38, 19, 20, 5, 6, 26, 62, 63],
 [64, 65, 48, 25, 20, 5, 6, 45, 66, 9],
 [67, 68, 2, 19, 20, 13, 6, 45, 8, 9],
 [69, 70, 48, 3, 4, 13, 71, 72, 27, 73],
 [74, 75, 18, 3, 4, 5, 6, 7, 8, 9],
 [76, 77, 18, 25, 20, 5, 6, 26, 27, 73],
 [78, 79, 18, 58, 20, 5, 6, 80, 81, 9],
 [82, 83, 38, 19, 20, 5, 6, 26, 8, 84],
 [85, 86, 18, 25, 20, 5, 71, 45, 66, 9],
 [10, 87, 48, 12, 4, 5, 6, 88, 15, 9],
 [89, 90, 38, 25, 20, 13, 6, 7, 91, 9],
 [92, 93, 24, 12, 4, 13, 6, 80, 8, 94],
 [74, 95, 48, 12, 

* #### outputs from data.construct_dataset method

In [47]:
import numpy as np

Data_Dic = {}
X_lens = [ len(line) for line in X_]
indexs = np.argsort(X_lens)
Data_Dic['Y'] = [ Y_for_logloss[i] for i in indexs]
Data_Dic['X'] = [ X_[i] for i in indexs]

In [82]:
print(X_lens[0],X_lens[2], X_lens[134679], X_lens[134680])
print('also:',X_[0],X_[2], X_[134697],X_[134680])

10 10 10 10
also: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] [16, 17, 18, 19, 20, 5, 6, 7, 8, 21] [362, 3682, 112, 58, 20, 5, 6, 72, 54, 9] [531, 3651, 38, 12, 4, 5, 6, 7, 54, 153]


In [74]:
np.argsort([56,12,98,23])

array([1, 3, 0, 2], dtype=int64)

In [81]:
## why indexes after argsort are in such order [0 134679 134680. . ]
# if X_lens = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10,. . 10] ?

indexs = np.argsort(X_lens)
print('X_lens:',X_lens[:10],'\n\n indexes of X_lens in sorted order:',indexs[:10])

X_lens: [10, 10, 10, 10, 10, 10, 10, 10, 10, 10] 

 indexes of X_lens in sorted order: [     0 134679 134680 134681 134682 134683 134684 134685 134686 134687]


In [59]:
np.argsort([89,89,89,89,89])

array([0, 1, 2, 3, 4], dtype=int64)

In [60]:
np.argsort(X_lens)[:10]

array([     0, 134679, 134680, 134681, 134682, 134683, 134684, 134685,
       134686, 134687], dtype=int64)

In [83]:
Data_Dic['X']

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [559, 4451, 48, 58, 20, 5, 6, 80, 434, 435],
 [531, 3651, 38, 12, 4, 5, 6, 7, 54, 153],
 [82, 3489, 18, 3, 4, 5, 71, 80, 8, 120],
 [198, 1504, 18, 30, 20, 5, 6, 72, 135, 136],
 [600, 3777, 18, 3, 4, 5, 6, 26, 27, 602],
 [123, 79, 48, 25, 20, 5, 6, 7, 8, 9],
 [179, 2989, 48, 12, 4, 5, 6, 7, 54, 9],
 [441, 2321, 48, 19, 20, 5, 6, 26, 35, 99],
 [2614, 1083, 139, 25, 20, 5, 6, 26, 35, 99],
 [144, 3563, 38, 42, 20, 13, 6, 7, 146, 147],
 [1287, 729, 2, 19, 20, 5, 6, 26, 887, 1289],
 [313, 4636, 38, 19, 20, 5, 6, 26, 465, 466],
 [622, 3602, 2, 19, 20, 5, 6, 26, 35, 522],
 [5304, 749, 48, 19, 20, 5, 6, 80, 8, 94],
 [803, 668, 112, 25, 20, 5, 6, 45, 66, 9],
 [200, 448, 2, 19, 20, 5, 6, 26, 202, 203],
 [362, 442, 2, 12, 4, 5, 6, 45, 54, 9],
 [879, 1840, 2, 42, 20, 5, 6, 26, 54, 153],
 [362, 3682, 112, 58, 20, 5, 6, 72, 54, 9],
 [142, 4114, 38, 30, 20, 5, 6, 45, 8, 9],
 [1401, 5168, 38, 3, 4, 5, 6, 26, 54, 153],
 [330, 378, 2, 30, 20, 5, 6, 7, 54, 586],
 [1371, 4

* ##### loading ml-tag dataset as dataframe

In [6]:
import pandas as pd
df=pd.read_csv('neural_factorization_machine\\data\\frappe\\frappe.train.libfm', sep="\t", header= None)
df.head(7)

Unnamed: 0,0
0,-1 451:1 4149:1 5041:1 5046:1 5053:1 5055:1 50...
1,-1 91:1 3503:1 5041:1 5047:1 5053:1 5056:1 505...
2,1 168:1 983:1 5040:1 5050:1 5054:1 5055:1 5058...
3,-1 620:1 1743:1 5045:1 5051:1 5054:1 5055:1 50...
4,-1 46:1 2692:1 5040:1 5049:1 5054:1 5055:1 505...
5,-1 576:1 4933:1 5041:1 5049:1 5054:1 5056:1 50...
6,1 71:1 966:1 5043:1 5049:1 5054:1 5055:1 5058:...


In [7]:
record= df[0].iloc[0]
record= record.split(' ')

In [8]:
print(record, '\n', len(record))

['-1', '451:1', '4149:1', '5041:1', '5046:1', '5053:1', '5055:1', '5058:1', '5060:1', '5069:1', '5149:1'] 
 11


In [9]:
record

['-1',
 '451:1',
 '4149:1',
 '5041:1',
 '5046:1',
 '5053:1',
 '5055:1',
 '5058:1',
 '5060:1',
 '5069:1',
 '5149:1']

* ### Evaluating results by training a FM on ml-100k dataset
* Objective is to first build a user_movie_rating matrix

* ##### Approach 1: with surpriselib

In [132]:

from surprise import Reader
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
import os

#### loading explicit u.data file out of primary dataset directory;
#To ascertain if there's any internal linking of u_id & i_id and interaction between its various other files during training.

file_path = os.path.expanduser('C:\\Users\\might\\Desktop\\jupyter notebooks\\u.data')

rdr = Reader(line_format='user item rating', sep ='\t', rating_scale=(1,5))

data =Dataset.load_from_file(file_path, rdr)
trainset = data.build_full_trainset()#trainset,testset= train_test_split(data, test_size=0.2) # to load full trainset

In [125]:
import numpy as np
trainset_FM= trainset.build_testset()#contains the list of tuples containing (u_id,mov_id, rating)

trainset_FMx= np.asarray(trainset_FM, dtype=np.int32)#,dty

In [13]:
pe=np.int8)#converting to numpy array

In [128]:
print(type(trainset_FMx),'\n\n',trainset_FMx,'\n\n', trainset_FMx.shape)

<class 'numpy.ndarray'> 

 [['196' '242' '3.0']
 ['196' '393' '4.0']
 ['196' '381' '4.0']
 ...
 ['941' '1' '5.0']
 ['941' '294' '4.0']
 ['941' '1007' '4.0']] 

 (100000, 3)


* first column corresponds to user, therefore gauges unique user & frequency, uni_user_array holds unique values of user ids.
* second colum corresponds to movie, therefore gauges unique movie & frequency, uni_mov_array holds unique values of movie ids.

In [14]:
uni_user_array, user_freq = np.unique(trainset_FMx[:,0], return_counts=True)
uni_mov_array, mov_freq = np.unique(trainset_FMx[:,1], return_counts=True)

In [260]:
uni_user_li = [int(uni_user_array[i]) for i in range(uni_user_array.shape[0])]# converting the array of string elements to list int elements
uni_user_li.sort()#sort the uni_user_li

uni_mov_li = [int(uni_mov_array[i]) for i in range(uni_mov_array.shape[0])]
uni_mov_li.sort()#sort the uni_mov_li

In [146]:
trainset_FMx

array([['196', '242', '3.0'],
       ['196', '393', '4.0'],
       ['196', '381', '4.0'],
       ...,
       ['941', '1', '5.0'],
       ['941', '294', '4.0'],
       ['941', '1007', '4.0']], dtype='<U4')

In [160]:
trainset_FMx[1,2]

'4.0'

In [186]:
print(trainset_FMx[:,1]=='393','\n\nindices where movie item 393 is spotted:\n', np.where(trainset_FMx[:,1]=='393'))

[False  True False ... False False False] 

indices where movie item 393 is spotted:
 (array([    1,   163,   431,   597,  1626,  1888,  2099,  2287,  3078,
        3328,  3681,  4274,  4517,  4933,  5135,  5306,  5880,  5990,
        7043,  7356,  8397,  9576,  9886, 10964, 11263, 11896, 12315,
       12630, 13053, 13216, 13508, 13658, 14071, 15236, 15778, 16065,
       16541, 17153, 17268, 17473, 17874, 18213, 18450, 19277, 19530,
       19961, 20845, 21170, 21601, 21766, 21932, 22728, 23437, 23737,
       23979, 25210, 25737, 26796, 27500, 28454, 28813, 29063, 29455,
       30217, 30332, 31507, 32534, 33557, 34778, 35101, 35559, 36689,
       38035, 39305, 39868, 40032, 40342, 40842, 41201, 41935, 42433,
       42988, 43419, 43584, 43947, 44727, 45664, 45812, 46363, 47034,
       47912, 48599, 49183, 50298, 51194, 51441, 51776, 52851, 53278,
       53771, 53882, 54411, 54823, 54910, 55416, 55717, 55943, 56584,
       57023, 57408, 57680, 58000, 58430, 58753, 58914, 59752, 60930,
   

In [45]:
a= np.array([True, False, True])
b= np.array([True, True, False])

print([a[i] and b[i] for i in range(len(a))])

[True, False, False]


In [215]:
print(np.array(trainset_FMx[:,0]=='196')[1], np.array(trainset_FMx[:,1]=='393')[1])

True True


In [268]:
a= np.array(trainset_FMx[:,0]=='196')
b=np.array(trainset_FMx[:,1]=='393')

np.where([a[i] and b[i] for i in range(len(a))])[0]#to determine index of particular (ui,mi)

array([1], dtype=int64)

In [244]:
print('uni_user_list: ',uni_user_li,'\n\nuni_mov_list: ', uni_mov_array)

uni_user_list:  [1, 10, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 11, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 12, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 13, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 14, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 15, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 16, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 17, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 19, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 2, 20, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 21, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 22, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 23, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 24, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 25, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 26, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 27, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 28, 2

In [248]:
print(len(uni_user_li), len(uni_mov_li))

943 1682


In [247]:
trainset_FMx.shape

(100000, 3)

In [267]:
user_mov_mat = np.zeros((len(uni_user_li),len(uni_mov_li)), dtype= np.float)#Defining user_movie_rating matrix
print(user_mov_mat.shape)

(943, 1682)


In [286]:
#Following generate user_mov_rating_matrix with array derived from u.data loaded with surpriselib reader class.
import time
tick = time.time()

for i in range(user_mov_mat.shape[0]):
    #ui= uni_user_li[i]
    
    ui_pos_in_trainset = np.array(trainset_FMx[:,0]== uni_user_li[i])#outputs boolean array shaped (10000,)
    #array holding True values where ith u_id is found, col 0 in trainset is user_ids,
    #Note: Array holds multiple True values are there for multiple presence of same user_id
    
    for j in range(user_mov_mat.shape[1]):
        #mi= uni_mov_li[j]
        
        mi_pos_in_trainset = np.array(trainset_FMx[:,1]== uni_mov_li[j])#outputs boolean array shaped (10000,)
        #array holding True values where jth movie_id is found, col 1 in trainset is movie_ids;
        #Array holds multiple True values corresponding to multiple presence of same movie_id
        
        ui_mi_intercept = np.where([ui_pos_in_trainset[k] and mi_pos_in_trainset[k] for k in range(trainset_FMx.shape[0])])[0][0]#holds index position of (ui,mi) in trainset_FMx, each (ui,mi) is Unique
        user_mov_mat[i][j]= np.float(trainset_FMx[ui_mi_intercept][2])#outputs a float type rating value at ui_mi_intercept, train_FMx= [[user_id, mov_id, rating],..]
        
        
print('total exe time: ', time.time() - tick)

  This is separate from the ipykernel package so we can avoid doing imports until
  if __name__ == '__main__':


KeyboardInterrupt: 

* ##### Approach 1.2: loading data, u.data directly with pandas


In [1]:
import numpy as np#when u.data is loaded with pandas
import pandas as pd
import os
file_path = os.path.expanduser('C:\\Users\\might\\Desktop\\jupyter notebooks\\u.data')
df = pd.read_csv(file_path,sep='\t', header=None)


trainset_FM = np.array(df)
trainset_FM

array([[      196,       242,         3, 881250949],
       [      186,       302,         3, 891717742],
       [       22,       377,         1, 878887116],
       ...,
       [      276,      1090,         1, 874795795],
       [       13,       225,         2, 882399156],
       [       12,       203,         3, 879959583]], dtype=int64)

* first column corresponds to user, therefore gauges unique user & frequency, uni_user_array holds unique values of user ids.
* second colum corresponds to movie, therefore gauges unique movie & frequency, uni_mov_array holds unique values of movie ids.

In [2]:
uni_user_array, user_freq = np.unique(trainset_FM[:,0], return_counts=True)#returns already sorted user_ids
uni_mov_array, mov_freq = np.unique(trainset_FM[:,1], return_counts=True)

In [131]:
print('num of unique users: ', uni_user_array.shape,'\nnum of unique movies: ', uni_mov_array.shape)

num of unique users:  (943,) 
num of unique movies:  (1682,)


In [9]:
#Returns True where user user_id is 3 in trainset_FMx
print(uni_user_array[2],'\n', np.array(trainset_FM[:,0]== uni_user_array[2]),'\nUser_id 3 is at positions:',np.where(np.array(trainset_FM[:,0]== uni_user_array[2])))

3 
 [False False False ... False False False] 
User_id 3 is at positions: (array([ 1257,  1343,  1682,  2523,  3758,  3840,  4419,  5897,  6178,
        7110,  7287,  8263,  9021,  9472,  9900, 10568, 10869, 11227,
       11234, 12095, 14381, 14654, 15455, 15962, 18385, 18651, 24412,
       24716, 26927, 27361, 27975, 29583, 31937, 32672, 33981, 34629,
       35195, 35264, 37188, 37462, 37734, 38670, 40899, 41910, 46486,
       48447, 49801, 51966, 53950, 65959, 77245, 85726, 92178, 96761],
      dtype=int64),)


In [51]:
a= np.array(trainset_FMx[:,0]==196)
b=np.array(trainset_FMx[:,1]==393)

np.where([a[i] and b[i] for i in range(trainset_FMx.shape[0])])[0][0]#Returns index where (u_id, mov_id) is (196,393)

940

In [99]:
ui_pos_in_trainset = np.array(trainset_FMx[:,0]==196)# uni_user_array[2])
mi_pos_in_trainset = np.array(trainset_FMx[:,1]== 393)#uni_mov_array[2])

ui_mi_intercept = np.where([ui_pos_in_trainset[k] and mi_pos_in_trainset[k] for k in range(trainset_FMx.shape[0])])[0][0]
print(ui_mi_intercept)

940


* Following matrix will over-write rating values across user_ids at x axis, movie ids at y_axis.

In [40]:
user_mov_mat = np.zeros((uni_user_array.shape[0], uni_mov_array.shape[0]), dtype= np.float)
print(user_mov_mat.shape)

(943, 1682)


In [93]:
user_mov_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [104]:
#Following generate user_mov_rating_matrix with array derived from u.data loaded with pandas dataframe.
import time
tick = time.time()
for i in range(user_mov_mat.shape[0]):
    #ui= uni_user_li[i]
    ui_pos_in_trainset = np.array(trainset_FMx[:,0]== uni_user_array[i])#outputs boolean array shaped (10000,)
    #array holding True values where ith u_id is found, col 0 in trainset is user_ids,
    #Note: Array holds multiple True values are there for multiple presence of same user_id
    
    for j in range(user_mov_mat.shape[1]):
        #mi= uni_mov_li[j]
        mi_pos_in_trainset = np.array(trainset_FMx[:,1]== uni_mov_array[j])#outputs boolean array shaped (10000,)
        #array holding True values where jth movie_id is found, col 1 in trainset is movie_ids;
        #Array holds multiple True values corresponding to multiple presence of same movie_id
        
        ui_mi_intercept = np.where([ui_pos_in_trainset[k] and mi_pos_in_trainset[k] for k in range(trainset_FMx.shape[0])])#[0][0]#holds index position of (ui,mi) in trainset_FMx, each (ui,mi) is Unique
        #user_mov_mat[i][j]= np.float(trainset_FMx[ui_mi_intercept][2])#outputs a float type rating value at ui_mi_intercept, train_FMx= [[user_id, mov_id, rating],..]
        
        
print('total exe time: ', time.time() - tick)

KeyboardInterrupt: 

* both 1.1 and 1.2 are Inefficient, taking too much execution time.

* There for above rating matrix we assume, users are represented row-wise in increasing order from 0 to 943
* Likewise movies are arranged in increasing order of movie ids across columns from 0 to 1640;

**Note: In case the user_ids & movie_ids are in no particular order i.e., unsorted. A new mapping of movie ids & user ids of user rating them needs to be defined as separate .csv file with movie ids, ratings correctly placed alongside movie names**


* ##### Approach 2: Creating dataframe instead of user_mov_rating matrix

In [3]:
print('num of unique users: ', uni_user_array.shape,'\nnum of unique movies: ', uni_mov_array.shape)

num of unique users:  (943,) 
num of unique movies:  (1682,)


In [4]:
df_ratings = pd.DataFrame(index= uni_user_array, columns=uni_mov_array)
#Here user_ids are aligned vertically down, & movie_ids horizonatlly rightwards

In [9]:
df_ratings.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [10]:
#df_ratings.apply(lambda x: x+' 5', axis=1)# = ['hah']*1000+ ['yeah']*682

In [6]:
trainset_FM

array([[      196,       242,         3, 881250949],
       [      186,       302,         3, 891717742],
       [       22,       377,         1, 878887116],
       ...,
       [      276,      1090,         1, 874795795],
       [       13,       225,         2, 882399156],
       [       12,       203,         3, 879959583]], dtype=int64)

In [67]:
#testing intersection for any user_id = 1, movie_id = 1
np.any(np.where(trainset_FM[:,0]==1)[0] == np.where(trainset_FM[:,1]==1)[0])#Errorprone because both np.where terms output different numb of values.
#trainset_FM[np.where(trainset_FM[:,0]==1)]

  """Entry point for launching an IPython kernel.


False

* two methods to find intersect between trainset_FM [:,0] and trainset_FM [:,1] 

In [81]:
%time np.where([np.array(trainset_FM[:,0]==1)[i] and np.array(trainset_FM[:,1]==1)[i] for i in range(trainset_FM.shape[0])])[0][0]
#Same as before, taking too much execution time

Wall time: 4.85 s


32236

In [83]:
%time np.intersect1d(np.where(trainset_FM[:,1]==1)[0], np.where(trainset_FM[:,0]==1))[0]
#much faster execution

Wall time: 965 µs


32236

In [11]:
ui= np.where(trainset_FM[:,0]==1)[0]#outputs places of occurence for user_id 1
ui[0][:10]

array([202, 305, 333, 334, 478, 639, 687, 820, 933, 972], dtype=int64)

In [16]:
#Check for any user_id value 1, which of all(1982) movie_ids have a valid rating values

ui_movie_intercepts = [np.intersect1d(ui, np.where(trainset_FM[:,1]==col)[0]) for col in np.array(df_ratings.columns)]
print('lenght of ui_movie_intercepts list for user_id = 1:', len(ui_movie_intercepts),'\n', ui_movie_intercepts[:5])
#Some values in list ui_movie_intercepts contain empty value, as every user has rated only limited movies, Not all movies.

lenght of ui_movie_intercepts list for user_id = 1: 1682 
 [array([32236], dtype=int64), array([23171], dtype=int64), array([83307], dtype=int64), array([62631], dtype=int64), array([47638], dtype=int64)]


* removing empty arrays from uni_movie_intercepts

In [15]:
#remove the empty arrays in ui_movie_intercepts
ui_movie_intercepts= np.array(ui_movie_intercepts)
ui_movie_intercepts = ui_movie_intercepts[ui_movie_intercepts!=0]#to eliminate 0 shaped numpy arrays within
print('new length of ui_movie_intercepts after trimming: ', ui_movie_intercepts.shape)

new length of ui_movie_intercepts after trimming:  (272,)


  This is separate from the ipykernel package so we can avoid doing imports until


* Alternate method to remove empty arrays from uni_movie_itercepts -- **Much faster than above**

In [19]:
ui_movie_intercepts = list(filter(lambda x: x.shape[0] != 0, ui_movie_intercepts))
print('New length of ui_movie_intercepts after trimming: ',len(ui_movie_intercepts),'\n\nsample ui_movie_intercepts: ', ui_movie_intercepts[:10])

New length of ui_movie_intercepts after trimming:  272 

sample ui_movie_intercepts:  [array([32236], dtype=int64), array([23171], dtype=int64), array([83307], dtype=int64), array([62631], dtype=int64), array([47638], dtype=int64), array([5533], dtype=int64), array([70539], dtype=int64), array([31650], dtype=int64), array([20175], dtype=int64), array([13542], dtype=int64)]


In [244]:
trainset_FM[:6]

array([[      196,       242,         3, 881250949],
       [      186,       302,         3, 891717742],
       [       22,       377,         1, 878887116],
       [      244,        51,         2, 880606923],
       [      166,       346,         1, 886397596],
       [      298,       474,         4, 884182806]], dtype=int64)

In [247]:
ui_movie_intercepts= list(filter(lambda x: x.shape[0] != 0, ui_movie_intercepts))
sample_rating_list = [trainset_FM[int(ui_movie_intercepts[i])][2] for i in range(len(ui_movie_intercepts))]
#The ratings value list here has no sense of what column it belongs to.

print('length of rating list for a user_id ui: ', len(sample_rating_list),'\nlist: ', sample_rating_list[:7])

length of rating list for a user_id ui:  272 
list:  [5, 3, 4, 3, 3, 5, 4]


* **Note** : Although `[trainset_FM[int(ui_movie_intercepts[i])][2] for i in range(len(ui_movie_intercepts))]`
  correctly fetches rating value with indices of corresponding `(ui, mi ) pair`, but storing it as a list and later row-wise
  assigning df_ratings as:
  `df_ratings.loc[idx]= [trainset_FM[int(ui_movie_intercepts[i])][2] for i in range(len(ui_movie_intercepts))]`
  could mistake the true placement of a rating corresponding to `(ui,mi)`;
  Since the generated list lacks positional congnizance, thus equating it as row assignment ops in df_ratings could misplace rating values under some other `mi` value.

* Following outputs the list of ratings corresponding to (ui,mi) itercepts

In [32]:
for i in range(5):#range(len(ui_movie_intercepts))
    print('\nRecord at intercept postion: {} in trainset_FM :{}'.format(ui_movie_intercepts[i], trainset_FM[int(ui_movie_intercepts[i])]))


Record at intercept postion: [32236] in trainset_FM :[        1         1         5 874965758]

Record at intercept postion: [23171] in trainset_FM :[        1         2         3 876893171]

Record at intercept postion: [83307] in trainset_FM :[        1         3         4 878542960]

Record at intercept postion: [62631] in trainset_FM :[        1         4         3 876893119]

Record at intercept postion: [47638] in trainset_FM :[        1         5         3 889751712]


* **To prevent misplacing of ratings value under wrong (ui, mi) position in dataframe; Instead of using only list of ratings, a dictionary {mi:ratings_value} can be used for every ui in  df_ratings.loc[ui]**
    * Approach : 1

In [51]:
import time
t1 = time.time()
mi_order_key= [trainset_FM[int(ui_movie_intercepts[i])][1] for i in range(len(ui_movie_intercepts))]#Outputs the list of movie_ids holding valid ui_mi intercepts, for precise mapping ratings to (ui,mi) intercept
corresponding_ratings_value = [trainset_FM[int(ui_movie_intercepts[i])][2] for i in range(len(ui_movie_intercepts))]
mi_ratings_dict = dict(zip(mi_order_key, corresponding_ratings_value))
print('exe time: ', time.time()-t1)

exe time:  0.0009975433349609375


* Approach 2

In [52]:
t1 = time.time()
mi_ratings_dict= dict()
for i in range(len(ui_movie_intercepts)):
    mi_ratings_dict.update({trainset_FM[int(ui_movie_intercepts[i])][1]:trainset_FM[int(ui_movie_intercepts[i])][2]})#mi_ratings_dict = {movie_id: corresponding_rating}
print('exe time:', time.time()-t1)
print(mi_ratings_dict)

exe time: 0.0
{1: 5, 2: 3, 3: 4, 4: 3, 5: 3, 6: 5, 7: 4, 8: 1, 9: 5, 10: 3, 11: 2, 12: 5, 13: 5, 14: 5, 15: 5, 16: 5, 17: 3, 18: 4, 19: 5, 20: 4, 21: 1, 22: 4, 23: 4, 24: 3, 25: 4, 26: 3, 27: 2, 28: 4, 29: 1, 30: 3, 31: 3, 32: 5, 33: 4, 34: 2, 35: 1, 36: 2, 37: 2, 38: 3, 39: 4, 40: 3, 41: 2, 42: 5, 43: 4, 44: 5, 45: 5, 46: 4, 47: 4, 48: 5, 49: 3, 50: 5, 51: 4, 52: 4, 53: 3, 54: 3, 55: 5, 56: 4, 57: 5, 58: 4, 59: 5, 60: 5, 61: 4, 62: 3, 63: 2, 64: 5, 65: 4, 66: 4, 67: 3, 68: 4, 69: 3, 70: 3, 71: 3, 72: 4, 73: 3, 74: 1, 75: 4, 76: 4, 77: 4, 78: 1, 79: 4, 80: 4, 81: 5, 82: 5, 83: 3, 84: 4, 85: 3, 86: 5, 87: 5, 88: 4, 89: 5, 90: 4, 91: 5, 92: 3, 93: 5, 94: 2, 95: 4, 96: 5, 97: 3, 98: 4, 99: 3, 100: 5, 101: 2, 102: 2, 103: 1, 104: 1, 105: 2, 106: 4, 107: 4, 108: 5, 109: 5, 110: 1, 111: 5, 112: 1, 113: 5, 114: 5, 115: 5, 116: 3, 117: 3, 118: 3, 119: 5, 120: 1, 121: 4, 122: 3, 123: 4, 124: 5, 125: 3, 126: 2, 127: 5, 128: 4, 129: 5, 130: 3, 131: 1, 132: 4, 133: 4, 134: 4, 135: 4, 136: 3, 137: 

In [55]:
import time
tick = time.time()
for idx in list(df_ratings.index):
    ui= np.where(trainset_FM[:,0]==idx)[0]#array of all indices across 0th column in trainset_FM, where user_id idx is spotted 
    
    #outputs list of indices of valid combination of user_id nad mov_id as (1,1),(1,2)..(1,1982) in trainset_FM 
    ui_movie_intercepts = np.array([np.intersect1d(ui, np.where(trainset_FM[:,1]==col)[0]) for col in np.array(df_ratings.columns)])
    #2nd array in np.intersect1d above changes with change in value of col, i.e., Movie_id
    
    ui_movie_intercepts= list(filter(lambda x: x.shape[0] != 0, ui_movie_intercepts))#dropping empty index arrays / invalid (ui,mi)
    #length of ui_movie_intercepts is trimmed down
    
    mi_ratings_dict= dict()
    for i in range(len(ui_movie_intercepts)):
        mi_ratings_dict.update({trainset_FM[int(ui_movie_intercepts[i])][1]:trainset_FM[int(ui_movie_intercepts[i])][2]})#mi_ratings_dict = {movie_id: corresponding_rating}
    
    #following stores the list of rating values corresponding to (1,1), (1,2). . (1,mov_id ) from trainset_FM into df_ratings, row-wise
    df_ratings.loc[idx]= mi_ratings_dict
    #df_ratings.loc[idx]= [trainset_FM[int(ui_movie_intercepts[i])][2] for i in range(len(ui_movie_intercepts))]
    
print('exe time: ', time.time()-tick)
#current execution time only accounts for defining & over-writing ui_movie_intercept arrays, NOT THE ASSIGNMENT OF INDIVIDUAL DATAFRAME VALUE
#Time may certainly soar up with assignment of values in df_ratings elements

exe time:  257.60632252693176


In [66]:
df_ratings.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
6,4.0,,,,,,2.0,4.0,4.0,,...,,,,,,,,,,
7,,,,5.0,,,5.0,5.0,5.0,4.0,...,,,,,,,,,,
8,,,,,,,3.0,,,,...,,,,,,,,,,
9,,,,,,5.0,4.0,,,,...,,,,,,,,,,
10,4.0,,,4.0,,,4.0,,4.0,,...,,,,,,,,,,


In [130]:
import time
tick = time.time()
for idx in list(df_ratings.index):
    ui= np.where(trainset_FM[:,0]==idx)[0]#array of all indices across 0th column in trainset_FM, where user_id idx is spotted 
    
    #outputs list of indices of valid combination of user_id nad mov_id as (1,1),(1,2)..(1,1982) in trainset_FM 
    ui_movie_intercepts = np.array([np.intersect1d(ui, np.where(trainset_FM[:,1]==col)[0]) for col in np.array(df_ratings.columns)])
    #2nd array in np.intersect1d above changes with change in value of col, i.e., Movie_id
    
    
    #df_ratings.loc[idx]= []
    
print('exe time: ', time.time()-tick)
#current execution time only accounts for defining & over-writing ui_movie_intercept arrays, NOT THE ASSIGNMENT OF INDIVIDUAL DATAFRAME VALUE
#Time may certainly soar up with assignment of values in df_ratings elements

exe time:  125.42661714553833


In [None]:
n_u = 

In [None]:
def user_mov_mat(mat):
    