#### This notebook implements various standard collaborative filtering (CF) methods, using the PCP procedures as input and the specialist procedures as output

## Load the data

In [1]:
import numpy as np
import pandas as pd

# http://surprise.readthedocs.io/en/stable/getting_started.html
# I believe in loading all the datasets from pandas df 
# you can also load dataset from csv and whatever suits


# load the data in a form suitable for recommender system models: user, item, rating

# Read the pre-split train / test ratings tables into pandas DataFrames.
ratings = pd.read_csv('data_processing/CF_data_train.csv')
ratings_test = pd.read_csv('data_processing/CF_data_test.csv')


from surprise import Reader, Dataset


def _ratings_to_dict(frame):
    """Collect the three columns Surprise needs from a ratings DataFrame.

    args:
      frame: DataFrame exposing `Proc`, `PC_enc` and `rating` columns
    returns:
      dict with the lists `itemID` (from Proc), `userID` (from PC_enc)
      and `rating` (from rating)
    """
    return {'itemID': list(frame.Proc),
            'userID': list(frame.PC_enc),
            'rating': list(frame.rating)}


# `Dataset.load_from_df` still needs a Reader, but only for its rating_scale.
# Both datasets use the same scale, so one Reader is shared between them.
reader = Reader(rating_scale=(0, 1.0))

# Train data
ratings_dict = _ratings_to_dict(ratings)
df = pd.DataFrame(ratings_dict)
# The columns must be passed in the order: user id, item id, rating.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

########################

# Test data (same conversion as for the train data)
ratings_test_dict = _ratings_to_dict(ratings_test)
df_test = pd.DataFrame(ratings_test_dict)
data_test = Dataset.load_from_df(df_test[['userID', 'itemID', 'rating']], reader)




# Run the models

### Train

In [2]:
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise.prediction_algorithms.co_clustering import CoClustering

# Use every rating in `data` for fitting; no random hold-out split is taken
# here (the draft alternative was train_test_split(data, test_size=.25)).
trainset = data.build_full_trainset()

# Co-clustering collaborative filtering with the library's default settings.
algo = CoClustering()

# Keep the fit call as the last expression of the cell so the fitted
# algorithm object is shown as the cell output.
algo.fit(trainset)

<surprise.prediction_algorithms.co_clustering.CoClustering at 0x7f8252917198>

### Test

In [3]:
# Load the complete test table.
# NOTE(review): this rebinds `data_test`, which the loading cell assigned to a
# Surprise Dataset - confirm nothing later expects the Dataset object.
data_test = pd.read_csv('data_processing/data_test.csv')

# Keep the first column plus every column from position 407 onwards, ordered
# by PC_enc.
# NOTE(review): 407 is presumably where the specialist-procedure columns
# start - confirm against the layout of data_test.csv.
id_column = data_test.columns[0]
target_columns = list(data_test.columns[407:])
my_test = data_test[[id_column] + target_columns].sort_values('PC_enc')

print(my_test.columns)
print(my_test['PC_enc'].head())

# One predicted-rating column per target column, one row per PC_enc value.
# NOTE(review): each column label is passed verbatim as the item id; confirm
# the labels match the `Proc` values the model was trained on.
df_new = pd.DataFrame({'PC_enc': my_test['PC_enc']})
for col in my_test.columns[1:]:
    df_new[col] = [
        algo.predict(uid=enc, iid=col, r_ui=0, verbose=False).est
        for enc in my_test['PC_enc']
    ]

print(df_new.head())

Index(['PC_enc', 'F0_y_y', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8',
       'F9', 'F10_y', 'F11_y', 'F12_y', 'F13_y', 'F14_y', 'F15_y', 'F16_y',
       'F17_y', 'F18_y', 'F19_y', 'F20_y', 'F21_y', 'F22_y', 'F23_y', 'F24_y',
       'F25_y', 'F26_y', 'F27_y', 'F28_y', 'F29_y', 'F30_y', 'F31_y', 'F32_y',
       'F33_y', 'F34_y', 'F35_y', 'F36_y', 'F37_y', 'F38_y', 'F39_y', 'F40_y',
       'F41_y', 'F42_y', 'F43_y', 'F44_y', 'F45_y', 'F46_y', 'F47_y', 'F48_y',
       'F49_y', 'F50_y', 'F51_y', 'F52_y', 'F53_y', 'F54_y', 'F55_y', 'F56_y',
       'F57_y', 'F58_y', 'F59_y', 'F60_y'],
      dtype='object')
660    131023122942
883    131023156693
885    131023214462
887    131023214801
888    131023242322
Name: PC_enc, dtype: int64
           PC_enc    F0_y_y        F1        F2   F3        F4        F5  \
660  131023122942  0.419003  0.285912  0.126592  0.0  0.165865  0.174652   
883  131023156693  0.419003  0.285912  0.126592  0.0  0.165865  0.174652   
885  131023214462  0.419003  0.28

In [4]:
# Save the prediction table `df_new` as CSV; index=False leaves the DataFrame
# index out of the written file.
df_new.to_csv('evaluation/data/CF_CClustering_pred.csv', index=False)

In [53]:
## Draft

def get_Iu(uid):
    """Count the items a given user has rated in the training set.

    args:
      uid: the raw id of the user
    returns:
      the number of ratings stored for that user in the global `trainset`,
      or 0 when the id cannot be mapped to an inner user id
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        n_items = len(trainset.ur[inner_uid])
    except ValueError:  # the user is not part of the trainset
        n_items = 0
    return n_items
    
def get_Ui(iid):
    """Count the users that have rated a given item in the training set.

    args:
      iid: the raw id of the item
    returns:
      the number of ratings stored for that item in the global `trainset`,
      or 0 when the id cannot be mapped to an inner item id
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
        n_users = len(trainset.ir[inner_iid])
    except ValueError:  # the item is not part of the trainset
        n_users = 0
    return n_users
    
# Tabulate the predictions and attach, per row, how many items the user rated
# (Iu) and how many users rated the item (Ui).
# NOTE(review): `predictions` is not assigned in any visible cell (the only
# assignment is commented out in the training cell) - confirm where it comes
# from before re-running. This also rebinds `df`, which the loading cell used
# for the training ratings.
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
df = pd.DataFrame(predictions, columns=prediction_columns)
df = df.assign(Iu=df['uid'].apply(get_Iu),
               Ui=df['iid'].apply(get_Ui))
# The draft's error analysis (err = |est - rui|, ten best / worst rows) is
# left disabled.

In [56]:

# Show every prediction row that belongs to one specific user id.
print(df.loc[df['uid'] == 131024494490])

             uid  iid  rui       est                    details   Iu     Ui
0   131024494490  F60  0.0  0.000000  {'was_impossible': False}  122  21680
1   131024494490   F6  0.0  0.156303  {'was_impossible': False}  122  21680
2   131024494490  F59  0.0  0.000000  {'was_impossible': False}  122  21680
3   131024494490  F58  0.0  0.000000  {'was_impossible': False}  122  21680
4   131024494490  F57  0.0  0.010011  {'was_impossible': False}  122  21680
5   131024494490  F56  0.0  0.000000  {'was_impossible': False}  122  21680
6   131024494490  F55  0.0  0.000000  {'was_impossible': False}  122  21680
7   131024494490  F54  0.0  0.000000  {'was_impossible': False}  122  21680
8   131024494490  F53  0.0  0.000000  {'was_impossible': False}  122  21680
9   131024494490  F16  0.0  0.025264  {'was_impossible': False}  122  21680
10  131024494490  F51  0.0  0.011449  {'was_impossible': False}  122  21680
11  131024494490  F44  0.0  0.000458  {'was_impossible': False}  122  21680
12  13102449

### Process data. 

In [15]:
# Element-wise OR of X and Y as a 0/1 integer matrix.
# NOTE(review): X, Y and N are not defined in any visible cell - confirm
# where they come from before re-running.
i = 34
A = np.logical_or(X, Y).astype(int)

# Eyeball the first 30 entries of row i in each of the three matrices.
for matrix in (X, Y, A):
    print(matrix[i, 0:30])

# Shuffle the rows.
# NOTE(review): no seed is set in the visible cells, so the permutation (and
# therefore the split below) presumably differs between runs.
I = np.random.permutation(N)
A = A[I, :]

# The first N_train shuffled rows form the train part, the remainder the test part.
N_train = 4500
A_train, A_test = A[:N_train, :], A[N_train:, :]


[1 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
[0 0 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0]
[1 0 1 1 1 0 1 1 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0]


## Model: MF

### Find Jaccard Similarity

In [23]:
# For each of the first 100 rows: take the p highest-scoring columns of
# `pred`, compare them with the columns equal to 1 in `A_test`, and store
# the size of the overlap divided by p.
p = 5
n_rows = 100
J = np.zeros(n_rows)
for row in range(n_rows):
    top_p = set(np.argsort(-pred[row, :])[:p])
    actual = set(np.where(A_test[row, :] == 1)[0])
    J[row] = len(top_p & actual) / p
    print(top_p, actual)

print(np.mean(J))

{43, 15, 53, 54, 29} {0, 32, 2, 4, 5, 37, 7, 10, 11, 14, 46, 16, 17, 18, 50, 23, 30}
{34, 36, 39, 15, 31} {0, 1, 12}
{0, 43, 15, 53, 26} {0, 1, 2, 39, 8, 15, 29}
{43, 15, 53, 27, 31} {0, 1, 4}
{34, 43, 13, 26, 27} {0, 1, 6, 39, 44, 13, 15, 50, 56, 29}
{1, 43, 53, 26, 31} {0, 1, 2, 39, 8, 29}
{13, 46, 54, 23, 26} {0, 6, 39, 13, 50, 56, 58, 29}
{43, 15, 52, 21, 22} {0, 1, 4, 5, 10, 28, 30, 31}
{43, 14, 50, 23, 26} {4, 43, 50, 21, 24, 30}
{7, 43, 15, 53, 27} {0, 2, 6}
{1, 14, 21, 23, 26} {0, 4, 39, 8, 10, 50, 56, 58, 29, 31}
{43, 13, 15, 53, 27} {2, 4, 6, 10, 13}
{1, 43, 13, 15, 53} {0, 1, 4, 36, 10, 17, 57, 28}
{34, 43, 15, 53, 27} {0, 1, 2, 4, 6, 9, 20}
{43, 13, 15, 53, 26} {0, 1, 39, 8, 15, 29}
{1, 3, 40, 21, 26} {0, 50, 21}
{43, 15, 53, 27, 31} {0, 1}
{43, 15, 53, 26, 27} {0, 6, 39, 13, 29}
{34, 43, 13, 53, 27} {2, 5, 6, 7, 9, 10, 11, 44, 13, 19, 21, 23}
{15, 52, 21, 22, 26} {0, 1, 4, 10, 21, 28, 30}
{0, 1, 36, 13, 26} {26}
{15, 53, 22, 24, 26} {27, 34, 19, 21}
{36, 5, 15, 52, 29} {4,

### probability threshold

In [49]:
# Per-row precision of the thresholded predictions: of the columns whose
# predicted score exceeds the threshold, the fraction that equal 1 in y_test.
# (The divisor is the number of predicted columns, not the size of the union,
# so despite the variable name this is not a Jaccard index.)
threshold = 0.4
n_rows = 1000

J = np.zeros(n_rows)
for i in range(n_rows):
    A = set(np.where(pred[i,:]>threshold)[0])
    B = set(np.where(y_test[i,:]==1)[0])
    # A row with no score above the threshold has nothing predicted; leave
    # its entry at 0.0 instead of dividing by zero.
    if A:
        J[i] = len(A.intersection(B))/len(A)

print(np.mean(J))

0.4736147716541023


### Check the percentage of the labels

In [50]:
# Column-wise means: first of the true labels, then of the predicted scores.
for values in (y_test, pred):
    print(np.mean(values, axis=0))

[0.56284658 0.43715342 0.21072089 0.00924214 0.24399261 0.1922366
 0.1284658  0.12107209 0.12939002 0.07486137 0.16358595 0.05822551
 0.08225508 0.08780037 0.06099815 0.10813309 0.04990758 0.09704251
 0.04251386 0.03789279 0.04158965 0.07855823 0.025878   0.08595194
 0.0194085  0.0323475  0.03327172 0.01571165 0.11460259 0.11460259
 0.01848429 0.05360444 0.02495379 0.02310536 0.04343808 0.02310536
 0.04066543 0.02402957 0.01756007 0.11460259 0.01571165 0.00554529
 0.02680222 0.01478743 0.01478743 0.01201479 0.00831793 0.02033272
 0.03142329 0.01201479 0.05360444 0.00369686 0.02402957 0.00739372
 0.01756007 0.0064695  0.02218115 0.01848429 0.02125693 0.0064695
 0.0064695 ]
[0.7230316  0.500941   0.328649   0.1599551  0.28746173 0.3005466
 0.31592745 0.24927387 0.13052641 0.1918064  0.22012499 0.21638663
 0.25570133 0.312817   0.22577952 0.21150313 0.20382686 0.32516876
 0.14882538 0.13547121 0.19084738 0.09776655 0.14411257 0.15672007
 0.2337083  0.11784337 0.16430606 0.20043522 0.14219

In [51]:
# Statistics of the predicted scores in column 3: share of rows scoring above
# 0.3, then the mean and the standard deviation of the column.
scores = pred[:, 3]
print(np.count_nonzero(scores > 0.3) / pred.shape[0])
print(np.mean(scores))
print(np.std(scores))

0.011090573012939002
0.1599551
0.033828806


### Print some of the predictions

In [57]:
# For the first 100 rows, print the set of columns whose predicted score
# exceeds 0.4 next to the set of columns equal to 1 in y_test, and store the
# fraction of predicted columns that are correct.
J = np.zeros(100)
for i in range(100):
    A = set(np.where(pred[i,:]>0.4)[0])
    B = set(np.where(y_test[i,:]==1)[0])
    # A row with no score above the threshold has nothing predicted; leave
    # its entry at 0.0 instead of dividing by zero.
    if A:
        J[i] = len(A.intersection(B))/len(A)
    print(A,B)



{0, 1} {0, 1, 2}
{0, 1} {0, 1, 39, 8, 12, 44, 29, 31}
{0, 1} {0, 1}
{0, 1} {4}
{0, 1, 2, 5, 6, 7, 9, 13, 17, 53, 27} {0, 1, 4, 6, 8, 9, 14, 49, 18, 19, 20, 21, 23, 25, 27}
{0, 1} {6}
{0, 1} {48}
{0, 1} {16, 32, 23, 7}
{0, 1} {0, 5}
{0, 1} {0, 1, 31}
{0, 1} {48}
{0, 1} {0, 5, 39, 40, 29}
{0, 1} {0, 1, 10, 28}
{0, 1} {2, 4, 6, 9, 13, 22}
{0, 1} {12}
{0, 1} {0, 1}
{0, 1, 6, 13, 60} {0, 1, 33, 34, 14, 20, 21}
{0, 1} {0, 1, 5, 39, 29}
{0, 1} {0, 1, 26}
{0, 1} {23}
{0, 1} {0, 1, 2, 4}
{0, 1} {21}
{0, 1} {0, 4, 5, 10, 17, 28}
{0, 1} {2}
{0, 1} {0, 4, 5, 17, 31}
{0, 1} {0, 1, 10, 12, 26}
{0, 1, 6} {0, 1, 35, 49}
{0, 1} set()
{0, 1} {0, 4}
{0, 1} {0, 5, 10, 11, 43, 17, 28}
{0, 1} {0}
{0, 1} {21, 4, 5, 52}
{0, 1} {16, 4, 5, 7}
{0, 1} {22}
{0, 1, 6} {0, 1, 39, 40, 29}
{0, 1} {0}
{0, 1} {0, 1, 4, 5, 13}
{0, 1} {0, 1, 39, 8, 12, 15, 29}
{0, 1, 6} {0, 1, 2, 42, 14}
{0, 1} {0, 1}
{0, 1, 2} {0, 1, 2, 4, 6, 9, 20}
{0, 1} {0, 4, 31}
{0, 1} {2, 5, 22}
{0, 1} {0, 34, 8, 14, 20, 26}
{0, 1, 6, 13, 17, 24, 6

In [55]:
# Scratch check: turn a set into a NumPy array by going through list().
A = set([1, 2])
print(np.array(list(A)))

[1 2]
