# Formation Data Science Challenge
This notebook includes the following steps for processing data and implementing collaborative algorithm for **Option 1** recommendation.

* Transforming data
* Training and tuning of model parameters
* Evaluation of model performance and selecting the optimal model

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from collections import Counter
import time
import sys
sys.path.append("..")
import scripts.data_layer as data_layer

## Data

In [3]:
input_file = 'trx_data.csv'
customer_file = 'recommend_1.csv'
output_file = 'option1_recommendations.csv'

In [4]:
formation = data_layer.Formation()

In [5]:
customers = formation.load_data(customer_file)
transactions = formation.load_data(input_file)

In [7]:
print("There are", customers.customerId.nunique(), "customers to be recommended, while overall there are", transactions.customerId.nunique(), "customers in the transactional records.")

There are 1000 customers to be recommended, while overall there are 24429 customers in the transactional records.


## Collaborative Filtering

### 1. Create user-item matrix

In [164]:
x = transactions.set_index('customerId')['products']

In [186]:
x.index.nunique()

24429

In [172]:
user = 0
dic_items = purchase_frequency(user)
dic_items

{1: 2,
 13: 1,
 19: 3,
 20: 1,
 31: 2,
 52: 1,
 69: 2,
 93: 3,
 136: 2,
 157: 1,
 198: 1,
 216: 1,
 255: 2,
 256: 1,
 260: 5}

In [179]:
# define items for each user listed
s = time.time()

dic_users = {}
for user in x.index.unique():
    dic_users[user] = items_bought(user)
    
print("Execution time:", (time.time()-s)/60, "minutes")

Execution time: 5.48611691792806 minutes


In [222]:
len(dic_users)

24429

In [201]:
# create user-item matrix
s = time.time()

df_matrix = pd.DataFrame(dic_users).T

print("Execution time:", (time.time()-s)/60, "minutes")

Execution time: 0.028925200303395588 minutes


## Use centered cosine similarity approach, aka pearson similarity

In [210]:
print(df_matrix.shape) # 24,429 users with 300 listed items
df_matrix.head()

(24429, 300)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,,2.0,,,,,,,,,...,,,,,,,,,,
1,,,6.0,,,,,,,,...,,,,1.0,,,1.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [280]:
matrix_user_items = np.array(df_matrix)
print(matrix_user_items.shape)
matrix_user_items

(24429, 300)


array([[ 0.,  2.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  6., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

### 1. Find similarity between every item i and j
* In order to do this, we want to first drop the user column
* We then want to calculate the similarity of each song with the rest of the songs.

In [214]:
(df_matrix.columns == range(300)).all()

True

In [215]:
# use numpy for more efficient, faster fill-in values

n_items = len(df_matrix.columns)
matrix_items = np.zeros([n_items, n_items])

In [218]:
matrix_items.shape

(300, 300)

In [226]:
df_matrix_orig = df_matrix.copy()

In [227]:
df_matrix = df_matrix.fillna(0)

In [234]:
# take item rows for all users (so each song will have a numpy array length of 24429)
# instead of having 0
s = time.time()

for i in range(n_items):
    for j in range(n_items):
        matrix_items[i][j] = 1-cosine(df_matrix[i], df_matrix[j]) #karena that item with that item should be 1 similarity

print("Execution time:", (time.time()-s)/60, "minutes")

Execution time: 0.23325901826222736 minutes


In [236]:
matrix_items.max()

1.0000000000000002

In [237]:
matrix_items.min()

0.0

In [233]:
matrix_items

array([[  0.00000000e+00,   7.67146932e-01,   8.61445036e-01, ...,
          9.67228316e-01,   9.51811953e-01,   9.60932982e-01],
       [  7.67146932e-01,   0.00000000e+00,   7.42186573e-01, ...,
          9.31358297e-01,   9.29584965e-01,   9.40035590e-01],
       [  8.61445036e-01,   7.42186573e-01,   0.00000000e+00, ...,
          9.41921995e-01,   9.07635296e-01,   9.66229562e-01],
       ..., 
       [  9.67228316e-01,   9.31358297e-01,   9.41921995e-01, ...,
          2.22044605e-16,   9.85465347e-01,   9.86978195e-01],
       [  9.51811953e-01,   9.29584965e-01,   9.07635296e-01, ...,
          9.85465347e-01,   0.00000000e+00,   9.79736556e-01],
       [  9.60932982e-01,   9.40035590e-01,   9.66229562e-01, ...,
          9.86978195e-01,   9.79736556e-01,   0.00000000e+00]])

### 2. Create size of the neighbor |N| = 10 as the closest neighbors to item i

In [299]:
n_neighbors = 10

matrix_neighbor_items = np.zeros([n_items, n_neighbors])
matrix_neighbor_indices = np.zeros([n_items, n_neighbors])

In [239]:
matrix_neighbor_items.shape

(300, 10)

In [305]:
for i in range(n_items):
    sorted_indices = np.argsort(matrix_items[i])[::-1][:n_neighbors]
    matrix_neighbor_indices[i] = sorted_indices
    matrix_neighbor_items[i] = matrix_items[i][sorted_indices]

In [306]:
matrix_neighbor_items[i][1:]

array([ 0.06917245,  0.06544502,  0.05996441,  0.05123681,  0.04829415,
        0.04083572,  0.04000979,  0.0393904 ,  0.03906702])

In [307]:
matrix_neighbor_indices[i][1:]

array([   4.,   24.,    1.,    9.,  242.,  144.,   13.,  262.,    0.])

### 3. Predict user items

In [262]:
df_matrix.index # user ids are in sorted format ascendingly

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            28581, 28583, 28585, 28588, 28590, 28593, 28596, 28598, 28604,
            28605],
           dtype='int64', length=24429)

In [264]:
n_users = len(df_matrix.index)
n_users

24429

In [281]:
matrix_pred = np.zeros([n_users, n_items])
matrix_pred.shape

(24429, 300)

In [333]:
# fill cosine similarity scores of the user-item matrix
s = time.time()

for user in range(n_users):
    for item in range(n_items):
        top_neighbor_item_scores = matrix_neighbor_items[item][1:]
        indices = matrix_neighbor_indices[item][1:].astype(np.int64)
        user_purchase = matrix_user_items[user][indices]
        matrix_pred[user][item] = sum(user_purchase*top_neighbor_item_scores)/sum(top_neighbor_item_scores)
        
print("Execution time:", (time.time()-s)/60, "minutes")

Execution time: 1.3145339846611024 minutes


In [334]:
print(matrix_pred.shape)
matrix_pred

(24429, 300)


array([[ 0.52140142,  0.31224023,  0.48867853, ...,  0.25403561,
         0.15000143,  0.35274162],
       [ 0.75409156,  1.06253278,  0.        , ...,  1.0563775 ,
         1.26938434,  0.        ],
       [ 0.        ,  0.0863715 ,  0.09070378, ...,  0.        ,
         0.08937676,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.1011597 ,
         0.        ,  0.        ]])

### 4. Ta da!!

In [339]:
n_recommendations = 10

In [340]:
matrix_recom_scores = np.zeros([n_users, n_recommendations])
matrix_recom_indices = np.zeros([n_users, n_recommendations])

for user in range(n_users):
    sorted_indices = np.argsort(matrix_pred[user])[::-1][:n_recommendations]
    matrix_recom_indices[user] = sorted_indices
    matrix_recom_scores[user] = matrix_pred[user][sorted_indices]

In [341]:
matrix_recom_scores

array([[ 1.19251116,  1.15045529,  1.12172604, ...,  0.92113121,
         0.79271792,  0.76159656],
       [ 1.90311407,  1.59852763,  1.44252037, ...,  1.23194691,
         1.2239327 ,  1.22190491],
       [ 1.19074757,  1.12578236,  1.00379735, ...,  0.33174185,
         0.31780255,  0.30647281],
       ..., 
       [ 0.11992877,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.14043791,  0.12566915,  0.12368609, ...,  0.0732264 ,
         0.        ,  0.        ],
       [ 0.16851937,  0.11596632,  0.1011597 , ...,  0.07353002,
         0.        ,  0.        ]])

In [342]:
matrix_recom_indices

array([[ 217.,  139.,  238., ...,  174.,   31.,   69.],
       [  39.,   14.,  254., ...,   41.,  150.,  227.],
       [  29.,   23.,  107., ...,  186.,  207.,   31.],
       ..., 
       [ 108.,  299.,  101., ...,   99.,  100.,  102.],
       [ 198.,   45.,  218., ...,  252.,  299.,   99.],
       [ 175.,  265.,  297., ...,  224.,  101.,   77.]])

In [360]:
df_recommend = pd.DataFrame(matrix_recom_indices)
df_recommend['customerId'] = df_matrix.index
top_10_recommends = df_recommend[['customerId']+list(df_recommend.columns[:n_recommendations])].astype(np.int64).set_index('customerId')
top_10_recommends

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,217,139,238,230,255,157,43,174,31,69
1,39,14,254,21,3,298,251,41,150,227
2,29,23,107,11,212,164,114,186,207,31
3,117,42,183,122,46,266,25,159,269,160
4,66,268,200,249,127,145,146,12,7,233
5,259,124,272,0,296,110,14,143,73,1
6,242,240,12,28,299,279,153,215,233,203
7,291,172,20,81,198,18,16,55,277,287
8,132,225,38,248,48,194,261,36,166,291
9,259,0,14,112,162,296,276,13,31,295


In [361]:
df_top10 = top_10_recommends.loc[customers.customerId]

### Output

In [362]:
df_top10['recommendedProducts'] = df_top10[list(range(n_recommendations))].apply(lambda x: '|'.join(x.fillna('').map(str)), axis=1)

In [364]:
df_top10

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,recommendedProducts
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1553,35,124,143,2,167,41,111,162,199,269,35|124|143|2|167|41|111|162|199|269
20400,280,122,246,284,299,98,94,95,96,97,280|122|246|284|299|98|94|95|96|97
19750,225,63,48,112,156,269,36,79,38,208,225|63|48|112|156|269|36|79|38|208
6334,14,3,254,162,161,263,19,150,151,1,14|3|254|162|161|263|19|150|151|1
27773,159,131,290,56,42,176,9,201,64,16,159|131|290|56|42|176|9|201|64|16
20027,208,244,66,39,96,263,200,249,273,197,208|244|66|39|96|263|200|249|273|197
10104,260,288,243,127,94,95,96,97,98,99,260|288|243|127|94|95|96|97|98|99
5522,194,36,8,48,79,38,229,261,166,180,194|36|8|48|79|38|229|261|166|180
9978,71,34,30,125,178,31,298,67,69,62,71|34|30|125|178|31|298|67|69|62
20966,189,60,72,171,19,103,69,178,0,209,189|60|72|171|19|103|69|178|0|209


In [367]:
df_top10[['recommendedProducts']].to_csv('../output/option1_recommendations.csv')

### Evaluation

In [None]:
# how does the model perform?
# need to use cross validation, and splitting train and test split!
