In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Load the feature matrix from a pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
features = pd.read_pickle('data/features.pickle')

# Preprocessing

In [3]:
# List each column's dtype to spot the non-numeric columns that need encoding.
features.dtypes

products_bought                 int64
made_purchase                    bool
events_count                    int64
product_views                   int64
total_duration                float64
mean_event_duration           float64
products_viewed                object
mean_time_per_product         float64
user_id                       float64
is_female                        bool
weekday_cos_min               float64
weekday_cos_max               float64
weekday_sin_min               float64
weekday_sin_max               float64
hour_sin_min                  float64
hour_sin_max                  float64
hour_cos_min                  float64
hour_cos_max                  float64
cat_0                          object
cat_1                          object
cat_2                          object
cat_3                          object
syntetic_duration                bool
delivery_duration             float64
syntetic_delivery_duration       bool
dtype: object

The feature matrix contains mostly real-valued or integer columns, but there are some non-numerical types that we must take care of.

### Object columns

In [4]:
# Columns stored with dtype `object`; these have to be encoded before modelling.
objects = [
    'products_viewed',
    'cat_0',
    'cat_1',
    'cat_2',
    'cat_3',
]

In [5]:
# Preview the first five rows of the object-typed columns.
features.loc[:, objects].head(5)

Unnamed: 0_level_0,products_viewed,cat_0,cat_1,cat_2,cat_3
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100001.0,{1001.0},{telefony i akcesoria},{telefony stacjonarne},{},{}
100002.0,"{1276.0, 1277.0}",{komputery},{tablety i akcesoria},{tablety},{}
100003.0,"{1276.0, 1277.0}",{komputery},{tablety i akcesoria},{tablety},{}
100004.0,"{1030.0, 1032.0, 1033.0, 1034.0, 1036.0, 1037....",{komputery},{monitory},{monitory lcd},{}
100005.0,"{1233.0, 1234.0}",{sprzęt rtv},{video},{odtwarzacze dvd},{}


All of those features are sets of labels that must be encoded.

To do so, we use one-hot encoding, since the labels are not ordinal.

### Products viewed

In [6]:
# One indicator column per distinct *set* of viewed products (the set's string form is the category).
# The prefix ties each dummy to its source column so its name stays unique once the
# dummies of several set-valued columns are concatenated side by side.
processed_products = pd.get_dummies(
    features['products_viewed'].map(str),
    prefix='products_viewed',
)
processed_products.shape

(9714, 2018)

### Category 0

In [7]:
# One indicator column per distinct set of level-0 categories (the set's string form is the category).
# The prefix ties each dummy to its source column so its name stays unique once the
# dummies of several set-valued columns are concatenated side by side.
processed_cat0 = pd.get_dummies(
    features['cat_0'].map(str),
    prefix='cat_0',
)
processed_cat0.shape

(9714, 4)

### Category 1

In [8]:
# One indicator column per distinct set of level-1 categories (the set's string form is the category).
# The prefix ties each dummy to its source column so its name stays unique once the
# dummies of several set-valued columns are concatenated side by side.
processed_cat1 = pd.get_dummies(
    features['cat_1'].map(str),
    prefix='cat_1',
)
processed_cat1.shape

(9714, 11)

### Category 2

In [9]:
# One indicator column per distinct set of level-2 categories (the set's string form is the category).
# An empty set stringifies to 'set()', so without a prefix this dummy's name would clash with the
# same dummy produced from any other set-valued column that also has empty cells.
processed_cat2 = pd.get_dummies(
    features['cat_2'].map(str),
    prefix='cat_2',
)
processed_cat2.shape

(9714, 12)

### Category 3

In [10]:
# One indicator column per distinct set of level-3 categories (the set's string form is the category).
# An empty set stringifies to 'set()', so without a prefix this dummy's name would clash with the
# same dummy produced from any other set-valued column that also has empty cells.
processed_cat3 = pd.get_dummies(
    features['cat_3'].map(str),
    prefix='cat_3',
)
processed_cat3.shape

(9714, 3)

## New features matrix

In [11]:
# Keep only the non-object columns; drop() returns a new frame, so `features` is left untouched.
preprocessed = features.drop(columns=objects)
# Place the one-hot blocks next to the remaining columns, aligned on the row index.
datasets = [
    preprocessed,
    processed_products,
    processed_cat0,
    processed_cat1,
    processed_cat2,
    processed_cat3,
]
preprocessed = pd.concat(datasets, axis='columns')
preprocessed.shape

(9714, 2068)

### Reducing dimensions

After one-hot encoding the set-valued columns we have 2068 features. This number can be reduced by encoding the presence of each individual label with its own boolean flag (multi-hot encoding), so that a set of two labels has two columns set to True. This also makes sense when adding two vectors: adding a set with label A to a set with label B results in a set that contains both label A and label B. We will create another DataFrame, this time using this second technique.

In [12]:
def multi_hot_encoder(s: pd.Series) -> pd.DataFrame:
    """Encode a Series of label collections as one boolean column per label.

    Each cell of ``s`` is expected to be an iterable of hashable labels (the
    notebook passes sets). The result shares the index of ``s`` and has one
    column per distinct label, in ascending order; a cell is True in column
    ``c`` when label ``c`` is a member of that row's collection.

    Missing labels (None/NaN) never become columns. A cell that is itself
    missing (None/NaN instead of a collection) is treated as an empty
    collection instead of raising.

    Parameters
    ----------
    s : pd.Series
        Series whose values are collections (e.g. sets) of labels.

    Returns
    -------
    pd.DataFrame
        Boolean indicator frame indexed like ``s``; it has no columns when no
        label occurs at all.
    """
    def is_missing(value) -> bool:
        # Only scalars can be "missing"; pd.isna on a collection does not yield a single bool.
        return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))

    # Normalise every cell once so label discovery and membership tests share the same data.
    rows = [
        frozenset()
        if is_missing(cell)
        else frozenset(label for label in cell if not is_missing(label))
        for cell in s
    ]
    labels = sorted(set().union(*rows))
    if not labels:
        return pd.DataFrame(index=s.index)

    # Build all columns up front: inserting them one at a time fragments the frame
    # and needs a separate pass over the Series per label.
    columns = {label: [label in row for row in rows] for label in labels}
    return pd.DataFrame(columns, index=s.index)

In [21]:
# Start from the non-object columns of the feature matrix.
preprocessed_multi_hot = features.copy()
preprocessed_multi_hot = preprocessed_multi_hot.drop(objects, axis=1)
datasets_multi_hot = [preprocessed_multi_hot]

for c in objects:
    print(c)
    # Prefix each indicator with its source column name. This turns every column label
    # into a string (product labels are floats) and stops identical labels coming from
    # different source columns from producing duplicate column names after the concat.
    datasets_multi_hot.append(multi_hot_encoder(features[c]).add_prefix(f'{c}_'))

preprocessed_multi_hot = pd.concat(datasets_multi_hot, axis=1)
preprocessed_multi_hot.shape

products_viewed
cat_0
cat_1
cat_2
cat_3


(9714, 121)

This method yields an encoded dataset with only 121 features. We will examine which encoding gives better results later on.

# Split data

### One hot encoding

In [22]:
df = preprocessed.reset_index(drop=True)
# Shuffle with a fixed seed so the train/dev/test membership (and every score
# derived from it) is reproducible across kernel restarts.
shuffled = df.sample(frac=1, random_state=42)
# 60% train / 20% dev / 20% test. Positional slicing replaces np.split, which on a
# DataFrame relies on DataFrame.swapaxes - deprecated in recent pandas releases.
train_end, dev_end = int(.6*len(df)), int(.8*len(df))
train = shuffled.iloc[:train_end]
dev = shuffled.iloc[train_end:dev_end]
test = shuffled.iloc[dev_end:]
print(f'train data: {len(train)} rows, dev data: {len(dev)} rows, test data: {len(test)} rows')

train data: 5828 rows, dev data: 1943 rows, test data: 1943 rows


In [None]:
# The target column, and every remaining column of the one-hot matrix as a model input.
# NOTE(review): the inputs therefore include `products_bought` and `user_id`; the former
# looks like it could leak the label - confirm before trusting the reported score.
output_col = 'made_purchase'
input_cols = preprocessed.drop(columns=output_col).columns

### Multi hot encoding

In [24]:
df_m = preprocessed_multi_hot.reset_index(drop=True)
# Shuffle with a fixed seed so the train/dev/test membership (and every score
# derived from it) is reproducible across kernel restarts.
shuffled_m = df_m.sample(frac=1, random_state=42)
# 60% train / 20% dev / 20% test. Positional slicing replaces np.split, which on a
# DataFrame relies on DataFrame.swapaxes - deprecated in recent pandas releases.
train_end_m, dev_end_m = int(.6*len(df_m)), int(.8*len(df_m))
train_m = shuffled_m.iloc[:train_end_m]
dev_m = shuffled_m.iloc[train_end_m:dev_end_m]
test_m = shuffled_m.iloc[dev_end_m:]
print(f'multi hot train data: {len(train_m)} rows, dev data: {len(dev_m)} rows, test data: {len(test_m)} rows')

multi hot train data: 5828 rows, dev data: 1943 rows, test data: 1943 rows


In [30]:
# The target column, and every remaining column of the multi-hot matrix as a model input.
# NOTE(review): the inputs therefore include `products_bought` and `user_id`; the former
# looks like it could leak the label - confirm before trusting the reported score.
output_col_m = 'made_purchase'
input_cols_m = preprocessed_multi_hot.drop(columns=output_col_m).columns

# Train

### One hot encoding

In [26]:
from sklearn import svm

In [27]:
# Training inputs and labels for the one-hot encoded model.
x, y = train[input_cols], train[output_col]

In [28]:
# Fit a support vector classifier with scikit-learn's default hyperparameters on the one-hot features.
# NOTE(review): no feature scaling is applied in this section - SVMs are sensitive to
# feature scale, so consider standardising the inputs.
clf = svm.SVC()
clf.fit(x, y)

SVC()

### Multi hot encoding

In [31]:
# Training inputs and labels for the multi-hot encoded model.
x_m, y_m = train_m[input_cols_m], train_m[output_col_m]

In [32]:
# Fit a support vector classifier with scikit-learn's default hyperparameters on the multi-hot features.
# NOTE(review): no feature scaling is applied in this section - SVMs are sensitive to
# feature scale, so consider standardising the inputs.
clf_m = svm.SVC()
clf_m.fit(x_m, y_m)

SVC()

# Adjustments

In [41]:
# Dev-split inputs and labels for the one-hot encoding.
# NOTE(review): neither name is used again in this section.
x_dev, y_dev = dev[input_cols], dev[output_col]

# Test

### One hot encoder

In [33]:
# Predict the target for the held-out test rows with the one-hot model.
response = clf.predict(test[input_cols])

In [34]:
# Put the true labels next to the predictions and flag the rows the model got right.
results = pd.DataFrame({'y': test[output_col], 'response': response})
results['is_correct'] = results['y'].eq(results['response'])

In [35]:
# Accuracy: fraction of test rows predicted correctly, rounded to 4 decimals.
print(f'the score of model a is {round(results.is_correct.sum() / len(results), 4)}')

the score of model a is 0.9048


### Multi hot encoder

In [37]:
# Predict the target for the held-out test rows with the multi-hot model.
response_m = clf_m.predict(test_m[input_cols_m])

In [39]:
# Put the true labels next to the predictions and flag the rows the model got right.
results_m = pd.DataFrame({'y': test_m[output_col_m], 'response': response_m})
results_m['is_correct'] = results_m['y'].eq(results_m['response'])

In [40]:
# Accuracy: fraction of test rows predicted correctly, rounded to 4 decimals.
print(f'the score of model a with multi encoding is {round(results_m.is_correct.sum() / len(results_m), 4)}')

the score of model a with multi encoding is 0.9074


# Conclusions

Although model a with multi-hot encoding runs much faster than with one-hot encoding (121 vs 2068 features!), its score is about the same. We will stick with multi-hot encoding because it is faster and easier to work with.