In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
features = pd.read_pickle('data/features.pickle')

# Preprocessing

In [3]:
features.dtypes

products_bought                 int64
made_purchase                    bool
events_count                    int64
product_views                   int64
total_duration                float64
mean_event_duration           float64
products_viewed                object
mean_time_per_product         float64
user_id                       float64
is_female                        bool
weekday_cos_min               float64
weekday_cos_max               float64
weekday_sin_min               float64
weekday_sin_max               float64
hour_sin_min                  float64
hour_sin_max                  float64
hour_cos_min                  float64
hour_cos_max                  float64
cat_0                          object
cat_1                          object
cat_2                          object
cat_3                          object
syntetic_duration                bool
delivery_duration             float64
syntetic_delivery_duration       bool
dtype: object

feature matrix contains mostly real number or integer values but there are some non numerical types that we must take care of

### Object columns

In [4]:
objects = ['products_viewed', 'cat_0', 'cat_1', 'cat_2', 'cat_3']

In [5]:
features[objects].head()

Unnamed: 0_level_0,products_viewed,cat_0,cat_1,cat_2,cat_3
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100001.0,{1001.0},{telefony i akcesoria},{telefony stacjonarne},{},{}
100002.0,"{1276.0, 1277.0}",{komputery},{tablety i akcesoria},{tablety},{}
100003.0,"{1276.0, 1277.0}",{komputery},{tablety i akcesoria},{tablety},{}
100004.0,"{1030.0, 1032.0, 1033.0, 1034.0, 1036.0, 1037....",{komputery},{monitory},{monitory lcd},{}
100005.0,"{1233.0, 1234.0}",{sprzęt rtv},{video},{odtwarzacze dvd},{}


all of those features are sets of labels that must be encoded

to do so we use one hot encoding technique as the labels are not ordinal

### Products viewed

In [6]:
features.products_viewed.apply(lambda r: str(r)).value_counts(normalize=True)

{1283.0}                                    0.066193
{1318.0}                                    0.064134
{1001.0}                                    0.060016
{1281.0, 1278.0}                            0.032119
{1276.0, 1277.0}                            0.028927
                                              ...   
{1056.0, 1048.0, 1050.0, 1051.0, 1055.0}    0.000103
{1002.0, 1075.0, 1076.0, 1077.0, 1078.0}    0.000103
{1056.0, 1057.0, 1049.0, 1051.0, 1052.0}    0.000103
{1032.0, 1036.0, 1037.0, 1038.0}            0.000103
{1288.0, 1292.0, 1285.0, 1286.0}            0.000103
Name: products_viewed, Length: 2018, dtype: float64

In [7]:
processed_products = pd.get_dummies(features.products_viewed.apply(lambda r: str(r)))
processed_products.shape

(9714, 2018)

### Category 0

In [8]:
features.cat_0.apply(lambda r: str(r)).value_counts(normalize=True)

{'sprzęt rtv'}              0.334054
{'telefony i akcesoria'}    0.254169
{'komputery'}               0.207227
{'gry i konsole'}           0.204550
Name: cat_0, dtype: float64

In [9]:
processed_cat0 = pd.get_dummies(features.cat_0.apply(lambda r: str(r)))
processed_cat0.shape

(9714, 4)

### Category 1

In [10]:
features.cat_1.apply(lambda r: str(r)).value_counts(normalize=True)

{'video'}                      0.200329
{'gry na konsole'}             0.134651
{'akcesoria telefoniczne'}     0.121577
{'telefony komórkowe'}         0.072576
{'monitory'}                   0.071032
{'tablety i akcesoria'}        0.070826
{'gry komputerowe'}            0.069899
{'przenośne audio i video'}    0.069590
{'drukarki i skanery'}         0.065370
{'audio'}                      0.064134
{'telefony stacjonarne'}       0.060016
Name: cat_1, dtype: float64

In [11]:
processed_cat1 = pd.get_dummies(features.cat_1.apply(lambda r: str(r)))
processed_cat1.shape

(9714, 11)

### Category 2

In [12]:
features.cat_2.apply(lambda r: str(r)).value_counts(normalize=True)

set()                                    0.202491
{'telewizory i akcesoria'}               0.135578
{'monitory lcd'}                         0.071032
{'tablety'}                              0.070826
{'odtwarzacze mp3 i mp4'}                0.069590
{'gry playstation3'}                     0.068561
{'gry xbox 360'}                         0.066090
{'biurowe urządzenia wielofunkcyjne'}    0.065370
{'zestawy głośnomówiące'}                0.064958
{'odtwarzacze dvd'}                      0.064752
{'słuchawki'}                            0.064134
{'zestawy słuchawkowe'}                  0.056619
Name: cat_2, dtype: float64

In [13]:
processed_cat2 = pd.get_dummies(features.cat_2.apply(lambda r: str(r)))
processed_cat2.shape

(9714, 12)

### Category 3

In [14]:
features.cat_3.apply(lambda r: str(r)).value_counts(normalize=True)

set()             0.864422
{'anteny rtv'}    0.069384
{'okulary 3d'}    0.066193
Name: cat_3, dtype: float64

In [15]:
processed_cat3 = pd.get_dummies(features.cat_3.apply(lambda r: str(r)))
processed_cat3.shape

(9714, 3)

## New features matrix

In [16]:
preprocessed = features.copy()
preprocessed = preprocessed.drop(objects, axis=1)
preprocessed.shape

(9714, 20)

In [17]:
datasets = [preprocessed, processed_products, processed_cat0, processed_cat1, processed_cat2, processed_cat3]
preprocessed = pd.concat(datasets, axis=1)
preprocessed.shape

(9714, 2068)

# Split data

In [18]:
df = preprocessed.reset_index(drop=True)
train, dev, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
print(f'train data: {len(train)} rows, dev data: {len(dev)} rows, test data: {len(test)} rows')

train data: 5828 rows, dev data: 1943 rows, test data: 1943 rows


In [28]:
input_cols = preprocessed.drop('made_purchase', axis=1).columns
output_col = 'made_purchase'

# Train

In [20]:
from sklearn import svm

In [29]:
x = train[input_cols]
y = train[output_col]

In [30]:
clf = svm.SVC()
clf.fit(x, y)

SVC()

# Test

In [34]:
response = clf.predict(test[input_cols])

In [36]:
results = pd.DataFrame(data={'y': test[output_col], 'response': response})
results['is_correct'] = results.y == results.response

In [37]:
print(f'the score of model a is {round(results.is_correct.sum() / len(results), 4)}')

the score of model a is 0.9058
