# Decison Trees

In [20]:
def predict(X_train, y_train, X_test, y_test):
    y_train_pred = clf.predict(X_train)
    print("Training accuracy on all features: {:.3f}".format(acc(y_train, y_train_pred)))

    y_test_pred = clf.predict(X_test)
    print("Testing accuracy on all features: {:.3f}".format(acc(y_test, y_test_pred)))

def cross_val_res(clf, X, y, fold):
    scores = cross_val_score(clf, X.values, y.values, cv=fold)
    print(scores)
    print(scores.mean())

In [56]:
import numpy as np
import pandas as pd
from sklearn import tree

#input_file = "data_3824.csv"
input_file = "data.csv"
data = pd.read_csv(input_file, header = 0)

In [6]:
data.head()

Unnamed: 0,user_id,days_last_order,total_orders,total_value,unique_days_visited,total_sessions,avg_duration_btw_login,avg_session_time,avg_page_visits,label
0,151,5,34,152975,130,404,1.39,437.01,27.42,0
1,332,7,3,2959,17,24,4.71,767.08,31.58,1
2,332,14,13,8791,58,101,1.88,274.68,21.73,0
3,443,22,15,7873,94,177,1.74,200.95,20.94,0
4,444,19,13,9875,101,198,1.82,221.55,18.62,0


In [57]:
df = data[data.columns[1:]]
df = df[df['total_orders']<100]
df.head()

Unnamed: 0,days_last_order,total_orders,total_value,unique_days_visited,total_sessions,avg_duration_btw_login,avg_session_time,avg_page_visits,label
0,5,34,152975,130,404,1.39,437.01,27.42,0
1,7,3,2959,17,24,4.71,767.08,31.58,1
2,14,13,8791,58,101,1.88,274.68,21.73,0
3,22,15,7873,94,177,1.74,200.95,20.94,0
4,19,13,9875,101,198,1.82,221.55,18.62,0


In [58]:
features = list(df.columns[:-1])
features

['days_last_order',
 'total_orders',
 'total_value',
 'unique_days_visited',
 'total_sessions',
 'avg_duration_btw_login',
 'avg_session_time',
 'avg_page_visits']

In [59]:
y = df["label"]
X = df[features]

In [48]:
y

0       1
1       0
2       0
3       0
4       0
       ..
3819    0
3820    1
3821    0
3822    1
3823    0
Name: label, Length: 3824, dtype: int64

In [60]:
cols = ['days_last_order', 'total_orders', 'total_value', 'unique_days_visited',
       'total_sessions', 'avg_duration_btw_login', 'avg_session_time',
       'avg_page_visits']
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score as acc

X_train, X_test, y_train, y_test = train_test_split(df[cols].values, df['label'].values, test_size = 0.25, random_state = 123)

y_train = y_train.ravel()
y_test = y_test.ravel()

print("Training dataset shape: {}, {}".format(X_train.shape, y_train.shape))
print("Testing dataset shape: {}, {}".format(X_test.shape, y_test.shape))

Training dataset shape: (3591, 8), (3591,)
Testing dataset shape: (1197, 8), (1197,)


In [61]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [12]:
X_train

array([[ 1.85257769, -0.35920455, -0.21550768, ..., -0.34826837,
         1.22861779,  1.53861403],
       [ 0.50965265, -0.61661894, -0.21223535, ...,  0.04973726,
         0.24950977, -0.81951837],
       [-0.7111883 ,  1.27108661, -0.00829815, ..., -0.7106317 ,
         2.39753455,  1.14545401],
       ...,
       [ 0.26548446,  0.15562424,  0.23807637, ..., -0.44331449,
         0.74857665,  0.54040144],
       [ 1.73049359,  0.75625782, -0.14858507, ...,  1.87937508,
        -0.03291488, -0.17556365],
       [-1.32160877, -0.70242374, -0.25241866, ..., -0.18193766,
        -0.57308455, -0.60928122]])

In [48]:
clf = tree.DecisionTreeClassifier(max_depth=8)
clf = clf.fit(X_train,y_train)

predict(X_train, y_train, X_test, y_test)
print("Cross validation")
cross_val_res(clf, X, y, 5)

Training accuracy on all features: 0.843
Testing accuracy on all features: 0.763
Cross validation
[0.76826722 0.76617954 0.77557411 0.75757576 0.76907001]
0.7673333289703601


In [53]:
clf = tree.DecisionTreeClassifier(max_depth=8)
clf = clf.fit(X_train_sm,y_train_sm)

predict(X_train, y_train, X_test, y_test)
print("Cross validation")
cross_val_res(clf, X, y, 5)

Training accuracy on all features: 0.489
Testing accuracy on all features: 0.312
Cross validation
[0.77453027 0.76200418 0.78079332 0.75966562 0.76907001]
0.7692126796726898


## Ensemble learning: using a random forest

In [54]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=8)
clf = clf.fit(X_train, y_train)

predict(X_train, y_train, X_test, y_test)
print("Cross validation")
cross_val_res(clf, X, y, 5)

Training accuracy on all features: 0.822
Testing accuracy on all features: 0.527
Cross validation
[0.80375783 0.79853862 0.80271399 0.79728318 0.8014629 ]
0.8007513039836127


In [101]:
%%time
!!!!!!!!!!!!!!!!!!!!!!!!!!! takes so much time
from sklearn import svm
# Build an SVC model
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

predict(X_train, y_train, X_test, y_test)
print("Cross validation")
cross_val_res(clf, X, y, 5)

Training accuracy on all features: 0.774
Training accuracy on all features: 0.759
Cross validation
[0.7503268  0.75686275 0.74379085 0.7620915  0.7604712 ]
0.7547086199226636


In [62]:
%%time
from sklearn.metrics import accuracy_score
import xgboost as xgb

train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)
prev = 0
for depth in range(8):
    for eta in range(30):
        for epoch in range(20):
            param = {
                'max_depth': depth+1,
                'eta': (eta+1)/10,
                'objective': 'multi:softmax',
                'num_class': 3} 
            epochs = epoch 
            model = xgb.train(param, train, epochs)
            predictions = model.predict(test)
            acc = accuracy_score(y_test, predictions)*100
            #if epoch%5 == 0: print('.',end='')
            if prev<acc:
                prev = acc
                #print()
                print("depth={}, eta={}, epochs={}, acc={}".format(depth+1,(eta+1)/100,epoch,acc))
print('Done')

depth=1, eta=0.01, epochs=0, acc=79.03091060985797
depth=1, eta=0.05, epochs=14, acc=79.11445279866332
depth=1, eta=0.05, epochs=16, acc=79.36507936507937
depth=2, eta=0.06, epochs=7, acc=79.4486215538847
depth=2, eta=0.06, epochs=9, acc=79.53216374269006
depth=4, eta=0.03, epochs=11, acc=79.61570593149541
depth=4, eta=0.03, epochs=14, acc=79.69924812030075
depth=4, eta=0.03, epochs=15, acc=79.9498746867168
Done
Wall time: 2min 42s


In [120]:
import tensorflow as tf
# Training parameters.
learning_rate = 0.001
training_steps = 3000
batch_size = 500
display_step = 100
num_features = 8
num_classes = 2


In [32]:
from tensorflow import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop


In [33]:

train_images = X_train_sm.astype('float32')
test_images = X_test.astype('float32')
train_labels = keras.utils.to_categorical(y_train_sm, 2)
test_labels = keras.utils.to_categorical(y_test, 2)

In [34]:
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(8,)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))

In [35]:
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

In [36]:
history = model.fit(train_images, train_labels,
                    batch_size=100,
                    epochs=100,
                    verbose=2,
                    validation_data=(test_images, test_labels))

Epoch 1/100
58/58 - 1s - loss: 0.6777 - accuracy: 0.5684 - val_loss: 0.6183 - val_accuracy: 0.6483
Epoch 2/100
58/58 - 0s - loss: 0.6554 - accuracy: 0.6060 - val_loss: 0.6459 - val_accuracy: 0.5890
Epoch 3/100
58/58 - 0s - loss: 0.6489 - accuracy: 0.6237 - val_loss: 0.6271 - val_accuracy: 0.6115
Epoch 4/100
58/58 - 0s - loss: 0.6516 - accuracy: 0.6198 - val_loss: 0.6450 - val_accuracy: 0.5898
Epoch 5/100
58/58 - 0s - loss: 0.6388 - accuracy: 0.6363 - val_loss: 0.6165 - val_accuracy: 0.6124
Epoch 6/100
58/58 - 0s - loss: 0.6407 - accuracy: 0.6401 - val_loss: 0.6402 - val_accuracy: 0.5890
Epoch 7/100
58/58 - 0s - loss: 0.6397 - accuracy: 0.6424 - val_loss: 0.6162 - val_accuracy: 0.6157
Epoch 8/100
58/58 - 0s - loss: 0.6360 - accuracy: 0.6474 - val_loss: 0.6291 - val_accuracy: 0.6132
Epoch 9/100
58/58 - 0s - loss: 0.6313 - accuracy: 0.6550 - val_loss: 0.6466 - val_accuracy: 0.5764
Epoch 10/100
58/58 - 0s - loss: 0.6287 - accuracy: 0.6495 - val_loss: 0.6534 - val_accuracy: 0.5890
Epoch 11/

In [42]:
score = model.evaluate(test_images, test_labels, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.4950062930583954
Test accuracy: 0.7928153872489929


In [51]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train, y_train = sm.fit_resample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_sm.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_sm.shape))
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_sm == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_sm == 0)))

Before OverSampling, counts of label '1': 704
Before OverSampling, counts of label '0': 2887 

After OverSampling, the shape of train_X: (5774, 8)
After OverSampling, the shape of train_y: (5774,) 

After OverSampling, counts of label '1': 2887
After OverSampling, counts of label '0': 2887
