### Funciones y librerías usadas en el Notebook

In [1]:
import pandas as pd
import numpy as np
# from scipy import stats
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns', 30)

In [2]:
# Call example:
  # extract_date_info(df, 'month')
def extract_date_info(df, parametro):
    if parametro == "date":
        df["date"] = [d.date() for d in df["visitStartTime"]]
    if parametro == "time":
        df["time"] = [d.time() for d in df["visitStartTime"]]
    if parametro == "hour":
        df["hour"] = [d.hour for d in df["visitStartTime"]]
    if parametro == "day":
        df["day"] = df["visitStartTime"].dt.day
    if parametro == "month":
        df['month'] = df["visitStartTime"].dt.month
    if parametro == "weekday":
        df['weekday'] = df["visitStartTime"].dt.weekday
    if parametro == "weekofyear": 
        df['weekofyear'] = df["visitStartTime"].dt.weekofyear
    return

# handmade mapping
browser = {'Chrome': 1, 'Safari': 2, 'Firefox': 3, 'Internet Explorer': 4, 'Edge': 5, 
           'Android Webview': 0, 'Safari (in-app)': 0, 'Opera Mini': 0, 'Opera': 0,
           'UC Browser': 0}
category = {'desktop': 1, 'mobile': 2, 'tablet': 3}
operatingSystem = {'Windows': 1, 'Macintosh': 2, 'Android': 3, 'iOS': 4,'Linux': 5,
    'Chrome OS': 6, '(not set)': 0, 'Windows Phone': 0, 'BlackBerry': 0, 'Samsung': 0}

In [3]:
df = pd.read_csv('../data/train_v2_cleaned.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

  mask |= (ar1 == a)


In [4]:
df.head()

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions,prevPurchases
0,Direct,423043652415339154,3.0,2016-08-01 07:00:12,Safari,mobile,iOS,not available in demo dataset,United States,United States,Northern America,,,(not set),True,(none),(direct),,0.0,,0.0
1,Social,8294721032567046680,1.0,2016-08-01 07:04:26,Chrome,desktop,Windows,not available in demo dataset,Thailand,Thailand,Southeast Asia,,,(not set),,referral,youtube.com,,0.0,,0.0
2,Organic Search,7718623669497357235,1.0,2016-08-01 07:04:41,Amazon Silk,tablet,Android,not available in demo dataset,United States,United States,Northern America,,,(not set),,organic,google,40.0,0.0,,0.0
3,Organic Search,4798058133221713505,1.0,2016-08-01 07:06:01,Chrome,desktop,Windows,not available in demo dataset,Canada,Canada,Northern America,,,(not set),,organic,google,89.0,0.0,,0.0
4,Direct,5076747981380011349,1.0,2016-08-01 07:06:10,Chrome,desktop,Windows,Quezon City,Philippines,Metro Manila,Southeast Asia,,,(not set),True,(none),(direct),,0.0,,0.0


In [5]:
train_data = df.copy()

train_data["browser"] = train_data["browser"].map(browser)
train_data["deviceCategory"] = train_data["deviceCategory"].map(category)
train_data["operatingSystem"] = train_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    train_data[c] = labelencoder.fit_transform(train_data[c].astype(str))
    labels = train_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(train_data, 'month')
extract_date_info(train_data, 'weekday')
extract_date_info(train_data, 'weekofyear')
extract_date_info(train_data, 'hour')
train_data.drop(['visitStartTime'], axis=1, inplace=True)
train_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)

### Class orientation

In [6]:
train_cero = pd.DataFrame()
train_aux = pd.DataFrame()
train_merge = pd.DataFrame()

train_cero = train_data[train_data.totalTransactionRevenue == 0]
train_aux = train_data[train_data.totalTransactionRevenue > 0]
Q1 = train_aux['totalTransactionRevenue'].quantile(0.25)
Q3 = train_aux['totalTransactionRevenue'].quantile(0.75)
IQR = Q3 - Q1
train_aux = train_aux[train_aux['totalTransactionRevenue'] > (Q3+1.5*IQR)]

train_merge = train_merge.append(train_cero.sample(int(train_aux.shape[0])*7))
train_merge = train_merge.append(train_aux)

In [7]:
X = train_merge.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y = train_merge['totalTransactionRevenue']
y_binary = y.apply(lambda x: x if x == 0 else 1)

In [8]:
train_x = X
train_y = y_binary

In [9]:
clf = RandomForestClassifier(n_estimators=200, max_depth=40,
                            random_state=1, class_weight={0:.1, 1:.9})

In [10]:
clf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight={0: 0.1, 1: 0.9},
            criterion='gini', max_depth=40, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [11]:
featImportance, featCols = (list(t) for t in zip(*sorted(zip(clf.feature_importances_, list(train_x)), reverse=True)))
for i in range(len(featCols)):
    print(featCols[i], ": ", featImportance[i])

country :  0.17469030335614483
visitNumber :  0.13543831679577337
subContinent :  0.07762584884994332
hour :  0.07600344563093751
weekofyear :  0.0753667194124979
deviceCategory :  0.07019233602703523
channelGrouping :  0.06240592570502895
isTrueDirect :  0.054633435184832324
operatingSystem :  0.05014153315350687
weekday :  0.048010519122435584
month :  0.04115905712551576
city :  0.040945804507155736
region :  0.034675462872228044
prevPurchases :  0.031903171176525115
browser :  0.023807872272935217
adContent :  0.003000248807504181


#### Test with test.csv

In [12]:
test = pd.read_csv('../data/test_v2_cleaned.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

In [13]:
test_data = test.copy()

test_data["browser"] = test_data["browser"].map(browser)
test_data["deviceCategory"] = test_data["deviceCategory"].map(category)
test_data["operatingSystem"] = test_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    test_data[c] = labelencoder.fit_transform(test_data[c].astype(str))
    labels = test_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(test_data, 'month')
extract_date_info(test_data, 'weekday')
extract_date_info(test_data, 'weekofyear')
extract_date_info(test_data, 'hour')
test_data.drop(['visitStartTime'], axis=1, inplace=True)
test_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)

In [14]:
X_test = test_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y_test = test_data['totalTransactionRevenue']
y_test = y_test.apply(lambda x: x if x == 0 else 1)

In [15]:
y_pred_test = clf.predict(X_test)

In [16]:
print(mean_squared_error(y_test, y_pred_test))

0.010306557201516974


In [17]:
print(classification_report(y_test, y_pred_test))

             precision    recall  f1-score   support

        0.0       0.99      1.00      0.99    396995
        1.0       0.96      0.10      0.19      4594

avg / total       0.99      0.99      0.99    401589



In [18]:
print(confusion_matrix(y_test, y_pred_test))

[[396976     19]
 [  4120    474]]


### Rebalanced training with classification hint

In [19]:
train_data = df.copy()

train_data["browser"] = train_data["browser"].map(browser)
train_data["deviceCategory"] = train_data["deviceCategory"].map(category)
train_data["operatingSystem"] = train_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    train_data[c] = labelencoder.fit_transform(train_data[c].astype(str))
    labels = train_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(train_data, 'month')
extract_date_info(train_data, 'weekday')
extract_date_info(train_data, 'weekofyear')
extract_date_info(train_data, 'hour')
train_data.drop(['visitStartTime'], axis=1, inplace=True)
train_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)

In [20]:
train_cero = pd.DataFrame()
train_aux = pd.DataFrame()
train_merge = pd.DataFrame()

train_cero = train_data[train_data.totalTransactionRevenue == 0]
train_aux = train_data[train_data.totalTransactionRevenue > 0]

train_merge = train_merge.append(train_cero.sample(int(train_aux.shape[0]/4.5)))
train_merge = train_merge.append(train_aux)

In [21]:
X = train_merge.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
X['classHint'] = clf.predict(X)
y = train_merge['totalTransactionRevenue']
y = np.log1p(y)

In [22]:
train_x = X
train_y = y

In [23]:
regr = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=200)

In [24]:
regr.fit(train_x, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [25]:
featImportance, featCols = (list(t) for t in zip(*sorted(zip(regr.feature_importances_, list(train_x)), reverse=True)))
for i in range(len(featCols)):
    print(featCols[i], ": ", featImportance[i])

country :  0.30472465263572884
classHint :  0.16651183743124037
weekofyear :  0.08914029276269203
hour :  0.08175849678303714
visitNumber :  0.06278250355924408
weekday :  0.05121576242860057
city :  0.0422797476289861
channelGrouping :  0.03828785452738326
region :  0.03194747580028781
deviceCategory :  0.030989036028874407
operatingSystem :  0.027055815332263276
month :  0.026489646430213513
subContinent :  0.013611666206911809
browser :  0.011796285747500287
isTrueDirect :  0.009132818509474396
adContent :  0.0078150830180344
prevPurchases :  0.004461025169527599


#### Test with test.csv

In [26]:
test = pd.read_csv('../data/test_v2_cleaned.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

In [27]:
test_data = test.copy()

test_data["browser"] = test_data["browser"].map(browser)
test_data["deviceCategory"] = test_data["deviceCategory"].map(category)
test_data["operatingSystem"] = test_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    test_data[c] = labelencoder.fit_transform(test_data[c].astype(str))
    labels = test_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(test_data, 'month')
extract_date_info(test_data, 'weekday')
extract_date_info(test_data, 'weekofyear')
extract_date_info(test_data, 'hour')
test_data.drop(['visitStartTime'], axis=1, inplace=True)
test_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)

In [28]:
X_test = test_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
X_test['classHint'] = clf.predict(X_test)
y_test = test_data['totalTransactionRevenue']
y_test = np.log1p(y_test)

In [29]:
y_pred_test = regr.predict(X_test)

In [30]:
print(mean_squared_error(y_test, y_pred_test))

1.5367903265246439


In [31]:
# undo the log to check the real values
y_pred_test = pd.DataFrame(y_pred_test).apply(lambda x: np.expm1(x))
y_test = pd.DataFrame(y_test).apply(lambda x: np.expm1(x))
print(y_pred_test.mean())
print(y_test.mean())

0    2.986849
dtype: float64
totalTransactionRevenue    1.534261
dtype: float64


In [32]:
y_pred_test.median()

0    0.894483
dtype: float64

In [33]:
y_pred_test.describe()

Unnamed: 0,0
count,401589.0
mean,2.986849
std,7.765001
min,0.0
25%,0.318933
50%,0.894483
75%,2.860894
max,797.173588


In [34]:
y_final_res = y_pred_test[0].apply(lambda x: x if x > 1.3 else 0)
y_final_res_purchases = y_final_res[y_final_res>0]

In [35]:
print(y_final_res.mean())
print(y_final_res_purchases.mean())

2.736426977676951
6.440248800280773


In [36]:
# binary evaluation of the results casting to zero all the values smaller that 1.3
y_pred_test_binary = pd.DataFrame(y_final_res)[0].apply(lambda x: x if x == 0 else 1)
y_test_binary = pd.DataFrame(y_test)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

             precision    recall  f1-score   support

        0.0       1.00      0.58      0.73    396995
        1.0       0.02      0.83      0.04      4594

avg / total       0.99      0.58      0.73    401589



In [37]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

[[230173 166822]
 [   783   3811]]


In [38]:
# binary evaluation of the results
y_pred_test_binary = pd.DataFrame(y_pred_test)[0].apply(lambda x: x if x == 0 else 1)
y_test_binary = pd.DataFrame(y_test)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

             precision    recall  f1-score   support

        0.0       1.00      0.04      0.08    396995
        1.0       0.01      1.00      0.02      4594

avg / total       0.99      0.05      0.08    401589



In [39]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

[[ 17279 379716]
 [     0   4594]]


In [40]:
# (TP+FP)*2 - TP*142