## Imports and cleaning

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

pd.set_option('display.max_columns', 30)

In [2]:
# Call example:
  # extract_date_info(df, 'month')
def extract_date_info(df, parametro):
    if parametro == "date":
        df["date"] = [d.date() for d in df["visitStartTime"]]
    if parametro == "time":
        df["time"] = [d.time() for d in df["visitStartTime"]]
    if parametro == "hour":
        df["hour"] = [d.hour for d in df["visitStartTime"]]
    if parametro == "day":
        df["day"] = df["visitStartTime"].dt.day
    if parametro == "month":
        df['month'] = df["visitStartTime"].dt.month
    if parametro == "weekday":
        df['weekday'] = df["visitStartTime"].dt.weekday
    if parametro == "weekofyear": 
        df['weekofyear'] = df["visitStartTime"].dt.weekofyear
    return

# handmade mapping
browser = {'Chrome': 1, 'Safari': 2, 'Firefox': 3, 'Internet Explorer': 4, 'Edge': 5, 
           'Android Webview': 0, 'Safari (in-app)': 0, 'Opera Mini': 0, 'Opera': 0,
           'UC Browser': 0}
category = {'desktop': 1, 'mobile': 2, 'tablet': 3}
operatingSystem = {'Windows': 1, 'Macintosh': 2, 'Android': 3, 'iOS': 4,'Linux': 5,
    'Chrome OS': 6, '(not set)': 0, 'Windows Phone': 0, 'BlackBerry': 0, 'Samsung': 0}

In [3]:
df = pd.read_csv('../data/train_v2_cleaned_nVisits.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [4]:
df.head()

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,campaignCode,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions,comprasAnteriores
0,Direct,423043652415339154,3.0,2016-08-01 07:00:12,Safari,mobile,iOS,not available in demo dataset,United States,United States,Northern America,,,(not set),,True,(none),(direct),,0.0,,0.0
1,Social,8294721032567046680,1.0,2016-08-01 07:04:26,Chrome,desktop,Windows,not available in demo dataset,Thailand,Thailand,Southeast Asia,,,(not set),,,referral,youtube.com,,0.0,,0.0
2,Organic Search,7718623669497357235,1.0,2016-08-01 07:04:41,Amazon Silk,tablet,Android,not available in demo dataset,United States,United States,Northern America,,,(not set),,,organic,google,40.0,0.0,,0.0
3,Organic Search,4798058133221713505,1.0,2016-08-01 07:06:01,Chrome,desktop,Windows,not available in demo dataset,Canada,Canada,Northern America,,,(not set),,,organic,google,89.0,0.0,,0.0
4,Direct,5076747981380011349,1.0,2016-08-01 07:06:10,Chrome,desktop,Windows,Quezon City,Philippines,Metro Manila,Southeast Asia,,,(not set),,True,(none),(direct),,0.0,,0.0


In [5]:
train_data = df.copy()

train_data["browser"] = train_data["browser"].map(browser)
train_data["deviceCategory"] = train_data["deviceCategory"].map(category)
train_data["operatingSystem"] = train_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    train_data[c] = labelencoder.fit_transform(train_data[c].astype(str))
    labels = train_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(train_data, 'month')
extract_date_info(train_data, 'weekday')
extract_date_info(train_data, 'weekofyear')
extract_date_info(train_data, 'hour')
train_data.drop(['visitStartTime', 'campaignCode'], axis=1, inplace=True)
train_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)
# train_data.drop(["channelGrouping", 'isTrueDirect', 'medium', "source", "adContent", "adPosition", "campaign"], axis=1, inplace=True)

## Inbalanced training

In [None]:
X = train_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y = train_data['totalTransactionRevenue']
y = np.log1p(y)

In [None]:
# train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=1)
train_x = X
train_y = y

In [None]:
regr = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=200)

In [None]:
regr.fit(train_x, train_y)

In [None]:
featImportance, featCols = (list(t) for t in zip(*sorted(zip(regr.feature_importances_, list(train_x)), reverse=True)))
for i in range(len(featCols)):
    print(featCols[i], ": ", featImportance[i])

In [None]:
# regr.decision_path(train_x)

#### Test with test.csv

In [None]:
test = pd.read_csv('../data/test_v2_cleaned_nVisits.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

In [None]:
test_data = test.copy()

# turn this into "high cardinality categorical"
test_data["browser"] = test_data["browser"].map(browser)
test_data["deviceCategory"] = test_data["deviceCategory"].map(category)
test_data["operatingSystem"] = test_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    test_data[c] = labelencoder.fit_transform(test_data[c].astype(str))
    labels = test_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(test_data, 'month')
extract_date_info(test_data, 'weekday')
extract_date_info(test_data, 'weekofyear')
extract_date_info(test_data, 'hour')
test_data.drop(['visitStartTime', 'campaignCode'], axis=1, inplace=True)
test_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)
# test_data.drop(["channelGrouping", 'isTrueDirect', 'medium', "source", "adContent", "adPosition", "campaign"], axis=1, inplace=True)

In [None]:
X_test = test_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y_test = test_data['totalTransactionRevenue']
y_test = np.log1p(y_test)

In [None]:
y_pred_test = regr.predict(X_test)

In [None]:
print(mean_squared_error(y_test, y_pred_test))

In [None]:
# undo the log to check the real values
y_pred_test = pd.DataFrame(y_pred_test).apply(lambda x: np.expm1(x))
y_test = pd.DataFrame(y_test).apply(lambda x: np.expm1(x))
print(y_pred_test.mean())
print(y_test.mean())

In [None]:
# y_pred_test.sample(100)
y_final_res = y_pred_test[0].apply(lambda x: x if x > 2 else 0)
y_final_res.mean()
y_final_res_purchases = y_final_res[y_final_res>0]

In [None]:
y_final_res_purchases.mean()

In [None]:
# evaluación binaria de los resultados
y_pred_test_binary = pd.DataFrame(y_final_res)[0].apply(lambda x: x if x == 0 else 1)
y_test_binary = pd.DataFrame(y_test)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

In [None]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

## Rebalanced training

In [403]:
train_cero = pd.DataFrame()
train_aux = pd.DataFrame()
train_merge = pd.DataFrame()

train_cero = train_data[train_data.totalTransactionRevenue == 0]
train_aux = train_data[train_data.totalTransactionRevenue > 0]
Q1 = train_aux['totalTransactionRevenue'].quantile(0.25)
Q3 = train_aux['totalTransactionRevenue'].quantile(0.75)
IQR = Q3 - Q1
# decir que por debajo no hay outliers
train_aux = train_aux[train_aux['totalTransactionRevenue'] < (Q3+1.5*IQR)]

train_merge = train_merge.append(train_cero.sample(int(train_aux.shape[0]/4.5)))
train_merge = train_merge.append(train_aux)
# train_merge = train_merge.sample(frac=1).reset_index(drop=True)

In [404]:
X = train_merge.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y = train_merge['totalTransactionRevenue']
y = np.log1p(y)

In [405]:
# train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=1)
train_x = X
train_y = y

In [406]:
regr = RandomForestRegressor(max_depth=40, random_state=0, n_estimators=200)

In [407]:
regr.fit(train_x, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=40,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [408]:
featImportance, featCols = (list(t) for t in zip(*sorted(zip(regr.feature_importances_, list(train_x)), reverse=True)))
for i in range(len(featCols)):
    print(featCols[i], ": ", featImportance[i])

country :  0.31417489225083484
weekofyear :  0.12022805943772663
hour :  0.113082751387266
visitNumber :  0.0730772094606962
weekday :  0.0723491941131789
city :  0.049856714287688286
channelGrouping :  0.049119071473061486
deviceCategory :  0.04524971666836344
region :  0.038253131189162036
month :  0.03660963327304144
operatingSystem :  0.029750085866018957
browser :  0.017431051608085238
subContinent :  0.014559591884105643
isTrueDirect :  0.012163896996843067
adContent :  0.010256091393978803
comprasAnteriores :  0.0038389087099492602


In [409]:
# regr.decision_path(train_x)

#### Test with test.csv

In [410]:
test = pd.read_csv('../data/test_v2_cleaned_nVisits.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

In [411]:
test_data = test.copy()

# turn this into "high cardinality categorical"
test_data["browser"] = test_data["browser"].map(browser)
test_data["deviceCategory"] = test_data["deviceCategory"].map(category)
test_data["operatingSystem"] = test_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    test_data[c] = labelencoder.fit_transform(test_data[c].astype(str))
    labels = test_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(test_data, 'month')
extract_date_info(test_data, 'weekday')
extract_date_info(test_data, 'weekofyear')
extract_date_info(test_data, 'hour')
test_data.drop(['visitStartTime'], axis=1, inplace=True)
# test_data.drop(["channelGrouping", 'isTrueDirect', 'medium', "source", "adContent", "adPosition", "campaign"], axis=1, inplace=True)
test_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)

In [412]:
X_test = test_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y_test = test_data['totalTransactionRevenue']
# y_test = np.log1p(y_test)

In [413]:
y_pred_test = regr.predict(X_test)

In [414]:
print(mean_squared_error(y_test, y_pred_test))

4439.4494727979545


In [415]:
# undo the log to check the real values
y_pred_test = pd.DataFrame(y_pred_test).apply(lambda x: np.expm1(x))
y_test = pd.DataFrame(y_test).apply(lambda x: np.expm1(x))
print(y_pred_test.mean())
print(y_test.mean())

0    3.489733
dtype: float64
totalTransactionRevenue    inf
dtype: float64


  This is separate from the ipykernel package so we can avoid doing imports until


In [416]:
y_pred_test.median()

0    0.845437
dtype: float64

In [438]:
y_pred_test.describe()

Unnamed: 0,0
count,401589.0
mean,3.489733
std,6.471084
min,0.0
25%,0.194167
50%,0.845437
75%,3.361685
max,74.411992


In [448]:
y_final_res = y_pred_test[0].apply(lambda x: x if x > 1.3 else 0)
y_final_res_purchases = y_final_res[y_final_res>0]

In [449]:
print(y_final_res.mean())
print(y_final_res_purchases.mean())

3.261875191568776
7.918305494779776


In [450]:
# evaluación binaria de los resultados
y_pred_test_binary = pd.DataFrame(y_final_res)[0].apply(lambda x: x if x == 0 else 1)
y_test_binary = pd.DataFrame(y_test)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

             precision    recall  f1-score   support

        0.0       1.00      0.59      0.74    396995
        1.0       0.02      0.80      0.04      4594

avg / total       0.99      0.60      0.74    401589



In [451]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

[[235261 161734]
 [   897   3697]]


In [422]:
# evaluación binaria de los resultados
y_pred_test_binary = pd.DataFrame(y_pred_test)[0].apply(lambda x: x if x == 0 else 1)
y_test_binary = pd.DataFrame(y_test)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

             precision    recall  f1-score   support

        0.0       1.00      0.05      0.10    396995
        1.0       0.01      1.00      0.02      4594

avg / total       0.99      0.06      0.10    401589



In [423]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

[[ 20535 376460]
 [     2   4592]]


In [424]:
# (TP+FP)*2 - TP*142
# (3171+137021)*2 - 3171*142
3171*142/((3171+137021)*2)

1.605947557635243