**Nota importante**: este Notebook contiene una versión no balanceada y otra que sí. La primera está comentada, ya que no es nuestro regresor final y tarda mucho en ejecurse (entrena con todos los datos)

### Funciones y librerías usadas en el Notebook

In [43]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor

pd.set_option('display.max_columns', 30)

In [2]:
# Call example:
  # extract_date_info(df, 'month')
def extract_date_info(df, parametro):
    if parametro == "date":
        df["date"] = [d.date() for d in df["visitStartTime"]]
    if parametro == "time":
        df["time"] = [d.time() for d in df["visitStartTime"]]
    if parametro == "hour":
        df["hour"] = [d.hour for d in df["visitStartTime"]]
    if parametro == "day":
        df["day"] = df["visitStartTime"].dt.day
    if parametro == "month":
        df['month'] = df["visitStartTime"].dt.month
    if parametro == "weekday":
        df['weekday'] = df["visitStartTime"].dt.weekday
    if parametro == "weekofyear": 
        df['weekofyear'] = df["visitStartTime"].dt.weekofyear
    return

# handmade mapping
browser = {'Chrome': 1, 'Safari': 2, 'Firefox': 3, 'Internet Explorer': 4, 'Edge': 5, 
           'Android Webview': 0, 'Safari (in-app)': 0, 'Opera Mini': 0, 'Opera': 0,
           'UC Browser': 0}
category = {'desktop': 1, 'mobile': 2, 'tablet': 3}
operatingSystem = {'Windows': 1, 'Macintosh': 2, 'Android': 3, 'iOS': 4,'Linux': 5,
    'Chrome OS': 6, '(not set)': 0, 'Windows Phone': 0, 'BlackBerry': 0, 'Samsung': 0}

In [3]:
df = pd.read_csv('../data/train_v2_cleaned.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

  mask |= (ar1 == a)


In [4]:
df.head()

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions,prevPurchases
0,Direct,423043652415339154,3.0,2016-08-01 07:00:12,Safari,mobile,iOS,not available in demo dataset,United States,United States,Northern America,,,(not set),True,(none),(direct),,0.0,,0.0
1,Social,8294721032567046680,1.0,2016-08-01 07:04:26,Chrome,desktop,Windows,not available in demo dataset,Thailand,Thailand,Southeast Asia,,,(not set),,referral,youtube.com,,0.0,,0.0
2,Organic Search,7718623669497357235,1.0,2016-08-01 07:04:41,Amazon Silk,tablet,Android,not available in demo dataset,United States,United States,Northern America,,,(not set),,organic,google,40.0,0.0,,0.0
3,Organic Search,4798058133221713505,1.0,2016-08-01 07:06:01,Chrome,desktop,Windows,not available in demo dataset,Canada,Canada,Northern America,,,(not set),,organic,google,89.0,0.0,,0.0
4,Direct,5076747981380011349,1.0,2016-08-01 07:06:10,Chrome,desktop,Windows,Quezon City,Philippines,Metro Manila,Southeast Asia,,,(not set),True,(none),(direct),,0.0,,0.0


In [5]:
train_data = df.copy()

train_data["browser"] = train_data["browser"].map(browser)
train_data["deviceCategory"] = train_data["deviceCategory"].map(category)
train_data["operatingSystem"] = train_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    train_data[c] = labelencoder.fit_transform(train_data[c].astype(str))
    labels = train_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(train_data, 'month')
extract_date_info(train_data, 'weekday')
extract_date_info(train_data, 'weekofyear')
extract_date_info(train_data, 'hour')
train_data.drop(['visitStartTime'], axis=1, inplace=True)
train_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)

### Inbalanced training

In [6]:
# X = train_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
# y = train_data['totalTransactionRevenue']
# y = np.log1p(y)

In [7]:
# train_x = X
# train_y = y

In [8]:
# regr = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=200)

In [9]:
# regr.fit(train_x, train_y)

In [10]:
# featImportance, featCols = (list(t) for t in zip(*sorted(zip(regr.feature_importances_, list(train_x)), reverse=True)))
# for i in range(len(featCols)):
#     print(featCols[i], ": ", featImportance[i])

#### Test with test.csv

In [11]:
# test = pd.read_csv('../data/test_v2_cleaned_nVisits.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

In [12]:
# test_data = test.copy()

# test_data["browser"] = test_data["browser"].map(browser)
# test_data["deviceCategory"] = test_data["deviceCategory"].map(category)
# test_data["operatingSystem"] = test_data["operatingSystem"].map(operatingSystem)

# cat_cols = ["channelGrouping", "city", "browser", "country", "region",
#             "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
#             "source", "adContent", "adPosition", "campaign", "operatingSystem"]

# labelencoder = preprocessing.LabelEncoder()
# for c in cat_cols:
#     test_data[c] = labelencoder.fit_transform(test_data[c].astype(str))
#     labels = test_data[c]
#     labelencoder.fit(labels)
#     labels = labelencoder.transform(labels)
    
# extract_date_info(test_data, 'month')
# extract_date_info(test_data, 'weekday')
# extract_date_info(test_data, 'weekofyear')
# extract_date_info(test_data, 'hour')
# test_data.drop(['visitStartTime'], axis=1, inplace=True)
# test_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)

In [13]:
# X_test = test_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
# y_test = test_data['totalTransactionRevenue']
# y_test = np.log1p(y_test)

In [14]:
# y_pred_test = regr.predict(X_test)

In [15]:
# print(mean_squared_error(y_test, y_pred_test))

In [16]:
# # undo the log to check the real values
# y_pred_test = pd.DataFrame(y_pred_test).apply(lambda x: np.expm1(x))
# y_test = pd.DataFrame(y_test).apply(lambda x: np.expm1(x))
# print(y_pred_test.mean())
# print(y_test.mean())

In [17]:
# # y_pred_test.sample(100)
# y_final_res = y_pred_test[0].apply(lambda x: x if x > 2 else 0)
# y_final_res.mean()
# y_final_res_purchases = y_final_res[y_final_res>0]

In [18]:
# y_final_res_purchases.mean()

In [19]:
# # evaluación binaria de los resultados
# y_pred_test_binary = pd.DataFrame(y_final_res)[0].apply(lambda x: x if x == 0 else 1)
# y_test_binary = pd.DataFrame(y_test)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
# print(classification_report(y_test_binary, y_pred_test_binary))

In [20]:
# print(confusion_matrix(y_test_binary, y_pred_test_binary))

### Rebalanced training

In [21]:
train_cero = pd.DataFrame()
train_aux = pd.DataFrame()
train_merge = pd.DataFrame()

train_cero = train_data[train_data.totalTransactionRevenue == 0]
train_aux = train_data[train_data.totalTransactionRevenue > 0]
# Q1 = train_aux['totalTransactionRevenue'].quantile(0.25)
# Q3 = train_aux['totalTransactionRevenue'].quantile(0.75)
# IQR = Q3 - Q1
# train_aux = train_aux[train_aux['totalTransactionRevenue'] < (Q3+1.5*IQR)]

train_merge = train_merge.append(train_cero.sample(int(train_aux.shape[0]/4.5)))
train_merge = train_merge.append(train_aux)

In [22]:
X = train_merge.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y = train_merge['totalTransactionRevenue']
y = np.log1p(y)

In [23]:
train_x = X
train_y = y

In [24]:
regr = RandomForestRegressor(max_depth=40, random_state=0, n_estimators=200)

In [25]:
regr.fit(train_x, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=40,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [26]:
featImportance, featCols = (list(t) for t in zip(*sorted(zip(regr.feature_importances_, list(train_x)), reverse=True)))
for i in range(len(featCols)):
    print(featCols[i], ": ", featImportance[i])

country :  0.26471054726585436
weekofyear :  0.12456750322342465
hour :  0.11130700993711314
visitNumber :  0.09335534781890763
weekday :  0.07321849995627526
channelGrouping :  0.056132615774948984
deviceCategory :  0.053161251444129254
city :  0.05314005582865515
region :  0.038500934699167245
month :  0.037677335920354126
operatingSystem :  0.032597978996562124
browser :  0.016881806894894942
prevPurchases :  0.0138342658276866
subContinent :  0.012832398357295483
isTrueDirect :  0.010760484549230063
adContent :  0.007321963505500986


#### Test with test.csv

In [27]:
test = pd.read_csv('../data/test_v2_cleaned.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

In [28]:
test_data = test.copy()

# turn this into "high cardinality categorical"
test_data["browser"] = test_data["browser"].map(browser)
test_data["deviceCategory"] = test_data["deviceCategory"].map(category)
test_data["operatingSystem"] = test_data["operatingSystem"].map(operatingSystem)

cat_cols = ["channelGrouping", "city", "browser", "country", "region",
            "subContinent", 'isTrueDirect', 'medium', "deviceCategory",
            "source", "adContent", "adPosition", "campaign", "operatingSystem"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    test_data[c] = labelencoder.fit_transform(test_data[c].astype(str))
    labels = test_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(test_data, 'month')
extract_date_info(test_data, 'weekday')
extract_date_info(test_data, 'weekofyear')
extract_date_info(test_data, 'hour')
test_data.drop(['visitStartTime'], axis=1, inplace=True)
test_data.drop(['medium', "source", "adPosition", "campaign"], axis=1, inplace=True)

In [29]:
X_test = test_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y_test = test_data['totalTransactionRevenue']
y_test = np.log1p(y_test)

In [30]:
y_pred_test = regr.predict(X_test)

In [31]:
print(mean_squared_error(y_test, y_pred_test))

2.6204511130776713


In [32]:
# undo the log to check the real values
y_pred_test = pd.DataFrame(y_pred_test).apply(lambda x: np.expm1(x))
y_test = pd.DataFrame(y_test).apply(lambda x: np.expm1(x))
print(y_pred_test.mean())
print(y_test.mean())

0    7.167771
dtype: float64
totalTransactionRevenue    1.534261
dtype: float64


In [33]:
print(y_pred_test.median())
y_pred_test.describe()

0    1.298727
dtype: float64


Unnamed: 0,0
count,401589.0
mean,7.167771
std,16.622969
min,0.0
25%,0.393892
50%,1.298727
75%,3.524955
max,309.578906


In [34]:
y_final_res = y_pred_test[0].apply(lambda x: x if x > 1.3 else 0)
y_final_res_purchases = y_final_res[y_final_res > 0]

In [35]:
y_final_res.sample(15)

309851     4.280262
228756    73.348605
156737     0.000000
167148     2.618932
157107     0.000000
9387       0.000000
334807     0.000000
295290    65.069683
146013     1.313256
70584      0.000000
238243     3.966950
195899     0.000000
203059     5.510222
19161      2.163953
158247     1.696512
Name: 0, dtype: float64

In [36]:
print(y_final_res.mean())
print(y_final_res_purchases.mean())

6.924975841377384
13.857232450024176


In [37]:
# binary evaluation of the results casting to zero all the values smaller that 1.3
y_pred_test_binary = pd.DataFrame(y_final_res)[0].apply(lambda x: x if x == 0 else 1)
y_test_binary = pd.DataFrame(y_test)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

             precision    recall  f1-score   support

        0.0       1.00      0.50      0.67    396995
        1.0       0.02      0.88      0.04      4594

avg / total       0.99      0.51      0.66    401589



In [38]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

[[200330 196665]
 [   570   4024]]


In [44]:
roc_auc_score(y_test_binary, y_pred_test_binary)

0.5215279789418003

In [40]:
# binary evaluation of the results
y_pred_test_binary = pd.DataFrame(y_pred_test)[0].apply(lambda x: x if x == 0 else 1)
y_test_binary = pd.DataFrame(y_test)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

             precision    recall  f1-score   support

        0.0       1.00      0.04      0.08    396995
        1.0       0.01      1.00      0.02      4594

avg / total       0.99      0.05      0.08    401589



In [41]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

[[ 17093 379902]
 [     0   4594]]


In [42]:
# (TP+FP)*2 - TP*142