### Funciones y librerías usadas en el Notebook

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns', 30)

In [2]:
# Call example:
  # extract_date_info(df, 'month')
def extract_date_info(df, parametro):
    if parametro == "date":
        df["date"] = [d.date() for d in df["visitStartTime"]]
    if parametro == "time":
        df["time"] = [d.time() for d in df["visitStartTime"]]
    if parametro == "day":
        df["day"] = df["visitStartTime"].dt.day
    if parametro == "month":
        df['month'] = df["visitStartTime"].dt.month
    if parametro == "weekday":
        df['weekday'] = df["visitStartTime"].dt.weekday
    if parametro == "weekofyear": 
        df['weekofyear'] = df["visitStartTime"].dt.weekofyear

In [3]:
df = pd.read_csv('../data/train_v2_cleaned.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

  mask |= (ar1 == a)


In [4]:
df.head()

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions,prevPurchases
0,Direct,423043652415339154,3.0,2016-08-01 07:00:12,Safari,mobile,iOS,not available in demo dataset,United States,United States,Northern America,,,(not set),True,(none),(direct),,0.0,,0.0
1,Social,8294721032567046680,1.0,2016-08-01 07:04:26,Chrome,desktop,Windows,not available in demo dataset,Thailand,Thailand,Southeast Asia,,,(not set),,referral,youtube.com,,0.0,,0.0
2,Organic Search,7718623669497357235,1.0,2016-08-01 07:04:41,Amazon Silk,tablet,Android,not available in demo dataset,United States,United States,Northern America,,,(not set),,organic,google,40.0,0.0,,0.0
3,Organic Search,4798058133221713505,1.0,2016-08-01 07:06:01,Chrome,desktop,Windows,not available in demo dataset,Canada,Canada,Northern America,,,(not set),,organic,google,89.0,0.0,,0.0
4,Direct,5076747981380011349,1.0,2016-08-01 07:06:10,Chrome,desktop,Windows,Quezon City,Philippines,Metro Manila,Southeast Asia,,,(not set),True,(none),(direct),,0.0,,0.0


In [5]:
df.dtypes

channelGrouping                    object
fullVisitorId                      object
visitNumber                       float64
visitStartTime             datetime64[ns]
browser                            object
deviceCategory                     object
operatingSystem                    object
city                               object
country                            object
region                             object
subContinent                       object
adContent                          object
adPosition                         object
campaign                           object
isTrueDirect                       object
medium                             object
source                             object
timeOnSite                        float64
totalTransactionRevenue           float64
transactions                      float64
prevPurchases                     float64
dtype: object

In [6]:
train_data = df.copy()
cat_cols = ["channelGrouping", "browser", "deviceCategory",
            "operatingSystem", "city", "country", "region",
            "subContinent", 'isTrueDirect', 'medium',
            "source", "adContent", "adPosition", "campaign"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    train_data[c] = labelencoder.fit_transform(train_data[c].astype(str))
    labels = train_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(train_data, 'month')
extract_date_info(train_data, 'weekday')
extract_date_info(train_data, 'weekofyear')
train_data.drop(['visitStartTime'], axis=1, inplace=True)

In [7]:
X = train_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions'], axis=1)
y = train_data['totalTransactionRevenue']

#### Clasificación binaria
En este caso se fuerza a un balanceo de los datos:
    - Todos los datos de compras con incluídos
    - Se incluyen tantas no compras como compras/2

La proporción de compras y no compras es, por tanto, 66% y 33% reespectivamente. Este planteamiento presentó mejores resultados que hacer _cost sentitive learning_ desde el clasificador.

In [8]:
train_cero = pd.DataFrame()
train_aux = pd.DataFrame()
train_merge = pd.DataFrame()

train_cero = train_data[train_data.totalTransactionRevenue == 0]
train_aux = train_data[train_data.totalTransactionRevenue != 0]

train_merge = train_merge.append(train_cero.sample(int(train_aux.shape[0]/2)))
train_merge = train_merge.append(train_aux)

In [9]:
X = train_merge.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y = train_merge['totalTransactionRevenue']
y_binary = y.apply(lambda x: x if x == 0 else 1)

In [10]:
train_x = X
train_y = y_binary

In [11]:
clf = RandomForestClassifier(n_estimators=200, max_depth=40,
                            random_state=1)
# class_weight = 'balanced' or {0:.3, 1:.7}

In [12]:
clf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [13]:
featImportance, featCols = (list(t) for t in zip(*sorted(zip(clf.feature_importances_, list(train_x)), reverse=True)))
for i in range(len(featCols)):
    print(featCols[i], ": ", featImportance[i])

country :  0.18476730223122764
weekofyear :  0.11957611076047389
subContinent :  0.11304337651534699
weekday :  0.08025933792675781
visitNumber :  0.07281263831083913
source :  0.06285709959400175
city :  0.05369019747732076
month :  0.05087404363736084
operatingSystem :  0.04506252810033068
channelGrouping :  0.04087876761898689
region :  0.03949812901416359
medium :  0.03697432322715898
deviceCategory :  0.03552276495078619
browser :  0.022499535730884514
isTrueDirect :  0.015108948952337579
prevPurchases :  0.010777630678963954
adPosition :  0.007369013115752363
adContent :  0.004388370576413139
campaign :  0.0040398815808933225


#### Testing with test.csv

In [14]:
test = pd.read_csv('../data/test_v2_cleaned.csv', index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

In [15]:
test_data = test.copy()
cat_cols = ["channelGrouping", "browser", "deviceCategory",
            "operatingSystem", "city", "country", "region",
            "subContinent", 'isTrueDirect', 'medium',
            "source", "adContent", "adPosition", "campaign"]

labelencoder = preprocessing.LabelEncoder()
for c in cat_cols:
    test_data[c] = labelencoder.fit_transform(test_data[c].astype(str))
    labels = test_data[c]
    labelencoder.fit(labels)
    labels = labelencoder.transform(labels)
    
extract_date_info(test_data, 'month')
extract_date_info(test_data, 'weekday')
extract_date_info(test_data, 'weekofyear')
test_data.drop('visitStartTime', axis=1, inplace=True)

In [16]:
X_test = test_data.drop(['totalTransactionRevenue', 'timeOnSite', 'transactions', 'fullVisitorId'], axis=1)
y_test = test_data['totalTransactionRevenue']
y_test_binary = y_test.apply(lambda x: x if x == 0 else 1)

In [17]:
print(X_test.shape)
print(y_test.shape)
print(y_test_binary.shape)

(401589, 19)
(401589,)
(401589,)


In [18]:
y_pred_test = clf.predict(X_test)

In [19]:
print(classification_report(y_test_binary, y_pred_test))

             precision    recall  f1-score   support

        0.0       0.99      0.90      0.95    396995
        1.0       0.06      0.51      0.10      4594

avg / total       0.98      0.90      0.94    401589



In [20]:
print(confusion_matrix(y_test_binary, y_pred_test))

[[357731  39264]
 [  2246   2348]]


In [21]:
# porcentaje de usuarios que compran según nuestra predicción
unique, counts = np.unique(np.array(y_pred_test), return_counts=True)
print(unique, counts)
print(counts[1]/y_pred_test.shape[0])

[0. 1.] [359977  41612]
0.1036183760013347
