### Funciones y librerías usadas en el Notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
import warnings
import sys

from datetime import date
from sklearn.metrics  import  classification_report 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #Ignora los warning

if not sys.warnoptions:
    warnings.simplefilter("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
#Funcion que servira para extraer todos los valores que existen en "visitStartTime"
def extract_date_info(df, parametro):
    #Obtenemos la fecha
    if parametro == "date":
        df["date"] = [d.date() for d in df["visitStartTime"]]
    #Obtenemos el tiempo(hora, minutos, segundos)
    if parametro == "time":
        df["time"] = [d.time() for d in df["visitStartTime"]]
    #Obtenemos el dia
    if parametro == "day":
        df["day"] = df["visitStartTime"].dt.day
    #Obtenemos el mes
    if parametro == "month":
        df['month'] = df["visitStartTime"].dt.month
    #Obtenemos el numero que es un dia en la semana, los valores van de 0 a 6
    if parametro == "weekday":
        df['weekday'] = df["visitStartTime"].dt.weekday
    #Obtenemos el numero de la semana en el año
    if parametro == "weekofyear": 
        df['weekofyear'] = df["visitStartTime"].dt.weekofyear
    #Obtenemos la hora
    if parametro == "hour":
        df['hour'] = df['visitStartTime'].dt.hour

In [3]:
train = pd.read_csv("../data/train_v2_cleaned.csv", sep=",", index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'});
test = pd.read_csv("../data/test_v2_cleaned.csv", sep=",", index_col=0, parse_dates=['visitStartTime'], dtype={'fullVisitorId': 'str'})

In [4]:
#Guardamos los valores de "totalTransactionRevenue", los cuales serán utilizados para entrenar el modelo y encontrar
#el error cuadratico medio
y_train = train["totalTransactionRevenue"].values
y_test = test["totalTransactionRevenue"].values
y_test_aux = test["totalTransactionRevenue"]

#Guardamos el Id de test que nos servirá para poder generar el fichero final con las predicciones 
test_id = test["fullVisitorId"].values

In [5]:
#Utilizamos la funcion creada antes para extrar solo el mes, tanto en el train como en el test
for data in (train, test):
    extract_date_info(data,'month')

In [6]:
cat_cols = ["channelGrouping","browser","deviceCategory","operatingSystem","city","country","region","subContinent",
            "adContent","adPosition","campaign","isTrueDirect","medium","source"]

In [7]:
#Usaremos labelencoder para convertir las etiquetas en valores numericos, pensamos utilizar hot enconder, pero por la 
#cantidad de datos, no era viable ese metodo.
labelencoder = preprocessing.LabelEncoder()
for data in (train, test):
    for c in cat_cols:
        data[c] = labelencoder.fit_transform(data[c].astype(str))
        labels = data[c]
        labelencoder.fit(labels)
        labels = labelencoder.transform(labels)

In [9]:
#Eliminamos las columnas que hacen "trampa" para la prediccion tanto en el test como en el train
train.drop(columns=['timeOnSite','transactions','fullVisitorId','visitStartTime'], axis=1, inplace=True)
test.drop(columns=['timeOnSite','transactions','fullVisitorId','visitStartTime','totalTransactionRevenue'], axis=1, inplace=True)

In [10]:
#Usaremos "train_test_split" para poder encontrar los valores de entrenamiento y validacion para poder entrenar el modelo
train_lgb = train[:train.shape[0]]
x = train_lgb.drop(columns=['totalTransactionRevenue'], axis=1)
y = np.log1p(train_lgb["totalTransactionRevenue"].astype(float))
train_x, valid_x, train_y, valid_y = train_test_split(x,y, test_size=0.30, random_state=1)

### LGBM

In [11]:
lgb_params = {"objective" : "regression", 
              "metric" : "rmse", 
              "num_leaves" : 100,
              "min_child_samples" : 100,
              "learning_rate" : 0.05,
              "bagging_fraction" : 0.8,
              "feature_fraction" : 0.6,
              "bagging_frequency" : 7,
              "bagging_seed" : 3,
              "verbosity" : -1}
lgb_train = lgb.Dataset(train_x, label=train_y)
lgb_val = lgb.Dataset(valid_x, label=valid_y)
model = lgb.train(lgb_params, lgb_train, 1000, valid_sets=[lgb_train, lgb_val], early_stopping_rounds=500, verbose_eval=100)

Training until validation scores don't improve for 500 rounds.
[100]	training's rmse: 0.397309	valid_1's rmse: 0.405339
[200]	training's rmse: 0.396206	valid_1's rmse: 0.404909
[300]	training's rmse: 0.395621	valid_1's rmse: 0.404904
[400]	training's rmse: 0.395194	valid_1's rmse: 0.404908
[500]	training's rmse: 0.394842	valid_1's rmse: 0.40494
[600]	training's rmse: 0.394481	valid_1's rmse: 0.404989
[700]	training's rmse: 0.394189	valid_1's rmse: 0.405025
Early stopping, best iteration is:
[253]	training's rmse: 0.395874	valid_1's rmse: 0.404883


In [12]:
#Prediccion del test
prediccion_gasto = np.zeros(len(test))
prediccion_gasto = model.predict(test, num_iteration=model.best_iteration)
prediccion  = pd.DataFrame()
prediccion["fullVisitorId"] = test_id
prediccion["PredictedLogRevenue"] = np.expm1(prediccion_gasto)
prediccion["PredictedLogRevenue"] = prediccion["PredictedLogRevenue"].apply(lambda x : 0.0 if x < 0 else x)

#Agruparlos por "fullVisitorId" y sumar sus "PredictedLogRevenue" para luego aplicar el logaritmo a la suma.
submission = prediccion.groupby("fullVisitorId").agg({"PredictedLogRevenue" : "sum"}).reset_index()
submission["PredictedLogRevenue"] = np.log1p(submission["PredictedLogRevenue"])
submission["PredictedLogRevenue"] =  submission["PredictedLogRevenue"].apply(lambda x : 0.0 if x < 0 else x)
submission["PredictedLogRevenue"] = submission["PredictedLogRevenue"].fillna(0.0)

In [22]:
print(mean_squared_error(y_test, prediccion['PredictedLogRevenue']))

4377.197705630635


### Analisis de Resultados

#### Consideramos "compras" solo a aquellas cuya prediccion es diferente a 0

In [13]:
# Evaluación binaria de los resultados
y_pred_test = prediccion["PredictedLogRevenue"].values
y_pred_test_binary = pd.DataFrame(y_pred_test)[0].apply(lambda x: x if x == 0 else 1)
y_test_binary = pd.DataFrame(y_test_aux)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

             precision    recall  f1-score   support

        0.0       1.00      0.19      0.32    396995
        1.0       0.01      0.96      0.03      4594

avg / total       0.99      0.20      0.31    401589



In [14]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

[[ 74474 322521]
 [   188   4406]]


Sin embargo estos resultados no son logicos, ya que aquellas prediccion que marca como compra ("1") son valores que rondan el 0, como por ejemplo 0.001

#### Consideramos "compras" solo a aquellas cuya prediccion es mayor a 1

In [16]:
# Evaluación binaria de los resultados
y_pred_test = prediccion["PredictedLogRevenue"].values
y_pred_test_binary = pd.DataFrame(y_pred_test)[0].apply(lambda x: 0 if x < 1 else 1)
y_test_binary = pd.DataFrame(y_test_aux)['totalTransactionRevenue'].apply(lambda x: x if x == 0 else 1)
print(classification_report(y_test_binary, y_pred_test_binary))

             precision    recall  f1-score   support

        0.0       0.99      1.00      0.99    396995
        1.0       1.00      0.11      0.19      4594

avg / total       0.99      0.99      0.99    401589



In [17]:
print(confusion_matrix(y_test_binary, y_pred_test_binary))

[[396995      0]
 [  4111    483]]


In [19]:
prediccion[prediccion["PredictedLogRevenue"] > 1].sample(20)

Unnamed: 0,fullVisitorId,PredictedLogRevenue
84596,392344150119056175,66.760237
220182,6316136652184945229,103.503385
74852,4828622678500284421,58.415245
307139,9752762228732349732,60.76975
382211,438250177572890948,66.726944
95282,9249130983722228273,115.155428
354664,5080412442132595375,71.373722
74903,2652382083854828728,103.631725
309450,6643124935587485700,123.335658
242703,10364132187346780,125.943505


Como se puede apreciar el modelo predice valores que rondan el 0, como por ejemplo 0.001, esto quiere decir
que nadie compra.

Gracias a este analisis nos dimos cuenta que la orientacion que habiamos tomado para la prediccion no era la correcta.