Datatón BC 2018 - Robocoach (Implementación de Random Forest)
Se parte generando una base de datos en formato SQL con las tablas suministradas. Esta base de datos está alojada en la plataforma Azure de Microsoft y se carga desde Google Colab por medio de un notebook de Python.

La máquina virtual de Google Colab requiere la instalación de los siguientes paquetes para cargar la base de datos SQL

In [1]:
# Install the unixODBC packages on the VM (run as shell commands via `!`).
!apt-get install libqt4-sql-odbc
!apt-get install unixodbc unixodbc-dev --install-suggests
# Register Microsoft's package signing key and its Ubuntu 18.04 apt source list.
!curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
!curl https://packages.microsoft.com/config/ubuntu/18.04/prod.list > /etc/apt/sources.list.d/mssql-release.list
# Refresh the package index so the source list added above is taken into account.
!apt-get update
# Install the msodbcsql17 package; ACCEPT_EULA=Y is passed in the environment of the install command.
!ACCEPT_EULA=Y apt-get install msodbcsql17

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libltdl7 libodbc1 libqt4-sql libqtcore4 qtcore4-l10n
Suggested packages:
  libmyodbc odbc-postgresql tdsodbc unixodbc-bin libqt4-dev libicu55 libthai0
The following NEW packages will be installed:
  libltdl7 libodbc1 libqt4-sql libqt4-sql-odbc libqtcore4 qtcore4-l10n
0 upgraded, 6 newly installed, 0 to remove and 2 not upgraded.
Need to get 2,536 kB of archives.
After this operation, 11.7 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libltdl7 amd64 2.4.6-2 [38.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libodbc1 amd64 2.3.4-1.1ubuntu3 [183 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 qtcore4-l10n all 4:4.8.7+dfsg-7ubuntu1 [617 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libqtcore4 amd64 4:4.8.7+dfsg-7ubuntu1 [1,552 kB]
Get:5 h

In [2]:
 # Install pyodbc, the package imported further down as `pyodbc`.
 !pip install pyodbc

Collecting pyodbc
[?25l  Downloading https://files.pythonhosted.org/packages/aa/70/93a4e75ab19f4e0fa57c4467a8b167674800cf8c8adcf6782c64a0a49a4d/pyodbc-4.0.24.tar.gz (218kB)
[K    100% |████████████████████████████████| 225kB 7.5MB/s 
[?25hBuilding wheels for collected packages: pyodbc
  Running setup.py bdist_wheel for pyodbc ... [?25l- \ | / - \ | / - \ | / - \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/0a/96/4e/358f7804ea32eca5b990a1872ea7b8889433b45b001ab297fd
Successfully built pyodbc
Installing collected packages: pyodbc
Successfully installed pyodbc-4.0.24


In [0]:
import numpy as np
import pandas as pd
import pyodbc
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline


In [0]:
# Open the connection to the SQL Server database through "ODBC Driver 17".
#
# The password is intentionally not stored in the notebook: it is taken from
# the ROBOCOACH_DB_PWD environment variable when set, otherwise the user is
# prompted for it (the prompt does not echo the typed characters).
import os
from getpass import getpass

db_password = os.environ.get("ROBOCOACH_DB_PWD") or getpass("Password for datathon_admin@robocoach: ")

# The password is wrapped in braces (with any '}' doubled) so that characters
# such as ';' inside it cannot break the connection string apart.
# NOTE(review): `encoding='cp1252'` is kept from the original string, but it is
# not a connection-string keyword I can confirm for this driver - verify.
conn = pyodbc.connect(
    "Driver={ODBC Driver 17 for SQL Server};Server=tcp:robocoach.database.windows.net,1433;"
    "Database=datathon_database;Uid=datathon_admin@robocoach;"
    "Pwd={" + db_password.replace("}", "}}") + "};"
    "Encrypt=yes;TrustServerCertificate=yes;Connection Timeout=120;encoding='cp1252'"
)

In [0]:
# Columns pulled from the view, in the order they appear in the SELECT list.
selected_columns = (
    "seg_str",
    "ocupacion",
    "tipo_vivienda",
    "estado_civil",
    "genero",
    "edad",
    "ingreso_rango",
    "fecha",
    "valor_trx",
    "promedio_transaccion_usuario",
    "promedio_anual_transacciones",
    "num_transacciones",
    "num_anual_transacciones",
    "total_transacciones",
    "categoria",
)

# Query text sent to the database.
sql = "SELECT " + ", ".join(selected_columns) + " FROM transacciones_personas_promedios_categorizadas_v"

In [0]:
# This step is slow - not because the connection has died, but because of the
# absurd amount of data (and the cross-validation that has to be run on the data set).
data = pd.read_sql(sql=sql, con=conn)
data.head(n=5)

In [0]:
# Feature matrix: every column except the target ('categoria') and the client
# identifier ('id_cliente'). A membership test is used because chaining two
# `!=` checks with `or` is true for every column and would leave the target
# inside X. `.copy()` gives X its own data so later column assignments do not
# operate on a slice of `data`.
excluded_columns = ('categoria', 'id_cliente')
X = data[[column for column in data.columns if column not in excluded_columns]].copy()
# Target: the 'categoria' column, kept as a one-column DataFrame.
y = data[[column for column in data.columns if column == 'categoria']]
X.shape

In [0]:
# Names of the columns that are passed to the label-encoding helper.
categorical_feats = [
    'seg_str',
    'ocupacion',
    'tipo_vivienda',
    'estado_civil',
    'genero',
]

In [0]:
def convert_categorical_features(df, keys):
    """Return a copy of ``df`` in which the ``keys`` columns are label-encoded.

    Each listed column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column alone. The input frame is left
    untouched, so re-running the calling cell never changes the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    keys : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``; the ``keys`` columns hold
        the encoded values.

    Raises
    ------
    KeyError
        If a name in ``keys`` is not a column of ``df``.
    """
    encoded = df.copy()
    for key in keys:
        # A fresh encoder per column keeps the columns' label sets independent.
        encoded[key] = LabelEncoder().fit_transform(encoded[key])
    return encoded

In [0]:
def pretty_print_split_results(results_dict, n_iter=None):
    """Plot mean train/test scores with a +/- one standard deviation band.

    Parameters
    ----------
    results_dict : mapping
        Must provide the keys 'mean_test_score', 'std_test_score',
        'mean_train_score' and 'std_train_score', each holding one value per
        iteration (lists or arrays are both accepted).
    n_iter : int, optional
        Number of iterations drawn on the x axis (1..n_iter). Defaults to the
        number of entries in 'mean_test_score'.

    Returns
    -------
    None
        The curves are drawn on a new 9x7 matplotlib figure.

    Raises
    ------
    KeyError
        If one of the four score keys is missing from ``results_dict``.
    """
    if n_iter is None:
        n_iter = len(results_dict['mean_test_score'])
    iter_range = range(1, n_iter + 1)

    fig = plt.figure(figsize=(9, 7))
    ax = fig.add_subplot(111)

    # (key fragment, legend label, colour, marker, line style) for each curve.
    curves = (
        ('test', "Mean Test Score", "b", "s", "--"),
        ('train', "Mean Train Score", "g", "o", "-"),
    )
    for split, label, color, marker, linestyle in curves:
        # Coerce to float arrays so that `+`/`-` are element-wise even when the
        # scores are given as plain Python lists.
        mean_score = np.asarray(results_dict['mean_%s_score' % split], dtype=float)
        std_score = np.asarray(results_dict['std_%s_score' % split], dtype=float)

        ax.plot(
            iter_range, mean_score, color=color,
            marker=marker, linestyle=linestyle, markersize=5,
            label=label
        )
        ax.fill_between(
            iter_range, mean_score + std_score, mean_score - std_score,
            alpha=0.25, color=color
        )

    ax.set_xlabel("Iterations")
    ax.set_ylabel("Scores")
    ax.set_ylim(auto=True)
    ax.grid(linestyle="--")
    ax.legend(loc="lower right")
  

In [0]:
# Label-encode the columns listed in `categorical_feats`.
X = convert_categorical_features(df=X, keys=categorical_feats)

In [0]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [0]:
# Grid of values to search; the `classifier__` prefix routes each parameter
# to the pipeline step named 'classifier'.
param_space = dict(
    classifier__n_estimators=[1, 10, 20, 50, 100],
    classifier__max_depth=[1, 2, 5, 10, 20],
)

# Two-step pipeline: median imputation followed by a random forest.
# Each step must be an estimator *instance*; passing the bare
# RandomForestClassifier class leaves the pipeline unable to be fitted or to
# receive the `classifier__*` grid parameters. The forest is seeded with the
# same value as the train/test split so repeated runs give the same model.
# NOTE(review): `Imputer` was removed from scikit-learn in 0.22 in favour of
# `sklearn.impute.SimpleImputer`; migrate the import and this step together
# when upgrading.
classifier = model = Pipeline([
    ('imp', Imputer(missing_values='NaN', strategy='median')),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [0]:
# Exhaustive search over `param_space` with 5-fold cross-validation; train
# scores are kept so that train and test curves can be compared afterwards.
# NOTE(review): scoring='f1' is the binary F1 score - confirm that 'categoria'
# has exactly two classes, otherwise an averaged variant is needed.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=param_space,
    scoring='f1',
    cv=5,
    return_train_score=True,
)

In [0]:
# y_train is a one-column DataFrame; flatten it to 1-D so the classifier does
# not emit a column-vector conversion warning on every fit of the search.
grid_search.fit(X_train, np.ravel(y_train))

In [0]:
# Mean cross-validated score (scoring='f1') of the best parameter combination.
grid_search.best_score_

In [0]:
# Parameter combination from `param_space` that achieved the best score.
grid_search.best_params_

In [0]:
# The pipeline refitted with the best parameters found by the search;
# predicting through it is what `grid_search.predict` delegates to.
best_model = grid_search.best_estimator_

# Predictions on both partitions, for comparing train and test performance.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)