In [32]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import keras
import h5py
import PIL
import seaborn as sns
import plotly
import sklearn_pandas
from IPython.display import Image
from IPython.core.display import HTML 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

In [2]:
data = pd.read_csv('..//TP2//Seasons_Stats.csv')

In [3]:
columns = ['blanl','blank2', 'Tm']
data = data.drop(columns, axis=1)

In [4]:
data = data.drop(data[data.Year < 1980].index, axis=0)
pd.set_option('display.max_columns', 100)
df = pd.DataFrame(data)

In [5]:
df[['3P%','FT%', '2P%', '3PAr', 'FTr', 'FG%', 'eFG%', 'TS%', 'TOV%']] = df[['3P%','FT%', '2P%', '3PAr', 'FTr', 'FG%', 'eFG%', 'TS%', 'TOV%']].fillna(0)
df = df.drop_duplicates(subset=['Player','Year'])
df = df.rename(columns = {'Unnamed: 0' : 'id'})
df.insert(18, 'morePoints', 'No', allow_duplicates = True)

In [6]:
def funcionSetMorePoints(id):
    auxiliar = df.loc[id]
    temporadaSiguiente = df.loc[(df.Player == auxiliar.Player) & (df.Year == auxiliar.Year  + 1)]      
    if ((not temporadaSiguiente.empty) and (auxiliar.PTS <= temporadaSiguiente.PTS.item())):
        df.loc[df.id == id,'morePoints'] = 'Yes'
    else:
        if temporadaSiguiente.empty:
            df.loc[df.id == id,'morePoints'] = 'None'
    return id

In [7]:
df.id = df.id.apply(funcionSetMorePoints)

In [8]:
df = df[df['morePoints'] != 'None']
df['GS'] = df['GS'].fillna(32.033531)
df = df.dropna()

In [9]:
df['Pos'] = df['Pos'].str.replace('-',' ')
columns = ['id','Player','Year']
df = df.drop(columns, axis=1)

# TP N° 2 - Experimentación

## Métrica a utilizar

**Accuracy**<br>
Accuracy nos indica el porcentaje de casos que se acertaron. <br>
Los problemas que pueden aparecer al utilizar esta métrica no aplicarían para este modelo, ya que: <br>
•	Como pudimos apreciar en el análisis exploratorio de datos, no poseemos un gran desbalanceo en los datos. <br>
•	El resultado de la predicción no tiene diferente impacto (tiene igual impacto errar por Si o por No). Creemos que el costo de obtener falsos positivos no es alto. <br>
Por esta razón consideramos que la métrica Accuracy es la que más se adapta a nuestro caso de estudio.


## Técnica de feature engineering

Podría ser Binning (para achicar los decimales). Data pasada creemos que no xq tendríamos que tener los registros de cada jugador al momento de predecir y no los vamos a tener

In [10]:
df_binning = df.round(1)
display(df)
display(df_binning)

Unnamed: 0,Pos,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,morePoints,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
5727,C,32.0,82.0,32.033531,3143.0,25.3,0.639,0.001,0.344,7.2,22.2,15.4,16.5,1.2,4.6,Yes,15.7,24.1,9.5,5.3,14.8,0.227,4.0,2.7,6.7,6.8,835.0,1383.0,0.604,0.0,1.0,0.000,835.0,1382.0,0.604,0.604,364.0,476.0,0.765,190.0,696.0,886.0,371.0,81.0,280.0,297.0,216.0,2034.0
5728,PF,25.0,67.0,32.033531,1222.0,11.0,0.511,0.003,0.258,5.4,12.0,8.6,9.3,1.4,0.6,No,9.9,13.3,1.2,0.8,2.0,0.080,-1.5,-0.1,-1.6,0.1,153.0,318.0,0.481,0.0,1.0,0.000,153.0,317.0,0.483,0.481,56.0,82.0,0.683,62.0,129.0,191.0,87.0,35.0,12.0,39.0,118.0,362.0
5729,C,25.0,75.0,32.033531,2168.0,19.2,0.571,0.002,0.270,8.2,22.4,15.4,21.6,2.3,1.4,No,18.2,21.9,3.1,3.9,7.0,0.155,1.6,2.8,4.4,3.5,465.0,875.0,0.531,0.0,2.0,0.000,465.0,873.0,0.533,0.531,188.0,236.0,0.797,158.0,451.0,609.0,322.0,108.0,55.0,218.0,237.0,1118.0
5730,PG,31.0,80.0,80.000000,2864.0,15.3,0.574,0.023,0.548,2.3,5.3,3.8,30.2,1.7,0.2,No,19.7,17.0,5.9,2.9,8.9,0.148,1.1,-1.1,0.0,1.5,383.0,794.0,0.482,4.0,18.0,0.222,379.0,776.0,0.488,0.485,361.0,435.0,0.830,59.0,138.0,197.0,671.0,106.0,10.0,242.0,218.0,1131.0
5731,C,31.0,26.0,32.033531,560.0,7.4,0.524,0.000,0.833,6.0,16.9,11.5,9.0,1.0,1.5,Yes,24.8,7.9,0.1,0.5,0.6,0.053,-2.9,1.5,-1.4,0.1,27.0,60.0,0.450,0.0,0.0,0.000,27.0,60.0,0.450,0.450,32.0,50.0,0.640,29.0,86.0,115.0,40.0,12.0,15.0,27.0,66.0,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24090,PG,23.0,41.0,0.000000,384.0,9.9,0.428,0.272,0.148,1.7,12.5,7.1,26.9,1.9,0.0,No,15.5,24.4,-0.5,0.5,-0.1,-0.009,-3.9,-1.5,-5.4,-0.3,62.0,169.0,0.367,10.0,46.0,0.217,52.0,123.0,0.423,0.396,20.0,25.0,0.800,6.0,44.0,50.0,65.0,15.0,0.0,33.0,30.0,154.0
24091,SG,30.0,54.0,2.000000,1033.0,9.1,0.483,0.637,0.204,1.4,9.0,5.1,5.4,1.1,0.5,Yes,6.9,18.5,0.1,0.0,0.1,0.003,-1.4,-3.5,-4.8,-0.7,126.0,372.0,0.339,77.0,237.0,0.325,49.0,135.0,0.363,0.442,63.0,76.0,0.829,14.0,83.0,97.0,34.0,23.0,7.0,30.0,50.0,392.0
24092,PF,27.0,73.0,73.000000,2407.0,17.5,0.533,0.031,0.169,8.1,23.0,15.4,9.5,2.3,1.2,No,11.6,21.6,1.9,2.2,4.1,0.081,-0.5,0.6,0.1,1.3,495.0,963.0,0.514,7.0,30.0,0.233,488.0,933.0,0.523,0.518,105.0,163.0,0.644,176.0,484.0,660.0,135.0,112.0,37.0,136.0,182.0,1102.0
24093,C,23.0,73.0,60.000000,1774.0,16.1,0.592,0.023,0.531,8.4,19.7,14.0,6.3,1.6,2.9,Yes,11.2,15.4,3.6,2.7,6.3,0.169,-1.0,2.0,1.0,1.3,231.0,437.0,0.529,1.0,10.0,0.100,230.0,427.0,0.539,0.530,175.0,232.0,0.754,138.0,317.0,455.0,71.0,57.0,63.0,68.0,204.0,638.0


Unnamed: 0,Pos,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,morePoints,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
5727,C,32.0,82.0,32.0,3143.0,25.3,0.6,0.0,0.3,7.2,22.2,15.4,16.5,1.2,4.6,Yes,15.7,24.1,9.5,5.3,14.8,0.2,4.0,2.7,6.7,6.8,835.0,1383.0,0.6,0.0,1.0,0.0,835.0,1382.0,0.6,0.6,364.0,476.0,0.8,190.0,696.0,886.0,371.0,81.0,280.0,297.0,216.0,2034.0
5728,PF,25.0,67.0,32.0,1222.0,11.0,0.5,0.0,0.3,5.4,12.0,8.6,9.3,1.4,0.6,No,9.9,13.3,1.2,0.8,2.0,0.1,-1.5,-0.1,-1.6,0.1,153.0,318.0,0.5,0.0,1.0,0.0,153.0,317.0,0.5,0.5,56.0,82.0,0.7,62.0,129.0,191.0,87.0,35.0,12.0,39.0,118.0,362.0
5729,C,25.0,75.0,32.0,2168.0,19.2,0.6,0.0,0.3,8.2,22.4,15.4,21.6,2.3,1.4,No,18.2,21.9,3.1,3.9,7.0,0.2,1.6,2.8,4.4,3.5,465.0,875.0,0.5,0.0,2.0,0.0,465.0,873.0,0.5,0.5,188.0,236.0,0.8,158.0,451.0,609.0,322.0,108.0,55.0,218.0,237.0,1118.0
5730,PG,31.0,80.0,80.0,2864.0,15.3,0.6,0.0,0.5,2.3,5.3,3.8,30.2,1.7,0.2,No,19.7,17.0,5.9,2.9,8.9,0.1,1.1,-1.1,0.0,1.5,383.0,794.0,0.5,4.0,18.0,0.2,379.0,776.0,0.5,0.5,361.0,435.0,0.8,59.0,138.0,197.0,671.0,106.0,10.0,242.0,218.0,1131.0
5731,C,31.0,26.0,32.0,560.0,7.4,0.5,0.0,0.8,6.0,16.9,11.5,9.0,1.0,1.5,Yes,24.8,7.9,0.1,0.5,0.6,0.1,-2.9,1.5,-1.4,0.1,27.0,60.0,0.4,0.0,0.0,0.0,27.0,60.0,0.4,0.4,32.0,50.0,0.6,29.0,86.0,115.0,40.0,12.0,15.0,27.0,66.0,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24090,PG,23.0,41.0,0.0,384.0,9.9,0.4,0.3,0.1,1.7,12.5,7.1,26.9,1.9,0.0,No,15.5,24.4,-0.5,0.5,-0.1,-0.0,-3.9,-1.5,-5.4,-0.3,62.0,169.0,0.4,10.0,46.0,0.2,52.0,123.0,0.4,0.4,20.0,25.0,0.8,6.0,44.0,50.0,65.0,15.0,0.0,33.0,30.0,154.0
24091,SG,30.0,54.0,2.0,1033.0,9.1,0.5,0.6,0.2,1.4,9.0,5.1,5.4,1.1,0.5,Yes,6.9,18.5,0.1,0.0,0.1,0.0,-1.4,-3.5,-4.8,-0.7,126.0,372.0,0.3,77.0,237.0,0.3,49.0,135.0,0.4,0.4,63.0,76.0,0.8,14.0,83.0,97.0,34.0,23.0,7.0,30.0,50.0,392.0
24092,PF,27.0,73.0,73.0,2407.0,17.5,0.5,0.0,0.2,8.1,23.0,15.4,9.5,2.3,1.2,No,11.6,21.6,1.9,2.2,4.1,0.1,-0.5,0.6,0.1,1.3,495.0,963.0,0.5,7.0,30.0,0.2,488.0,933.0,0.5,0.5,105.0,163.0,0.6,176.0,484.0,660.0,135.0,112.0,37.0,136.0,182.0,1102.0
24093,C,23.0,73.0,60.0,1774.0,16.1,0.6,0.0,0.5,8.4,19.7,14.0,6.3,1.6,2.9,Yes,11.2,15.4,3.6,2.7,6.3,0.2,-1.0,2.0,1.0,1.3,231.0,437.0,0.5,1.0,10.0,0.1,230.0,427.0,0.5,0.5,175.0,232.0,0.8,138.0,317.0,455.0,71.0,57.0,63.0,68.0,204.0,638.0


## Generando los Mappers

In [11]:
mapper = DataFrameMapper([
    (['Age'], [StandardScaler()]),
    (['G'], [StandardScaler()]),
    (['GS'], [StandardScaler()]),
    (['MP'], [StandardScaler()]),
    (['PER'], [StandardScaler()]),
    (['ORB%'], [StandardScaler()]),
    (['DRB%'], [StandardScaler()]),
    (['TRB%'], [StandardScaler()]),
    (['AST%'], [StandardScaler()]),
    (['TOV%'], [StandardScaler()]),
    (['OWS'], [StandardScaler()]),
    (['DWS'], [StandardScaler()]),
    (['WS'], [StandardScaler()]),
    (['STL%'], [StandardScaler()]),
    (['BLK%'], [StandardScaler()]),
    (['OWS'], [StandardScaler()]),
    (['OBPM'], [StandardScaler()]),
    (['DBPM'], [StandardScaler()]),
    (['BPM'], [StandardScaler()]),
    (['VORP'], [StandardScaler()]),
    (['USG%'], [StandardScaler()]),
    (['PTS'], [StandardScaler()]),
    (['FG'], [StandardScaler()]),
    (['3P'], [StandardScaler()]),
    (['3PA'], [StandardScaler()]),
    (['2P'],[StandardScaler()]),
    (['2PA'],[StandardScaler()]),
    (['FT'], [StandardScaler()]),
    (['FTA'],[StandardScaler()]),
    (['ORB'],[StandardScaler()]),
    (['DRB'],[StandardScaler()]),
    (['TRB'],[StandardScaler()]),
    (['STL'],[StandardScaler()]),
    (['BLK'],[StandardScaler()]),
    (['TOV'],[StandardScaler()]),
    (['PF'],[StandardScaler()]),
    (['Pos'],[OneHotEncoder()]), 
])

mapper2 = DataFrameMapper([
    (['Age'], [MinMaxScaler()]),
    (['G'], [MinMaxScaler()]),
    (['GS'], [MinMaxScaler()]),
    (['MP'], [MinMaxScaler()]),
    (['PER'], [MinMaxScaler()]),
    (['ORB%'], [MinMaxScaler()]),
    (['DRB%'], [MinMaxScaler()]),
    (['TRB%'], [MinMaxScaler()]),
    (['AST%'], [MinMaxScaler()]),
    (['TOV%'], [MinMaxScaler()]),
    (['OWS'], [MinMaxScaler()]),
    (['DWS'], [MinMaxScaler()]),
    (['WS'], [MinMaxScaler()]),
    (['STL%'], [MinMaxScaler()]),
    (['BLK%'], [MinMaxScaler()]),
    (['OWS'], [MinMaxScaler()]),
    (['OBPM'], [MinMaxScaler()]),
    (['DBPM'], [MinMaxScaler()]),
    (['BPM'], [MinMaxScaler()]),
    (['VORP'], [MinMaxScaler()]),
    (['USG%'], [MinMaxScaler()]),
    (['PTS'], [MinMaxScaler()]),
    (['FG'], [MinMaxScaler()]),
    (['3P'], [MinMaxScaler()]),
    (['3PA'], [MinMaxScaler()]),
    (['2P'],[MinMaxScaler()]),
    (['2PA'],[MinMaxScaler()]),
    (['FT'], [MinMaxScaler()]),
    (['FTA'],[MinMaxScaler()]),
    (['ORB'],[MinMaxScaler()]),
    (['DRB'],[MinMaxScaler()]),
    (['TRB'],[MinMaxScaler()]),
    (['STL'],[MinMaxScaler()]),
    (['BLK'],[MinMaxScaler()]),
    (['TOV'],[MinMaxScaler()]),
    (['PF'],[MinMaxScaler()]),
    (['Pos'],[OneHotEncoder()]), 
])


mapper3 = DataFrameMapper([
    (['Age'], [RobustScaler()]),
    (['G'], [RobustScaler()]),
    (['GS'], [RobustScaler()]),
    (['MP'], [RobustScaler()]),
    (['PER'], [RobustScaler()]),
    (['ORB%'], [RobustScaler()]),
    (['DRB%'], [RobustScaler()]),
    (['TRB%'], [RobustScaler()]),
    (['AST%'], [RobustScaler()]),
    (['TOV%'], [RobustScaler()]),
    (['OWS'], [RobustScaler()]),
    (['DWS'], [RobustScaler()]),
    (['WS'], [RobustScaler()]),
    (['STL%'], [RobustScaler()]),
    (['BLK%'], [RobustScaler()]),
    (['OWS'], [RobustScaler()]),
    (['OBPM'], [RobustScaler()]),
    (['DBPM'], [RobustScaler()]),
    (['BPM'], [RobustScaler()]),
    (['VORP'], [RobustScaler()]),
    (['USG%'], [RobustScaler()]),
    (['PTS'], [RobustScaler()]),
    (['FG'], [RobustScaler()]),
    (['3P'], [RobustScaler()]),
    (['3PA'], [RobustScaler()]),
    (['2P'],[RobustScaler()]),
    (['2PA'],[RobustScaler()]),
    (['FT'], [RobustScaler()]),
    (['FTA'],[RobustScaler()]),
    (['ORB'],[RobustScaler()]),
    (['DRB'],[RobustScaler()]),
    (['TRB'],[RobustScaler()]),
    (['STL'],[RobustScaler()]),
    (['BLK'],[RobustScaler()]),
    (['TOV'],[RobustScaler()]),
    (['PF'],[RobustScaler()]),
    (['Pos'],[OneHotEncoder()]), 
])

## Dividiendo el Dataset

### Dataset con Binning

In [12]:
#Dejamos un 80% del dataset para entrenar, un 10% para validar y un 10% para testear
train_binning, not_train_binning = train_test_split(df_binning, test_size=0.2, random_state=42)
validation_binning, test_binning = train_test_split(not_train_binning, test_size=0.5,random_state=42)

### Dataset sin Binning

In [13]:
#Dejamos un 80% del dataset para entrenar, un 10% para validar y un 10% para testear
train, not_train = train_test_split(df, test_size=0.2, random_state=42)
validation, test = train_test_split(not_train, test_size=0.5,random_state=42)

#### Entrenamos los Mappers con el dataset sin Binning

In [14]:
mapper.fit(train)
mapper2.fit(train)

DataFrameMapper(drop_cols=[],
                features=[(['Age'], [MinMaxScaler()]),
                          (['G'], [MinMaxScaler()]), (['GS'], [MinMaxScaler()]),
                          (['MP'], [MinMaxScaler()]),
                          (['PER'], [MinMaxScaler()]),
                          (['ORB%'], [MinMaxScaler()]),
                          (['DRB%'], [MinMaxScaler()]),
                          (['TRB%'], [MinMaxScaler()]),
                          (['AST%'], [MinMaxScaler()]),
                          (['TOV%'], [MinMaxScaler()]),
                          (['OWS'], [MinMaxScaler()]),
                          (['DWS'], [Min...
                          (['DBPM'], [MinMaxScaler()]),
                          (['BPM'], [MinMaxScaler()]),
                          (['VORP'], [MinMaxScaler()]),
                          (['USG%'], [MinMaxScaler()]),
                          (['PTS'], [MinMaxScaler()]),
                          (['FG'], [MinMaxScaler()]),
              

#### Entrenamos los Mappers con el dataset con Binning

In [15]:
mapper.fit(train_binning)
mapper2.fit(train_binning)

DataFrameMapper(drop_cols=[],
                features=[(['Age'], [MinMaxScaler()]),
                          (['G'], [MinMaxScaler()]), (['GS'], [MinMaxScaler()]),
                          (['MP'], [MinMaxScaler()]),
                          (['PER'], [MinMaxScaler()]),
                          (['ORB%'], [MinMaxScaler()]),
                          (['DRB%'], [MinMaxScaler()]),
                          (['TRB%'], [MinMaxScaler()]),
                          (['AST%'], [MinMaxScaler()]),
                          (['TOV%'], [MinMaxScaler()]),
                          (['OWS'], [MinMaxScaler()]),
                          (['DWS'], [Min...
                          (['DBPM'], [MinMaxScaler()]),
                          (['BPM'], [MinMaxScaler()]),
                          (['VORP'], [MinMaxScaler()]),
                          (['USG%'], [MinMaxScaler()]),
                          (['PTS'], [MinMaxScaler()]),
                          (['FG'], [MinMaxScaler()]),
              

## Entrenando los Modelos

### Logistic Regression

#### Diferencias entre el dataset con binning y el dataset sin Binning

In [19]:
# DATASET SIN BINNING
lr_model_sin_binning = Pipeline([
    ('mapper', mapper2),
    ('imputer', IterativeImputer(random_state=42)),
    ('poli', PolynomialFeatures(degree=2)),
    ('classifier', LogisticRegression(random_state=42,max_iter=3000, solver = 'liblinear')),
])

lr_model_sin_binning.fit(train, train.morePoints)

y_pred_lr_sin_binning = lr_model_sin_binning.predict(validation)

train_predict_lr_sin_binning = lr_model_sin_binning.predict(train)

print("VALIDATION ACCURACY")
print(metrics.accuracy_score(validation.morePoints, y_pred_lr_sin_binning))
print("-" * 30)
print("TRAIN ACCURACY")
print(metrics.accuracy_score(train.morePoints, train_predict_lr_sin_binning))

VALIDATION ACCURACY
0.671900826446281
------------------------------
TRAIN ACCURACY
0.6811324653854102


In [20]:
# DATASET CON BINNING
lr_model_con_binning = Pipeline([
    ('mapper', mapper2),
    ('imputer', IterativeImputer(random_state=42)),
    ('poli', PolynomialFeatures(degree=2)),
    ('classifier', LogisticRegression(random_state=42,max_iter=3000, solver = 'liblinear')),
])

lr_model_con_binning.fit(train_binning, train_binning.morePoints)

y_pred_lr_con_binning = lr_model_con_binning.predict(validation_binning)

train_predict_lr_con_binning = lr_model_con_binning.predict(train_binning)

print("VALIDATION ACCURACY")
print(metrics.accuracy_score(validation_binning.morePoints, y_pred_lr_con_binning))
print("-" * 30)
print("TRAIN ACCURACY")
print(metrics.accuracy_score(train_binning.morePoints, train_predict_lr_con_binning))

VALIDATION ACCURACY
0.671900826446281
------------------------------
TRAIN ACCURACY
0.6811324653854102


### Conclusion
Utilizando logistic regresion y ajustando las variables de entrada, logramos obtener un accuracy del 67%.
Si bien un 67% no es lo ideal, es un resultado que, teniendo en cuenta el dataset y la cantidad de columnas que el mismo posee, no es un resultado totalmente malo.

Pensamos que esto se debe a que este modelo no es el adecuado para poder predecir la variable morePoints. Creemos que al tener 50 variables/columnas al modelo se le dificulta obtener una función sigmoide que se ajuste a la distribución de los datos.

#### Metodo Binning Conclusiones

Luego de realizar los respectivos experimentos, concluimos que no consideramos determinante aplicar esta técnica para mejorar el rendimiento de estos modelos. Podemos ver que en los resultados, tanto de validacion como de train, hay un cambio mínimo y casi imperceptible, puesto que recién dicho cambio se da en el tercer decimal. 

Por otro lado, no creemos que el modelo llegue a overfitear sin haberle realizado la técnica de binning, ya que el dataset orgininal no posee un gran numero de decimales despues de la coma (teniendo en la mayoria de sus columnas un maximo de tres decimales).

### KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:

ks = [5, 10, 20, 35, 50, 75]

for k in ks:
    knn_model = Pipeline([
        ('mapper', mapper2),
        ('imputer', IterativeImputer(random_state=42)),
        ('classifier', KNeighborsClassifier(n_neighbors=k, algorithm = 'ball_tree')),
    ])

    knn_model.fit(train, train.morePoints)

    y_pred_knn = knn_model.predict(validation)

    train_predict_knn = knn_model.predict(train)
    print("VALOR DE k: ", k)
    print("VALIDATION ACCURACY")
    print(metrics.accuracy_score(validation.morePoints, y_pred_knn))
    print("TRAIN ACCURACY")
    print(metrics.accuracy_score(train.morePoints, train_predict_knn))
    print("-" * 30)

VALOR DE k:  5
VALIDATION ACCURACY
0.5991735537190083
TRAIN ACCURACY
0.7399256044637321
------------------------------
VALOR DE k:  10
VALIDATION ACCURACY
0.6090909090909091
TRAIN ACCURACY
0.6963215540400909
------------------------------
VALOR DE k:  20
VALIDATION ACCURACY
0.6247933884297521
TRAIN ACCURACY
0.6826823723909898
------------------------------
VALOR DE k:  35
VALIDATION ACCURACY
0.6446280991735537
TRAIN ACCURACY
0.6688365364744782
------------------------------
VALOR DE k:  50
VALIDATION ACCURACY
0.6578512396694215
TRAIN ACCURACY
0.667803265137425
------------------------------
VALOR DE k:  75
VALIDATION ACCURACY
0.6520661157024793
TRAIN ACCURACY
0.662120272783633
------------------------------


### Conclusion
Utilizando el modelo KNN con distintos valores de K, podemos ver que la mejor opción es con k=50 con un 65% de aciertos en el set Validation, y luego el modelo empieza a reducir la performance sobre el rendimiento.<br>


### Gradient Boosting

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

Un importante hyper parametro para el algoritmo AdaBoost es el número de "decision trees" usados.
El siguiente ejemplo explora el efecto de los números de árboles con valores entre 5 y 100

In [59]:
def get_models():
    models = dict()
    # define number of trees to consider
    n_trees = [5, 10, 50, 75, 100]
    for n in n_trees:
        models[str(n)] = GradientBoostingClassifier(n_estimators=n, max_depth=3, n_iter_no_change=25, validation_fraction=0.2, random_state=0)
    return models

In [60]:
modelos = get_models()
for modelo in modelos.values():
    print(modelo)
    boo_model = Pipeline([
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', modelo),
    ])
    boo_model.fit(train, train.morePoints)
    y_pred_ada = boo_model.predict(validation)
    train_predict_ada = boo_model.predict(train)
    test_predict_ada = boo_model.predict(test)

    print("TRAIN ACCURACY")
    print(metrics.accuracy_score(train.morePoints, train_predict_ada))
    print("VALIDATION ACCURACY")
    print(metrics.accuracy_score(validation.morePoints, y_pred_ada))
    print("-" * 30)

GradientBoostingClassifier(n_estimators=5, n_iter_no_change=25, random_state=0,
                           validation_fraction=0.2)
TRAIN ACCURACY
0.6472411655300682
VALIDATION ACCURACY
0.6363636363636364
------------------------------
GradientBoostingClassifier(n_estimators=10, n_iter_no_change=25, random_state=0,
                           validation_fraction=0.2)
TRAIN ACCURACY
0.6637735069229179
VALIDATION ACCURACY
0.6537190082644628
------------------------------
GradientBoostingClassifier(n_estimators=50, n_iter_no_change=25, random_state=0,
                           validation_fraction=0.2)
TRAIN ACCURACY
0.6860921678032651
VALIDATION ACCURACY
0.6661157024793388
------------------------------
GradientBoostingClassifier(n_estimators=75, n_iter_no_change=25, random_state=0,
                           validation_fraction=0.2)
TRAIN ACCURACY
0.6886753461458979
VALIDATION ACCURACY
0.6661157024793388
------------------------------
GradientBoostingClassifier(n_iter_no_change=25, rando

### Conclusion
Utilizando el modelo Gradient Boosting con distintos valores de estimators, podemos ver que la mejor opción es con n_estimators=100 con un 66% de aciertos en el set Validation.<br>


## Técnicas utilizadas para el No-Overfitting

Como técnicas utilizadas para que los modelos no overfiteen podemos encontrar varias, entre ellas:<br>

•**Logistic Regression:** para la Regresión Logística se utilizaron los siguientes parámetros: "solver = 'liblinear'" y "max_iter=3000". Liblinear se utiliza para datasets pequeños por lo cuál decidimos utilizarlo para nuestro modelo y por otro lado, "max-iter", es el número de iteraciones máximas para que el algoritmo converga.<br>
•**KNN:** en este modelo probamos con distintos "K" y llegamos a la conclusión que el algoritmo puede llegar hasta los 100 k sin comenzar a overfitear o empeorar su performance general. Decidimos quedarnos con el valor 50 que es con el que mejor resultado obtubimos en el set Validation (no es un k muy chico por lo que es difícil que overfitee).<br>
•**Gradient Boosting:** aquí decidimos variar el número de árboles, a su vez, realizamos también distintas pruebas con diferentes profundidades, obteniendo los mejores resultados con n_trees=100 y max_depth=3.<br>

También se intentó realizar un Data Binning para disminuir las probabilidades de sobreentrenamiento, pero concluimos que no es determinante esta técnica para nuestro dataset.


## Valor final de la métrica 

Teniendo en cuenta los resultados obtenidos en el set Validation de cada uno de los modelos, podemos concluir que el de mejor performance es **Logistic Regression** con un 67% en la métrica Accuracy.

Habiendo elegido este modelo, la métrica que se le entregará al cliente será la del modelo entrenado con el set de Test. 

In [29]:
#lr_model_sin_binning.fit(test, test.morePoints)
test_predict = lr_model_sin_binning.predict(test)
print("Test ACCURACY")
print(metrics.accuracy_score(test.morePoints, test_predict))

Test ACCURACY
0.6801652892561983


Como podemos observar en la predicción del set de Test, el modelo se comporta aún mejor alcanzando el 68% de Accuracy. Esta métrica es confiable y podremos informársela al cliente ya que entrenamos los modelos con los sets de Train y Validation pero no con el de Test; por lo tanto, al realizar una predicción de este set sabemos que el porcentaje de Accuracy es realista.

## RANDOM FOREST

#### Diferencias entre el dataset con binning y el dataset sin Binning

Intentamos, también, realizar un modelo Random Forest sin éxito.

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
forest_model = RandomForestClassifier(random_state=42, n_estimators = 100, max_depth=4, max_features=40, class_weight = 'balanced')

rf_model = Pipeline([
    ('mapper', mapper2),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', forest_model),
])

rf_model.fit(train, train.morePoints)

y_pred_rf = rf_model.predict(validation)

train_predict_rf = rf_model.predict(train)

print("VALIDATION ACCURACY")
print(metrics.accuracy_score(validation.morePoints, y_pred_rf))
print("-" * 30)
print("TRAIN ACCURACY")
print(metrics.accuracy_score(train.morePoints, train_predict_rf))

VALIDATION ACCURACY
0.6421487603305785
------------------------------
TRAIN ACCURACY
0.6703864434800578


In [40]:
forest_model = RandomForestClassifier(random_state=42, n_estimators = 100, max_depth=4, max_features=40, class_weight = 'balanced')

rf_model = Pipeline([
    ('mapper', mapper2),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', forest_model),
])

rf_model.fit(train_binning, train_binning.morePoints)

y_pred_rf_binning = rf_model.predict(validation_binning)
train_predict_rf_binning = rf_model.predict(train_binning)

print("VALIDATION ACCURACY")
print(metrics.accuracy_score(validation_binning.morePoints, y_pred_rf_binning))
print("-" * 30)
print("TRAIN ACCURACY")
print(metrics.accuracy_score(train_binning.morePoints, train_predict_rf_binning))

VALIDATION ACCURACY
0.6421487603305785
------------------------------
TRAIN ACCURACY
0.6703864434800578
