In [54]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import keras
import h5py
import PIL
import seaborn as sns
import plotly
import sklearn_pandas
from IPython.display import Image
from IPython.core.display import HTML 
#from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
# Esta celda lo que hace es obtener el dataset que se encuentra en el repositorio, utilizando un path relativo
path = '..//TP2//Seasons_Stats.csv'
data = pd.read_csv(path)

In [56]:
# Con esta celda buscamos reducir la cantidad de columnas que posee el dataset, eliminando las columnas que pensamos no son necesarias para realizar la predicción.
columns = ['blanl','blank2', 'Tm']
data = data.drop(columns, axis=1)

In [57]:
# Eliminamos todas las filas de datos que sean anteriores al año 1980. Elegimos este año ya que a partir de aquí se comienzan a tener en cuenta estadísticas como, por ejemplo, los tiros de 3 puntos.
# Esto lo hacemos ya que, antes de ese año, el dataset poseía muchas estadísticas que todavía no se tenian en cuenta y había muchos valores en NaN
data = data.drop(data[data.Year < 1980].index, axis=0)
# Además, eliminamos todos los datos en NaN para evitar inconsistencias en el dataset.
#data = data.dropna()

#Con esta linea mostramos todas las columnas del dataset
pd.set_option('display.max_columns', 100)

In [58]:
df = pd.DataFrame(data)
display(df)

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
312,312,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
487,487,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
618,618,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
779,779,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
911,911,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24686,24686,2017.0,Cody Zeller,PF,24.0,62.0,58.0,1725.0,16.7,0.604,0.002,0.442,8.6,17.3,12.9,9.1,1.8,3.0,10.9,15.5,3.4,2.2,5.6,0.157,-0.2,2.3,2.1,1.8,253.0,443.0,0.571,0.0,1.0,0.000,253.0,442.0,0.572,0.571,133.0,196.0,0.679,135.0,270.0,405.0,99.0,62.0,58.0,65.0,189.0,639.0
24687,24687,2017.0,Tyler Zeller,C,27.0,51.0,5.0,525.0,13.0,0.508,0.006,0.247,9.2,17.0,13.2,12.2,0.7,3.3,10.2,16.5,0.5,0.6,1.0,0.094,-3.2,0.8,-2.5,-0.1,78.0,158.0,0.494,0.0,1.0,0.000,78.0,157.0,0.497,0.494,22.0,39.0,0.564,43.0,81.0,124.0,42.0,7.0,21.0,20.0,61.0,178.0
24688,24688,2017.0,Stephen Zimmerman,C,20.0,19.0,0.0,108.0,7.3,0.346,0.000,0.161,10.8,24.9,17.6,5.3,0.9,3.7,8.3,14.8,-0.1,0.1,0.0,-0.005,-7.8,0.4,-7.3,-0.1,10.0,31.0,0.323,0.0,0.0,,10.0,31.0,0.323,0.323,3.0,5.0,0.600,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
24689,24689,2017.0,Paul Zipser,SF,22.0,44.0,18.0,843.0,6.9,0.503,0.448,0.181,1.9,14.2,8.0,6.1,0.9,1.5,14.4,14.4,-0.3,0.8,0.5,0.030,-3.6,-0.1,-3.7,-0.4,88.0,221.0,0.398,33.0,99.0,0.333,55.0,122.0,0.451,0.473,31.0,40.0,0.775,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0


In [59]:
df[['3P%','FT%', '2P%', '3PAr', 'FTr', 'FG%', 'eFG%', 'TS%', 'TOV%']] = df[['3P%','FT%', '2P%', '3PAr', 'FTr', 'FG%', 'eFG%', 'TS%', 'TOV%']].fillna(0)

In [60]:
# Este celda elimina los años que figuran mas de una vez para un mismo jugador.
df = df.drop_duplicates(subset=['Player','Year'])

In [61]:
# Esta celda renombra la primer columna del dataFrame (denominada 'Unnamed: 0') y le setea el nombre de "id"
df = df.rename(columns = {'Unnamed: 0' : 'id'})

In [62]:
# Con esta celda, creamos la que será nuestra columna TARGET y la llamamos "morePoints" ("Mas Puntos")
# Dicha columna podrá tener solo dos valores "Yes"/"No" dependiendo si el jugador hizo mas puntos que la temporada anterior
# Cabe recalcar que si es la 1er temporada del jugador o si una temporada no jugo por algun motivo, se colocará en "Yes" ya que consideramos que al no haber jugado efectivamente realizó mas puntos.
df.insert(18, 'morePoints', 'No', allow_duplicates = True)

In [63]:
# Esta es una funcion que realizamos para setear la columna "morePoints"
# Buscamos el año anterior y comparamos los puntos que realizó en ese año y el año actual.
# Si efectivamente hizo mas puntos no hacemos nada, pero si no seteamos la columna en "No"
def funcionSetMorePoints(id):
    auxiliar = df.loc[id]
    temporadaSiguiente = df.loc[(df.Player == auxiliar.Player) & (df.Year == auxiliar.Year  + 1)]      
    if ((not temporadaSiguiente.empty) and (auxiliar.PTS <= temporadaSiguiente.PTS.item())):
        df.loc[df.id == id,'morePoints'] = 'Yes'
    else:
        if temporadaSiguiente.empty:
            df.loc[df.id == id,'morePoints'] = 'None'
    return id

In [64]:
# Llamamos a la funcion defenida anteriormente
df.id = df.id.apply(funcionSetMorePoints)

In [65]:
#Cuando la columna morePoints se encuentra en None quiere decir que es la ultima temporada del jugador
#En este caso se debe eliminar la fila ya que no puede ser utilizada para entrenar los modelos.
df = df[df['morePoints'] != 'None']

In [66]:
df['GS'] = df['GS'].fillna(32.033531)

In [67]:
df = df.dropna()

In [68]:
df = df.round(2)
display(df)

Unnamed: 0,id,Year,Player,Pos,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,morePoints,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
5727,5727,1980.0,Kareem Abdul-Jabbar*,C,32.0,82.0,32.03,3143.0,25.3,0.64,0.00,0.34,7.2,22.2,15.4,16.5,1.2,4.6,Yes,15.7,24.1,9.5,5.3,14.8,0.23,4.0,2.7,6.7,6.8,835.0,1383.0,0.60,0.0,1.0,0.00,835.0,1382.0,0.60,0.60,364.0,476.0,0.76,190.0,696.0,886.0,371.0,81.0,280.0,297.0,216.0,2034.0
5728,5728,1980.0,Tom Abernethy,PF,25.0,67.0,32.03,1222.0,11.0,0.51,0.00,0.26,5.4,12.0,8.6,9.3,1.4,0.6,No,9.9,13.3,1.2,0.8,2.0,0.08,-1.5,-0.1,-1.6,0.1,153.0,318.0,0.48,0.0,1.0,0.00,153.0,317.0,0.48,0.48,56.0,82.0,0.68,62.0,129.0,191.0,87.0,35.0,12.0,39.0,118.0,362.0
5729,5729,1980.0,Alvan Adams,C,25.0,75.0,32.03,2168.0,19.2,0.57,0.00,0.27,8.2,22.4,15.4,21.6,2.3,1.4,No,18.2,21.9,3.1,3.9,7.0,0.16,1.6,2.8,4.4,3.5,465.0,875.0,0.53,0.0,2.0,0.00,465.0,873.0,0.53,0.53,188.0,236.0,0.80,158.0,451.0,609.0,322.0,108.0,55.0,218.0,237.0,1118.0
5730,5730,1980.0,Tiny Archibald*,PG,31.0,80.0,80.00,2864.0,15.3,0.57,0.02,0.55,2.3,5.3,3.8,30.2,1.7,0.2,No,19.7,17.0,5.9,2.9,8.9,0.15,1.1,-1.1,0.0,1.5,383.0,794.0,0.48,4.0,18.0,0.22,379.0,776.0,0.49,0.48,361.0,435.0,0.83,59.0,138.0,197.0,671.0,106.0,10.0,242.0,218.0,1131.0
5731,5731,1980.0,Dennis Awtrey,C,31.0,26.0,32.03,560.0,7.4,0.52,0.00,0.83,6.0,16.9,11.5,9.0,1.0,1.5,Yes,24.8,7.9,0.1,0.5,0.6,0.05,-2.9,1.5,-1.4,0.1,27.0,60.0,0.45,0.0,0.0,0.00,27.0,60.0,0.45,0.45,32.0,50.0,0.64,29.0,86.0,115.0,40.0,12.0,15.0,27.0,66.0,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24090,24090,2016.0,Joe Young,PG,23.0,41.0,0.00,384.0,9.9,0.43,0.27,0.15,1.7,12.5,7.1,26.9,1.9,0.0,No,15.5,24.4,-0.5,0.5,-0.1,-0.01,-3.9,-1.5,-5.4,-0.3,62.0,169.0,0.37,10.0,46.0,0.22,52.0,123.0,0.42,0.40,20.0,25.0,0.80,6.0,44.0,50.0,65.0,15.0,0.0,33.0,30.0,154.0
24091,24091,2016.0,Nick Young,SG,30.0,54.0,2.00,1033.0,9.1,0.48,0.64,0.20,1.4,9.0,5.1,5.4,1.1,0.5,Yes,6.9,18.5,0.1,0.0,0.1,0.00,-1.4,-3.5,-4.8,-0.7,126.0,372.0,0.34,77.0,237.0,0.32,49.0,135.0,0.36,0.44,63.0,76.0,0.83,14.0,83.0,97.0,34.0,23.0,7.0,30.0,50.0,392.0
24092,24092,2016.0,Thaddeus Young,PF,27.0,73.0,73.00,2407.0,17.5,0.53,0.03,0.17,8.1,23.0,15.4,9.5,2.3,1.2,No,11.6,21.6,1.9,2.2,4.1,0.08,-0.5,0.6,0.1,1.3,495.0,963.0,0.51,7.0,30.0,0.23,488.0,933.0,0.52,0.52,105.0,163.0,0.64,176.0,484.0,660.0,135.0,112.0,37.0,136.0,182.0,1102.0
24093,24093,2016.0,Cody Zeller,C,23.0,73.0,60.00,1774.0,16.1,0.59,0.02,0.53,8.4,19.7,14.0,6.3,1.6,2.9,Yes,11.2,15.4,3.6,2.7,6.3,0.17,-1.0,2.0,1.0,1.3,231.0,437.0,0.53,1.0,10.0,0.10,230.0,427.0,0.54,0.53,175.0,232.0,0.75,138.0,317.0,455.0,71.0,57.0,63.0,68.0,204.0,638.0


# TP N° 2 - Experimentación

## Métrica a utilizar

Accuracy<br>
Accuracy nos indica el porcentaje de casos que se acertaron. 
Los problemas que pueden aparecer al utilizar esta métrica no aplicarían para este modelo, ya que: 
•	Como pudimos apreciar en el análisis exploratorio de datos, no poseemos un gran desbalanceo en los datos. 
•	El resultado de la predicción no tiene diferente impacto (tiene igual impacto errar por Si o por No). Creemos que el costo de obtener falsos positivos no es alto. 
Por esta razón consideramos que la métrica Accuracy es la que más se adapta a nuestro caso de estudio.


**No tenemos ninguno de estos dos problemas, para mi podría ser Accuracy ANASHEEEIIII.**

## Técnica de feature engineering

Podría ser Binning (para achicar los decimales). Data pasada creemos que no xq tendríamos que tener los registros de cada jugador al momento de predecir y no los vamos a tener

## Dividiendo el Dataset

In [69]:
df['Pos'] = df['Pos'].str.replace('-',' ')
#jugador = df[df['Player'] == "Andres Nocioni"]
#display(jugador)
columns = ['id','Player','Year']
df = df.drop(columns, axis=1)


In [70]:
from sklearn.preprocessing import MinMaxScaler
mapper = DataFrameMapper([
    (['Age'], [StandardScaler()]),
    (['G'], [StandardScaler()]),
    (['GS'], [StandardScaler()]),
    (['MP'], [StandardScaler()]),
    (['PER'], [StandardScaler()]),
    (['ORB%'], [StandardScaler()]),
    (['DRB%'], [StandardScaler()]),
    (['TRB%'], [StandardScaler()]),
    (['AST%'], [StandardScaler()]),
    (['TOV%'], [StandardScaler()]),
    (['OWS'], [StandardScaler()]),
    (['DWS'], [StandardScaler()]),
    (['WS'], [StandardScaler()]),
    (['STL%'], [StandardScaler()]),
    (['BLK%'], [StandardScaler()]),
    (['OWS'], [StandardScaler()]),
    (['OBPM'], [StandardScaler()]),
    (['DBPM'], [StandardScaler()]),
    (['BPM'], [StandardScaler()]),
    (['VORP'], [StandardScaler()]),
    (['USG%'], [StandardScaler()]),
    (['PTS'], [StandardScaler()]),
    (['FG'], [StandardScaler()]),
    (['3P'], [StandardScaler()]),
    (['3PA'], [StandardScaler()]),
    (['2P'],[StandardScaler()]),
    (['2PA'],[StandardScaler()]),
    (['FT'], [StandardScaler()]),
    (['FTA'],[StandardScaler()]),
    (['ORB'],[StandardScaler()]),
    (['DRB'],[StandardScaler()]),
    (['TRB'],[StandardScaler()]),
    (['STL'],[StandardScaler()]),
    (['BLK'],[StandardScaler()]),
    (['TOV'],[StandardScaler()]),
    (['PF'],[StandardScaler()]),
    (['Pos'],[OneHotEncoder()]), 
])
mapper2 = DataFrameMapper([
    (['Age'], [MinMaxScaler()]),
    (['G'], [MinMaxScaler()]),
    (['GS'], [MinMaxScaler()]),
    (['MP'], [MinMaxScaler()]),
    (['PER'], [MinMaxScaler()]),
    (['ORB%'], [MinMaxScaler()]),
    (['DRB%'], [MinMaxScaler()]),
    (['TRB%'], [MinMaxScaler()]),
    (['AST%'], [MinMaxScaler()]),
    (['TOV%'], [MinMaxScaler()]),
    (['OWS'], [MinMaxScaler()]),
    (['DWS'], [MinMaxScaler()]),
    (['WS'], [MinMaxScaler()]),
    (['STL%'], [MinMaxScaler()]),
    (['BLK%'], [MinMaxScaler()]),
    (['OWS'], [MinMaxScaler()]),
    (['OBPM'], [MinMaxScaler()]),
    (['DBPM'], [MinMaxScaler()]),
    (['BPM'], [MinMaxScaler()]),
    (['VORP'], [MinMaxScaler()]),
    (['USG%'], [MinMaxScaler()]),
    (['PTS'], [MinMaxScaler()]),
    (['FG'], [MinMaxScaler()]),
    (['3P'], [MinMaxScaler()]),
    (['3PA'], [MinMaxScaler()]),
    (['2P'],[MinMaxScaler()]),
    (['2PA'],[MinMaxScaler()]),
    (['FT'], [MinMaxScaler()]),
    (['FTA'],[MinMaxScaler()]),
    (['ORB'],[MinMaxScaler()]),
    (['DRB'],[MinMaxScaler()]),
    (['TRB'],[MinMaxScaler()]),
    (['STL'],[MinMaxScaler()]),
    (['BLK'],[MinMaxScaler()]),
    (['TOV'],[MinMaxScaler()]),
    (['PF'],[MinMaxScaler()]),
    (['Pos'],[OneHotEncoder()]), 
])


In [71]:
display(mapper2)

DataFrameMapper(drop_cols=[],
                features=[(['Age'], [MinMaxScaler()]),
                          (['G'], [MinMaxScaler()]), (['GS'], [MinMaxScaler()]),
                          (['MP'], [MinMaxScaler()]),
                          (['PER'], [MinMaxScaler()]),
                          (['ORB%'], [MinMaxScaler()]),
                          (['DRB%'], [MinMaxScaler()]),
                          (['TRB%'], [MinMaxScaler()]),
                          (['AST%'], [MinMaxScaler()]),
                          (['TOV%'], [MinMaxScaler()]),
                          (['OWS'], [MinMaxScaler()]),
                          (['DWS'], [Min...
                          (['DBPM'], [MinMaxScaler()]),
                          (['BPM'], [MinMaxScaler()]),
                          (['VORP'], [MinMaxScaler()]),
                          (['USG%'], [MinMaxScaler()]),
                          (['PTS'], [MinMaxScaler()]),
                          (['FG'], [MinMaxScaler()]),
              

In [72]:
#Dejamos un 80% del dataset para entrenar, un 10% para validar y un 10% para testear
train, not_train = train_test_split(df, test_size=0.2, random_state=42)
validation, test = train_test_split(not_train, test_size=0.5,random_state=42)

In [73]:
mapper.fit(train)
mapper2.fit(train)

DataFrameMapper(drop_cols=[],
                features=[(['Age'], [MinMaxScaler()]),
                          (['G'], [MinMaxScaler()]), (['GS'], [MinMaxScaler()]),
                          (['MP'], [MinMaxScaler()]),
                          (['PER'], [MinMaxScaler()]),
                          (['ORB%'], [MinMaxScaler()]),
                          (['DRB%'], [MinMaxScaler()]),
                          (['TRB%'], [MinMaxScaler()]),
                          (['AST%'], [MinMaxScaler()]),
                          (['TOV%'], [MinMaxScaler()]),
                          (['OWS'], [MinMaxScaler()]),
                          (['DWS'], [Min...
                          (['DBPM'], [MinMaxScaler()]),
                          (['BPM'], [MinMaxScaler()]),
                          (['VORP'], [MinMaxScaler()]),
                          (['USG%'], [MinMaxScaler()]),
                          (['PTS'], [MinMaxScaler()]),
                          (['FG'], [MinMaxScaler()]),
              

In [74]:
#sample = train.sample(5, random_state=42)
#mapper.transform(sample)

In [75]:
# mapper.transformed_names_

## Feature selection

In [99]:
from sklearn.decomposition import PCA
df2 = df
scaler = StandardScaler()
#df2[['Age','G','GS','MP','PER','ORB%','DRB%','TRB%','AST%','TOV%','OWS','BPM','VORP','USG%','PTS','FG','3P','3PA','2P','2PA','FT','FTA','ORB','DRB','TRB','STL','BLK','TOV','PF']] = scaler.fit_transform(df2[['Age','G','GS','MP','PER','ORB%','DRB%','TRB%','AST%','TOV%','OWS','BPM','VORP','USG%','PTS','FG','3P','3PA','2P','2PA','FT','FTA','ORB','DRB','TRB','STL','BLK','TOV','PF']])
features = ['Age','G','GS','MP','PER','ORB%','DRB%','TRB%','AST%','TOV%','OWS','BPM','VORP','USG%','PTS','FG','3P','3PA','2P','2PA','FT','FTA','ORB','DRB','TRB','STL','BLK','TOV','PF']
#Separo las features del target
scaler.fit(df)
x = scaler.transform(df)
x = x.loc[:,features].values
#x[['Age','G','GS','MP','PER','ORB%','DRB%','TRB%','AST%','TOV%','OWS','BPM','VORP','USG%','PTS','FG','3P','3PA','2P','2PA','FT','FTA','ORB','DRB','TRB','STL','BLK','TOV','PF']] = scaler.fit_transform(x[['Age','G','GS','MP','PER','ORB%','DRB%','TRB%','AST%','TOV%','OWS','BPM','VORP','USG%','PTS','FG','3P','3PA','2P','2PA','FT','FTA','ORB','DRB','TRB','STL','BLK','TOV','PF']])
#Separo el target

y=df.loc[:,['morePoints']].values
pca = PCA(n_components=8)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['col1', 'col2','col3','col4','col5','col6','col7','col8'])

finaldf = pd.concat([principalDf, df[['morePoints']]], axis = 1)
display(finaldf)
display(df)
print(pca.explained_variance_ratio_)


Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,morePoints
0,10.760957,3.153663,0.700871,0.876578,-2.127376,-1.289530,0.595182,-3.120356,
1,-1.945049,0.192869,-0.324158,-0.318154,0.639558,-0.744978,-1.321583,-0.285482,
2,4.777026,1.850503,0.791162,-0.400783,-1.655594,0.617766,-0.327841,0.731889,
3,3.834345,-2.698461,1.501574,-2.223413,-1.577735,-1.485432,0.188862,-0.244429,
4,-4.243273,1.075140,-0.855180,-1.127080,-1.863009,-0.308566,1.015231,-0.956640,
...,...,...,...,...,...,...,...,...,...
24090,,,,,,,,,No
24091,,,,,,,,,Yes
24092,,,,,,,,,No
24093,,,,,,,,,Yes


Unnamed: 0,Pos,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,morePoints,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
5727,C,1.390307,0.968120,-0.000202,1.829040,2.387905,0.64,0.00,0.34,0.232434,1.362038,1.141234,0.308413,1.2,4.6,Yes,0.237187,0.973342,3.428020,5.3,14.8,0.23,4.0,2.7,2.141055,3.917457,3.149565,1383.0,0.60,-0.653873,-0.696243,0.00,3.505519,2.674071,0.60,0.60,2.064104,2.110188,0.76,1.553275,3.422831,2.914316,371.0,0.758760,5.959251,2.964440,0.995576,2.801746
5728,PF,-0.405324,0.261517,-0.000202,-0.337741,-0.551242,0.51,0.00,0.26,-0.200291,-0.337763,-0.331843,-0.458655,1.4,0.6,No,-0.923844,-1.162666,-0.242222,0.8,2.0,0.08,-1.5,-0.1,-0.125796,-0.488136,-0.499121,318.0,0.48,-0.653873,-0.696243,0.00,-0.369715,-0.379311,0.48,0.48,-0.604228,-0.577327,0.68,-0.252349,-0.409703,-0.371095,87.0,-0.405436,-0.492274,-0.829443,-0.289630,-0.574632
5729,C,-0.405324,0.638372,-0.000202,0.729294,1.134143,0.57,0.00,0.27,0.472837,1.395368,1.141234,0.851753,2.3,1.4,No,0.737632,0.538230,0.597954,3.9,7.0,0.16,1.6,2.8,1.512892,1.747538,1.170073,875.0,0.53,-0.653873,-0.687466,0.00,1.403119,1.214755,0.53,0.53,0.539343,0.473123,0.80,1.101869,1.766798,1.604879,322.0,1.442093,0.542859,1.802747,1.270978,0.952008
5730,PG,1.133789,0.873907,1.598351,1.514343,0.332558,0.57,0.02,0.55,-0.945539,-1.454299,-1.371661,1.767974,1.7,0.2,No,1.037898,-0.430885,1.836108,2.9,8.9,0.15,1.1,-1.1,0.311188,0.432436,0.731375,794.0,0.48,-0.561217,-0.547033,0.22,0.914454,0.936654,0.49,0.48,2.038114,1.830522,0.83,-0.294668,-0.348869,-0.342732,671.0,1.391475,-0.540419,2.155666,1.021805,0.978260
5731,C,1.133789,-1.669864,-0.000202,-1.084441,-1.291167,0.52,0.00,0.83,-0.056049,0.478808,0.296381,-0.490617,1.0,1.5,Yes,2.058805,-2.230670,-0.728640,0.5,0.6,0.05,-2.9,1.5,-0.071173,-0.488136,-1.173218,60.0,0.45,-0.653873,-0.705020,0.00,-1.085667,-1.116136,0.45,0.45,-0.812150,-0.795602,0.64,-0.717861,-0.700354,-0.730363,40.0,-0.987533,-0.420055,-1.005902,-0.971576,-1.131976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24090,PG,-0.918362,-0.963261,-1.067570,-1.282959,-0.777330,0.43,0.27,0.15,-1.089781,-0.254439,-0.656786,1.416401,1.9,0.0,No,0.197151,1.032676,-0.993958,0.5,-0.1,-0.01,-3.9,-1.5,-1.163631,-0.751156,-0.985969,169.0,0.37,-0.422233,-0.301275,0.22,-0.943613,-0.935513,0.42,0.40,-0.916111,-0.966130,0.80,-1.042309,-0.984245,-1.037632,65.0,-0.911608,-0.781148,-0.917673,-1.443693,-0.994660
24091,SG,0.877270,-0.350872,-1.000922,-0.550923,-0.941758,0.48,0.64,0.20,-1.161902,-0.837704,-1.090043,-0.874151,1.1,0.5,Yes,-1.524378,-0.134217,-0.728640,0.0,0.1,0.00,-1.4,-3.5,-0.999762,-1.014177,-0.643570,372.0,0.34,1.129750,1.375145,0.32,-0.960660,-0.901109,0.36,0.44,-0.543584,-0.618254,0.83,-0.929457,-0.720632,-0.815453,34.0,-0.709139,-0.612638,-0.961788,-1.181406,-0.514051
24092,PF,0.107713,0.544159,1.365083,0.998873,0.784734,0.53,0.03,0.17,0.448797,1.495356,1.141234,-0.437348,2.3,1.2,No,-0.583542,0.478896,0.067317,2.2,4.1,0.08,-0.5,0.6,0.338499,0.300926,1.330572,963.0,0.51,-0.491725,-0.441708,0.23,1.533809,1.386777,0.52,0.52,-0.179720,-0.024818,0.64,1.355785,1.989855,1.845966,135.0,1.543327,0.109548,0.596939,0.549688,0.919698
24093,C,-0.918362,0.544159,0.931871,0.284884,0.496985,0.59,0.02,0.53,0.520918,0.945420,0.837953,-0.778267,1.6,2.9,Yes,-0.663613,-0.747331,0.819053,2.7,6.3,0.17,-1.0,2.0,0.584302,0.300926,-0.081822,437.0,0.53,-0.630709,-0.617250,0.10,0.067812,-0.063938,0.54,0.53,0.426719,0.445838,0.75,0.819740,0.861049,0.876888,71.0,0.151354,0.735442,-0.402999,0.838204,-0.017287


[0.49239997 0.17116467 0.05748674 0.05483768 0.04190636 0.03412232
 0.03018073 0.01934721]


In [81]:
display(finaldf)

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,morePoints
0,2546.883094,561.410840,387.637198,121.893167,-155.286173,-43.548540,-4.264597,219.339569,100.844471,95.639031,-34.616669,82.593799,-19.823303,-13.239219,-8.642968,
1,-463.706841,-86.878542,26.392117,-99.740526,2.633225,-16.330871,-23.960386,2.016496,-2.261379,-21.534768,8.084508,-5.961572,5.899521,10.923639,1.195121,
2,1009.089641,179.651534,291.869288,-7.589734,-109.659471,15.110113,73.555087,8.186610,89.396819,1.111500,5.855560,23.204721,-20.609704,-6.020630,0.579250,
3,1452.703750,-171.316753,-242.815126,-232.907668,327.651969,-23.028195,1.376890,15.115342,25.152804,-2.403668,-29.194794,-12.770927,-12.092163,-5.174589,2.599330,
4,-1244.294280,-36.726601,42.517502,9.354056,42.432742,-24.521047,1.439889,11.513465,4.275911,-2.875638,-4.089925,-29.151558,2.702054,-4.896909,1.452491,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24090,,,,,,,,,,,,,,,,No
24091,,,,,,,,,,,,,,,,Yes
24092,,,,,,,,,,,,,,,,No
24093,,,,,,,,,,,,,,,,Yes


## Logistic Regression

In [83]:
lr_model_si = Pipeline([
    ('mapper', mapper2),
    ('classifier', LogisticRegression(random_state=42,max_iter=300, solver = 'liblinear'))
])

lr_model_si.fit(train, train.morePoints)

y_pred = lr_model_si.predict(validation)

train_predict3 = lr_model_si.predict(train)

print("VALIDATION ACCURACY")
print(metrics.accuracy_score(validation.morePoints, y_pred))
print("-" * 30)
print("TRAIN ACCURACY")
print(metrics.accuracy_score(train.morePoints, train_predict3))

VALIDATION ACCURACY
0.6537190082644628
------------------------------
TRAIN ACCURACY
0.6682165736722463


### Conclusion
Utilizando logistic regresion y ajustando las variables de entrada, logramos obtener un accuracy del 65%.

## RANDOM FOREST

In [85]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

In [86]:
forest_model = RandomForestClassifier(random_state=42, n_estimators = 200, max_depth=15, max_features=20, class_weight = 'balanced')

rf_model = Pipeline([
    ('mapper', mapper2),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', forest_model),
])

rf_model.fit(train, train.morePoints)

y_pred2 = rf_model.predict(validation)

y_pred2

train_predict = rf_model.predict(train)

print("VALIDATION ACCURACY")
print(metrics.accuracy_score(validation.morePoints, y_pred2))
print("-" * 30)
print("TRAIN ACCURACY")
print(metrics.accuracy_score(train.morePoints, train_predict))

VALIDATION ACCURACY
0.6570247933884298
------------------------------
TRAIN ACCURACY
0.9874974168216574


### KNN

In [87]:
from sklearn.neighbors import KNeighborsClassifier

In [89]:
K = 35

knn_model = Pipeline([
    ('mapper', mapper2),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', KNeighborsClassifier(n_neighbors=K, algorithm = 'ball_tree')),
])

knn_model.fit(train, train.morePoints)

y_pred_knn = knn_model.predict(validation)

y_pred_knn

train_predict_knn = knn_model.predict(train)

print("VALIDATION ACCURACY")
print(metrics.accuracy_score(validation.morePoints, y_pred_knn))
print("-" * 30)
print("TRAIN ACCURACY")
print(metrics.accuracy_score(train.morePoints, train_predict_knn))

VALIDATION ACCURACY
0.6446280991735537
------------------------------
TRAIN ACCURACY
0.6688365364744782


In [90]:
from sklearn.ensemble import GradientBoostingClassifier

Un importante hyper parametro para el algoritmo AdaBoost es el número de "decision trees" usados.
El siguiente ejemplo explora el efecto de los números de árboles con valores entre 5 y 100

In [91]:
def get_models():
    models = dict()
    # define number of trees to consider
    n_trees = [5, 10, 50, 75, 100]
    for n in n_trees:
        models[str(n)] = GradientBoostingClassifier(n_estimators=n, max_depth=3)
    return models


#boost_model = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
#boo_model = Pipeline([
#    ('mapper', mapper),
#    ('imputer', IterativeImputer(random_state=42)),
#    ('classifier', boost_model),
#])
#boo_model.fit(train, train.morePoints)
#y_pred_ada = boo_model.predict(validation)
#train_predict_ada = boo_model.predict(train)

#print("TRAIN ACCURACY")
#print(metrics.accuracy_score(train.morePoints, train_predict_ada))
#print("VALIDATION ACCURACY")
#print(metrics.accuracy_score(validation.morePoints, y_pred_ada))
#print("-" * 30)

In [93]:
modelos = get_models()
for modelo in modelos.values():
    print(modelo)
    boo_model = Pipeline([
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', modelo),
    ])
    boo_model.fit(train, train.morePoints)
    y_pred_ada = boo_model.predict(validation)
    train_predict_ada = boo_model.predict(train)
    test_predict_ada = boo_model.predict(test)

    print("TRAIN ACCURACY")
    print(metrics.accuracy_score(train.morePoints, train_predict_ada))
    print("VALIDATION ACCURACY")
    print(metrics.accuracy_score(validation.morePoints, y_pred_ada))
    print("TEST ACCURACY")
    print(metrics.accuracy_score(test.morePoints, test_predict_ada))
    print("-" * 30)

GradientBoostingClassifier(n_estimators=5)
TRAIN ACCURACY
0.6557139904939037
VALIDATION ACCURACY
0.6429752066115703
TEST ACCURACY
0.6429752066115703
------------------------------
GradientBoostingClassifier(n_estimators=10)
TRAIN ACCURACY
0.6656333953296135
VALIDATION ACCURACY
0.6446280991735537
TEST ACCURACY
0.6578512396694215
------------------------------
GradientBoostingClassifier(n_estimators=50)
TRAIN ACCURACY
0.6854722050010332
VALIDATION ACCURACY
0.6628099173553719
TEST ACCURACY
0.6809917355371901
------------------------------
GradientBoostingClassifier(n_estimators=75)
TRAIN ACCURACY
0.6924984500929944
VALIDATION ACCURACY
0.6595041322314049
TEST ACCURACY
0.6743801652892562
------------------------------
GradientBoostingClassifier()
TRAIN ACCURACY
0.7031411448646414
VALIDATION ACCURACY
0.6636363636363637
TEST ACCURACY
0.6760330578512397
------------------------------


In [None]:
#boo_model.fit(train, train.morePoints)
#y_pred_ada = boo_model.predict(validation)

In [114]:
#train_predict_ada = boo_model.predict(train)

#print("TRAIN ACCURACY")
#print(metrics.accuracy_score(train.morePoints, train_predict_ada))
#print("-" * 30)
#print("VALIDATION ACCURACY")
#print(metrics.accuracy_score(validation.morePoints, y_pred_ada))

In [148]:
#esto es para probar el binning, agrupando los PTS en distintos grupos
#data = df.squeeze()
#bins = [0,250,500,750,1000,1250,1500,1750,2000,2250,2500,2750,3000,3250,3500,3750,4000]
#binned_Data = pd.cut(data.PTS, bins)
#display(binned_Data)

5727     (2000, 2250]
5728       (250, 500]
5729     (1000, 1250]
5730     (1000, 1250]
5731         (0, 250]
             ...     
24090        (0, 250]
24091      (250, 500]
24092    (1000, 1250]
24093      (500, 750]
24094      (250, 500]
Name: PTS, Length: 12098, dtype: category
Categories (16, interval[int64]): [(0, 250] < (250, 500] < (500, 750] < (750, 1000] ... (3000, 3250] < (3250, 3500] < (3500, 3750] < (3750, 4000]]

In [149]:
#Cambio la columna PTS por la columna creada.
#df['PTS'] = binned_Data
#display(df)

Unnamed: 0,id,Year,Player,Pos,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,morePoints,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
5727,5727,1980.0,Kareem Abdul-Jabbar*,C,32.0,82.0,32.03,3143.0,25.3,0.64,0.00,0.34,7.2,22.2,15.4,16.5,1.2,4.6,Yes,15.7,24.1,9.5,5.3,14.8,0.23,4.0,2.7,6.7,6.8,835.0,1383.0,0.60,0.0,1.0,0.00,835.0,1382.0,0.60,0.60,364.0,476.0,0.76,190.0,696.0,886.0,371.0,81.0,280.0,297.0,216.0,"(2000, 2250]"
5728,5728,1980.0,Tom Abernethy,PF,25.0,67.0,32.03,1222.0,11.0,0.51,0.00,0.26,5.4,12.0,8.6,9.3,1.4,0.6,No,9.9,13.3,1.2,0.8,2.0,0.08,-1.5,-0.1,-1.6,0.1,153.0,318.0,0.48,0.0,1.0,0.00,153.0,317.0,0.48,0.48,56.0,82.0,0.68,62.0,129.0,191.0,87.0,35.0,12.0,39.0,118.0,"(250, 500]"
5729,5729,1980.0,Alvan Adams,C,25.0,75.0,32.03,2168.0,19.2,0.57,0.00,0.27,8.2,22.4,15.4,21.6,2.3,1.4,No,18.2,21.9,3.1,3.9,7.0,0.16,1.6,2.8,4.4,3.5,465.0,875.0,0.53,0.0,2.0,0.00,465.0,873.0,0.53,0.53,188.0,236.0,0.80,158.0,451.0,609.0,322.0,108.0,55.0,218.0,237.0,"(1000, 1250]"
5730,5730,1980.0,Tiny Archibald*,PG,31.0,80.0,80.00,2864.0,15.3,0.57,0.02,0.55,2.3,5.3,3.8,30.2,1.7,0.2,No,19.7,17.0,5.9,2.9,8.9,0.15,1.1,-1.1,0.0,1.5,383.0,794.0,0.48,4.0,18.0,0.22,379.0,776.0,0.49,0.48,361.0,435.0,0.83,59.0,138.0,197.0,671.0,106.0,10.0,242.0,218.0,"(1000, 1250]"
5731,5731,1980.0,Dennis Awtrey,C,31.0,26.0,32.03,560.0,7.4,0.52,0.00,0.83,6.0,16.9,11.5,9.0,1.0,1.5,Yes,24.8,7.9,0.1,0.5,0.6,0.05,-2.9,1.5,-1.4,0.1,27.0,60.0,0.45,0.0,0.0,0.00,27.0,60.0,0.45,0.45,32.0,50.0,0.64,29.0,86.0,115.0,40.0,12.0,15.0,27.0,66.0,"(0, 250]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24090,24090,2016.0,Joe Young,PG,23.0,41.0,0.00,384.0,9.9,0.43,0.27,0.15,1.7,12.5,7.1,26.9,1.9,0.0,No,15.5,24.4,-0.5,0.5,-0.1,-0.01,-3.9,-1.5,-5.4,-0.3,62.0,169.0,0.37,10.0,46.0,0.22,52.0,123.0,0.42,0.40,20.0,25.0,0.80,6.0,44.0,50.0,65.0,15.0,0.0,33.0,30.0,"(0, 250]"
24091,24091,2016.0,Nick Young,SG,30.0,54.0,2.00,1033.0,9.1,0.48,0.64,0.20,1.4,9.0,5.1,5.4,1.1,0.5,Yes,6.9,18.5,0.1,0.0,0.1,0.00,-1.4,-3.5,-4.8,-0.7,126.0,372.0,0.34,77.0,237.0,0.32,49.0,135.0,0.36,0.44,63.0,76.0,0.83,14.0,83.0,97.0,34.0,23.0,7.0,30.0,50.0,"(250, 500]"
24092,24092,2016.0,Thaddeus Young,PF,27.0,73.0,73.00,2407.0,17.5,0.53,0.03,0.17,8.1,23.0,15.4,9.5,2.3,1.2,No,11.6,21.6,1.9,2.2,4.1,0.08,-0.5,0.6,0.1,1.3,495.0,963.0,0.51,7.0,30.0,0.23,488.0,933.0,0.52,0.52,105.0,163.0,0.64,176.0,484.0,660.0,135.0,112.0,37.0,136.0,182.0,"(1000, 1250]"
24093,24093,2016.0,Cody Zeller,C,23.0,73.0,60.00,1774.0,16.1,0.59,0.02,0.53,8.4,19.7,14.0,6.3,1.6,2.9,Yes,11.2,15.4,3.6,2.7,6.3,0.17,-1.0,2.0,1.0,1.3,231.0,437.0,0.53,1.0,10.0,0.10,230.0,427.0,0.54,0.53,175.0,232.0,0.75,138.0,317.0,455.0,71.0,57.0,63.0,68.0,204.0,"(500, 750]"
