In [6]:
import streamlit as st
import pandas
import numpy as np
from sklearn import model_selection, tree, ensemble, metrics, feature_selection
import joblib

fname = r'D:\Users\Isabelle\Downloads\data.csv~1\data.csv'
savefile = 'model_kobe.pkl'


print('=> Leitura dos dados')
df = pandas.read_csv(fname,sep=',')
target_col = 'shot_made_flag'
label_map = df[['lat','lon','minutes_remaining','period','playoffs','shot_distance','shot_made_flag','shot_type']].dropna()
df = label_map
print('DATASET KOBE')
print(df.head())
print(df['shot_type'].value_counts())
print(df['shot_type'].unique())


results = {}
print (df.columns)
for kobe_type in df['shot_type'].unique():
    print('=> Training for kobe:', kobe_type)
    print('\tSeparacao treino/teste 80/20')
    kobe = df.loc[df['shot_type'] == kobe_type].copy()
    Y = kobe[target_col]
    X = kobe.drop([target_col,'shot_type'], axis=1)
    ml_feature = list(X.columns)
    # train/test
    xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y, test_size=0.2,train_size=0.8)
    cvfold = model_selection.StratifiedKFold(n_splits = 10, random_state = 0, shuffle=True)
    print('\t\tTreino:', xtrain.shape[0])
    print('\t\tTeste :', xtest.shape[0])


    print('\tTreinamento e hiperparametros')
    param_grid = { 'ccp_alpha':[0.0], 'class_weight':[None], 'criterion':['gini'],
                       'max_depth':[None], 'max_features':[None], 'max_leaf_nodes':[None],
                      'min_impurity_decrease':[0.0], 'min_impurity_split':[None],
                      'min_samples_leaf':[1], 'min_samples_split':[2],
                      'min_weight_fraction_leaf':[0.0], 'presort':['deprecated'],
                      'random_state':[6651], 'splitter':['best']
                 }
    selector = feature_selection.RFE(tree.DecisionTreeClassifier(),
                                     n_features_to_select = 4)
    selector.fit(xtrain, ytrain)
    ml_feature = np.array(ml_feature)[selector.support_]
    
    model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(),
                                         param_grid = param_grid,
                                         scoring = 'f1',
                                         refit = True,
                                         cv = cvfold,
                                         return_train_score=True
                                        )
    model.fit(xtrain[ml_feature], ytrain)


    print('\t---------Avaliando modelo------------')
    print(xtrain[ml_feature])
    threshold = 0.5
    xtrain.loc[:, 'probabilidade'] = model.predict_proba(xtrain[ml_feature])[:,1]
    xtrain.loc[:, 'classificacao'] = (xtrain.loc[:, 'probabilidade'] > threshold).astype(int)
    xtrain.loc[:, 'categoria'] = 'treino'

    xtest.loc[:, 'probabilidade']  = model.predict_proba(xtest[ml_feature])[:,1]
    xtest.loc[:, 'classificacao'] = (xtest.loc[:, 'probabilidade'] > threshold).astype(int)
    xtest.loc[:, 'categoria'] = 'teste'

    kobe = pandas.concat((xtrain, xtest))
    kobe[target_col] = pandas.concat((ytrain, ytest))
    kobe['target_label'] =  ['Acertou' if t else 'Errou'
                            for t in kobe[target_col]]
    
    print('\t\tAcurácia treino:', metrics.accuracy_score(ytrain, xtrain['classificacao']))
    print('\t\tAcurácia teste :', metrics.accuracy_score(ytest, xtest['classificacao']))


    print('\tRetreinamento com histórico completo')
    model = model.best_estimator_
    model = model.fit(X[ml_feature], Y)
    
 
    results[kobe_type] = {
        'model': model,
        'data': kobe, 
        'features': ml_feature,
        'target_col': target_col,
        'threshold': threshold
    }

print('=> Exportacao dos resultados')

joblib.dump(results, savefile, compress=9)
print('\tModelo salvo em', savefile)

=> Leitura dos dados
DATASET KOBE
       lat       lon  minutes_remaining  period  playoffs  shot_distance  \
1  34.0443 -118.4268                 10       1         0             15   
2  33.9093 -118.3708                  7       1         0             16   
3  33.8693 -118.1318                  6       1         0             22   
4  34.0443 -118.2698                  6       2         0              0   
5  34.0553 -118.4148                  9       3         0             14   

   shot_made_flag       shot_type  
1             0.0  2PT Field Goal  
2             1.0  2PT Field Goal  
3             0.0  2PT Field Goal  
4             1.0  2PT Field Goal  
5             0.0  2PT Field Goal  
2PT Field Goal    20285
3PT Field Goal     5412
Name: shot_type, dtype: int64
['2PT Field Goal' '3PT Field Goal']
Index(['lat', 'lon', 'minutes_remaining', 'period', 'playoffs',
       'shot_distance', 'shot_made_flag', 'shot_type'],
      dtype='object')
=> Training for kobe: 2PT Field Goal


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xtrain.loc[:, 'probabilidade'] = model.predict_proba(xtrain[ml_feature])[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xtrain.loc[:, 'classificacao'] = (xtrain.loc[:, 'probabilidade'] > threshold).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xtrain.loc[:, 'categoria'] = 'treino'

	---------Avaliando modelo------------
           lat       lon  minutes_remaining  period
17594  33.8473 -118.4358                  6       3
4216   33.7913 -118.2688                  1       2
23327  33.7983 -118.3568                  5       2
9688   33.8733 -118.0678                  6       2
25370  33.8673 -118.0908                  2       2
...        ...       ...                ...     ...
19979  33.8613 -118.4388                  4       4
27377  33.7833 -118.2588                  0       4
7483   33.8353 -118.4438                  1       4
9892   33.7893 -118.3318                  7       3
9533   33.9423 -118.4948                  5       4

[4329 rows x 4 columns]
		Acurácia treino: 0.9988449988449989
		Acurácia teste : 0.5493998153277931
	Retreinamento com histórico completo
=> Exportacao dos resultados


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xtrain.loc[:, 'probabilidade'] = model.predict_proba(xtrain[ml_feature])[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xtrain.loc[:, 'classificacao'] = (xtrain.loc[:, 'probabilidade'] > threshold).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xtrain.loc[:, 'categoria'] = 'treino'

	Modelo salvo em model_kobe.pkl
