In [1]:
# Dependências

import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Parâmetros

TEST_SIZE = 0.2
TRAIN_SIZE = 0.8
SEED = 42

In [98]:
# Import de dados

uri = r'C:\Users\natha\Pictures\Infnet\kobe-classifier\data\01_raw\kobe_datase.csv'
data = pd.read_csv(uri)

In [99]:
print(f'Quantidade de linhas com dados faltantes: {data.shot_made_flag.isna().sum()}')
data.dropna(inplace=True)
print(f'Quantidade de linhas com dados faltantes: {data.shot_made_flag.isna().sum()}')

Quantidade de linhas com dados faltantes: 5000
Quantidade de linhas com dados faltantes: 0


In [100]:
data.columns

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat',
       'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs',
       'season', 'seconds_remaining', 'shot_distance', 'shot_made_flag',
       'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range',
       'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id'],
      dtype='object')

coluna lng no trabalho, no dataset -> lon

In [101]:
data.groupby('shot_type').count()['game_id']

shot_type
2PT Field Goal    20285
3PT Field Goal     5412
Name: game_id, dtype: int64

Temos 20.285 inputs do shot_type = '2PT Field Goal'

In [102]:
data.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


In [103]:
data['shot_type'].unique()

array(['2PT Field Goal', '3PT Field Goal'], dtype=object)

In [8]:
def prepared_data(data):
    columns = ['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance', 'shot_made_flag']

    data.dropna(inplace=True)
    data = data[data['shot_type'] == '2PT Field Goal']
    data = data[columns]

    return data

In [9]:
p_data = prepared_data(data)

# Treino e teste

In [10]:
def split_data(data):
    X = data.drop('shot_made_flag', axis = 1)
    y = data['shot_made_flag'].copy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=TEST_SIZE, 
                                                        random_state=SEED,
                                                        shuffle=True,
                                                        stratify=y)
    
    return X_train, X_test, y_train, y_test

In [11]:
X_train, X_test, y_train, y_test = split_data(p_data)

# PyCaret

In [12]:
from pycaret.classification import *
from pycaret.classification import ClassificationExperiment
from sklearn.metrics import log_loss, f1_score

In [13]:
train_data = X_train.copy()
train_data['shot_made_flag'] = y_train.copy()

In [14]:
def train_knn_pycaret(X_test, y_test, train_data, SEED):
    exp = ClassificationExperiment()
    exp.setup(train_data, 
          target = 'shot_made_flag', 
          session_id = SEED,  
          n_jobs=-2, 
          log_experiment='mlflow', 
          experiment_name='kobe_classifier')
    
    exp.add_metric('logloss', 
               'Log Loss', 
               log_loss, 
               greater_is_better = False)
    
    exp.add_metric('f1_score', 
               'F1 Score', 
               f1_score, 
               greater_is_better = True)
    
    knn_model = exp.create_model('knn', 
                            verbose=True)
   

    return knn_model

In [15]:
modelo_teste =  train_knn_pycaret(X_test, y_test, train_data, SEED)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss,F1 Score
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.5563,0.5705,0.5517,0.5339,0.5426,0.1121,0.1122,7.073,0.5426
1,0.5361,0.549,0.5037,0.5141,0.5089,0.0694,0.0694,7.3958,0.5089
2,0.544,0.5493,0.4779,0.5243,0.5,0.0826,0.0829,7.2695,0.5
3,0.5246,0.5165,0.5037,0.5018,0.5028,0.0475,0.0475,7.5782,0.5028
4,0.5484,0.5593,0.5295,0.5266,0.5281,0.0952,0.0952,7.1993,0.5281
5,0.5475,0.5527,0.5,0.5272,0.5133,0.0911,0.0912,7.2134,0.5133
6,0.5308,0.542,0.4594,0.5092,0.483,0.0556,0.0559,7.48,0.483
7,0.5581,0.5592,0.5046,0.5404,0.5219,0.112,0.1122,7.045,0.5219
8,0.5528,0.5614,0.4954,0.5348,0.5143,0.1011,0.1014,7.1292,0.5143
9,0.5233,0.5216,0.4982,0.5009,0.4995,0.0445,0.0445,7.599,0.4995


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
modelo_teste

In [25]:
X_test

Unnamed: 0,lat,lon,minutes_remaining,period,playoffs,shot_distance
30558,33.8883,-118.3828,11,3,1,19
520,34.0213,-118.1598,2,3,0,11
25613,33.9833,-118.1208,10,2,0,16
15368,34.0443,-118.2698,0,4,0,0
9175,34.0443,-118.2698,0,1,0,0
...,...,...,...,...,...,...
323,34.0313,-118.2988,11,2,0,3
1043,34.0423,-118.1848,6,1,0,8
23222,34.0443,-118.2698,3,4,0,0
14718,33.9343,-118.3778,5,1,0,15


In [17]:
# real
y_test

30558    0.0
520      0.0
25613    0.0
15368    1.0
9175     0.0
        ... 
323      0.0
1043     0.0
23222    1.0
14718    1.0
24021    0.0
Name: shot_made_flag, Length: 4057, dtype: float64

In [18]:
def register_log_loss(model, X_test, y_test):
    y_test_predict = model.predict(X_test)
    metric_log_loss = log_loss(y_test, y_test_predict)

    return {
        'lr_log_loss': {'value': metric_log_loss, 'step': 1}
    }

In [19]:
register_log_loss(modelo_teste, X_test, y_test)

{'lr_log_loss': {'value': 7.367999118008955, 'step': 1}}

In [20]:
def train_knn_pycaret(train_data, SEED):
    exp = ClassificationExperiment()
    exp.setup(train_data, 
          target = 'shot_made_flag', 
          session_id = SEED,  
          n_jobs=-2, 
          log_experiment='mlflow', 
          experiment_name='kobe_classifier')
    
    exp.add_metric('logloss', 
               'Log Loss', 
               log_loss, 
               greater_is_better = False)
    
    model_knn = exp.create_model('knn', 
                            verbose=True)
    
    return model_knn

In [21]:
def register_f1_score(model, X_test, y_test):
    y_test_predict = model.predict(X_test)
    metric_f1_score = f1_score(y_test, y_test_predict)

    return metric_f1_score

# Servindo o modelo

In [108]:
import streamlit as st
import requests
import pandas as pd
from json import loads

uri = 'http://localhost:5001/invocations'

In [120]:
pq = pd.read_parquet(r'C:\Users\natha\Pictures\Infnet\kobe-classifier\data\04_feature\X_test.parquet')

In [121]:
pq.head()

Unnamed: 0,lat,lon,minutes_remaining,period,playoffs,shot_distance
30558,33.8883,-118.3828,11,3,1,19
520,34.0213,-118.1598,2,3,0,11
25613,33.9833,-118.1208,10,2,0,16
15368,34.0443,-118.2698,0,4,0,0
9175,34.0443,-118.2698,0,1,0,0


In [160]:
# Import de dados

url = r'C:\Users\natha\Pictures\Infnet\kobe-classifier\data\01_raw\kobe_datase.csv'
data = pd.read_csv(url)

In [161]:
columns = ['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance']
data.dropna(inplace=True)
data = data[data['shot_type'] == '3PT Field Goal']
y_real_3PT = list(data['shot_made_flag'])
data = data[columns]

In [167]:
len(y_real_3PT)

5412

In [168]:
len(predict_3PT)

1

In [125]:
dados_teste = data.to_json(orient='records')
parsed = loads(dados_teste)

In [170]:
# for i in parsed:
teste = {"dataframe_records": parsed}
response = requests.post(uri, json=teste)
# print(response)
results = response.json()

predict_3PT = results['predictions']
# predict_3PT.append(pred)

In [171]:
predict_3PT

[0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0

In [172]:
log_loss(y_real_3PT, predict_3PT)

14.944929454024185

In [173]:
f1_score(y_real_3PT, predict_3PT)

0.3137614678899083

In [174]:
X_test = pd.read_parquet(r'C:\Users\natha\Pictures\Infnet\kobe-classifier\data\04_feature\X_test.parquet')

In [175]:
X_test

Unnamed: 0,lat,lon,minutes_remaining,period,playoffs,shot_distance
30558,33.8883,-118.3828,11,3,1,19
520,34.0213,-118.1598,2,3,0,11
25613,33.9833,-118.1208,10,2,0,16
15368,34.0443,-118.2698,0,4,0,0
9175,34.0443,-118.2698,0,1,0,0
...,...,...,...,...,...,...
323,34.0313,-118.2988,11,2,0,3
1043,34.0423,-118.1848,6,1,0,8
23222,34.0443,-118.2698,3,4,0,0
14718,33.9343,-118.3778,5,1,0,15


In [178]:
y_test = pd.read_parquet(r'C:\Users\natha\Pictures\Infnet\kobe-classifier\data\04_feature\y_test.parquet')
X_test = pd.read_parquet(r'C:\Users\natha\Pictures\Infnet\kobe-classifier\data\04_feature\X_test.parquet')



In [180]:
def predict(test_df):
    test_df = data.to_json(orient='records')
    parsed = loads(test_df)
    evaluation = {"dataframe_records": parsed}
    response = requests.post(uri, json=evaluation)
    results = response.json()

    predict = results['predictions']

    return predict

In [182]:
teste1 = predict(X_test)

In [184]:
len(X_test)

4057