In [1]:
import sqlite3
import warnings

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Suppress every warning for the remainder of the session.
warnings.simplefilter("ignore")

# Render all columns when displaying wide DataFrames.
pd.options.display.max_columns = None

# Random seed shared by the stochastic steps in this notebook.
seed = 99

In [2]:
class ModelEval():
    '''
    Fit one classifier on a fixed train/test split and record its diagnostics.

    The instance cross-validates (or grid-searches) the model, fits it,
    scores it on both splits, appends a summary row to ``results_df`` and
    writes a per-row residual column into ``residuals_df``.

    Parameters
    ----------
    model :
        Estimator exposing ``fit``, ``predict`` and ``score``; it is passed
        to ``cross_val_score`` / ``GridSearchCV``.

    model_name :
        Label written into the results row and used as the prefix of the
        ``'<model_name>_residuals'`` column.

    results_df :
        DataFrame that receives one row per model. The row holds five
        values: model name, CV score, grid-search score, train score and
        test score, so the frame needs five columns.

    residuals_df :
        DataFrame with a ``'y_true'`` column; ``calc_residuals`` adds the
        residual column to it in place.

    stats :
        Sequence unpacked as ``(X_train, X_test, y_train, y_test)``.
        ``y_train`` and ``y_test`` must expose ``.index`` (used to align
        predictions in ``calc_residuals``).

    param_grid :
        ``None`` to skip hyper-parameter search, otherwise the grid handed
        to ``GridSearchCV``.
    '''

    def __init__(self, model, model_name, results_df, residuals_df, stats, param_grid):

        self.model = model
        self.model_name = model_name
        self.results_df = results_df
        self.residuals_df = residuals_df
        self.X_train, self.X_test, self.y_train, self.y_test = stats
        self.param_grid = param_grid
        # Only one of these is filled in by full_diag(); the other stays None.
        self.cv_score = None
        self.gs_score = None

    def calc_cv_score(self, cv=3):
        '''Store and return the mean ``cross_val_score`` of the model on the training split.'''

        cv_scores = cross_val_score(self.model, self.X_train, self.y_train, cv=cv)
        self.cv_score = cv_scores.mean()
        return self.cv_score

    def calc_grid_search(self, cv=3):
        '''
        Grid-search ``param_grid`` on the training split.

        Stores the best estimator as ``params_model`` and the best
        cross-validated score as ``gs_score``; returns ``gs_score``.
        '''

        grid_search = GridSearchCV(self.model, self.param_grid, cv=cv)
        grid_search.fit(self.X_train, self.y_train)
        self.params_model = grid_search.best_estimator_
        self.gs_score = grid_search.best_score_
        return self.gs_score

    def fit_model(self):
        '''
        Fit on the training split and return the fitted estimator.

        Uses the raw ``model`` when no ``param_grid`` was given, otherwise
        the best estimator found by ``calc_grid_search`` (which must have
        been called first).
        '''

        # NOTE: with GridSearchCV's default refit the best estimator is
        # already fitted; it is fitted again here to keep prior behaviour.
        estimator = self.model if self.param_grid is None else self.params_model
        self.fitted_model = estimator.fit(self.X_train, self.y_train)
        return self.fitted_model

    def calc_train_preds(self):
        '''Store and return the fitted model's predictions for ``X_train``.'''

        self.train_preds = self.fitted_model.predict(self.X_train)
        return self.train_preds

    def calc_test_preds(self):
        '''Store and return the fitted model's predictions for ``X_test``.'''

        self.test_preds = self.fitted_model.predict(self.X_test)
        return self.test_preds

    def calc_train_score(self):
        '''Store and return ``fitted_model.score`` on the training split.'''

        self.train_score = self.fitted_model.score(self.X_train, self.y_train)
        return self.train_score

    def calc_test_score(self):
        '''Store and return ``fitted_model.score`` on the test split.'''

        self.test_score = self.fitted_model.score(self.X_test, self.y_test)
        return self.test_score

    def create_conf_matrix(self):
        '''Build the test-split confusion matrix, keep its display in ``disp`` and plot it.'''

        conf_matrix = confusion_matrix(y_true=self.y_test, y_pred=self.test_preds)
        self.disp = ConfusionMatrixDisplay(conf_matrix)
        return self.disp.plot()

    def record_results(self):
        '''
        Append this model's scores to ``results_df`` and return that frame.

        The row is written at label ``results_df.shape[0]``, i.e. the next
        position when the frame has a default integer index.
        '''

        idx = self.results_df.shape[0]
        # Write to the instance's frame, not to a notebook-level global.
        self.results_df.loc[idx] = [
            self.model_name,
            self.cv_score,
            self.gs_score,
            self.train_score,
            self.test_score,
        ]

        return self.results_df

    def calc_residuals(self):
        '''
        Add ``'<model_name>_residuals'`` (``y_true`` minus prediction) to ``residuals_df``.

        Train and test predictions are re-indexed with the index of their
        targets and merged, so they align with ``residuals_df`` by index.
        '''

        train_preds_df = pd.DataFrame(self.train_preds, index=self.y_train.index)
        test_preds_df = pd.DataFrame(self.test_preds, index=self.y_test.index)

        self.model_preds = pd.concat([train_preds_df, test_preds_df]).sort_index()

        self.residuals_df[f'{self.model_name}_residuals'] = (
            self.residuals_df['y_true'] - self.model_preds[0]
        )

    def full_diag(self):
        '''Run the whole pipeline: CV or grid search, fit, predict, score, plot, record, residuals.'''

        if self.param_grid is None:
            self.calc_cv_score()
        else:
            self.calc_grid_search()

        self.fit_model()
        self.calc_train_preds()
        self.calc_test_preds()
        self.calc_train_score()
        self.calc_test_score()
        self.create_conf_matrix()
        self.record_results()
        self.calc_residuals()

    def plot_error(self, data=None, residual_col=None):
        '''
        Count non-zero residuals per season, split into four index-based bins.

        Despite its name this method draws nothing; it returns the counts
        so a caller can plot them.

        Parameters
        ----------
        data :
            DataFrame with a ``'season'`` column and the residual column.
            Defaults to ``residuals_df``.

        residual_col :
            Name of the residual column. Defaults to
            ``'<model_name>_residuals'`` as written by ``calc_residuals``.

        Returns
        -------
        list of list of int
            One inner list per season (in order of first appearance), each
            holding the error count for bins 0-3.
        '''

        if data is None:
            data = self.residuals_df
        if residual_col is None:
            residual_col = f'{self.model_name}_residuals'

        bin_labels = [0, 1, 2, 3]
        season_error_counts = []
        for season in data['season'].unique():
            season_rows = data[data['season'] == season]
            # Bin rows by their index value; kept as a separate Series so the
            # filtered slice is never mutated.
            row_bins = pd.Series(
                pd.cut(season_rows.index, bins=len(bin_labels), labels=bin_labels),
                index=season_rows.index,
            )
            # Any non-zero residual counts as one misclassified row.
            is_error = season_rows[residual_col].astype(bool)

            # Iterate the fixed labels so every season yields four counts,
            # including a 0 for an empty bin.
            error_counts = [int(is_error[row_bins == label].sum()) for label in bin_labels]
            season_error_counts.append(error_counts)

        return season_error_counts

In [3]:
# Read the whole `team_stats` table from the local SQLite database.
query = 'SELECT * from team_stats'
conn = sqlite3.connect('NBA-Boxscore-Database.sqlite')
try:
    team_stats = pd.read_sql(query, con=conn)
finally:
    # Release the database handle even when the query fails.
    conn.close()
team_stats

Unnamed: 0,game_id,team,MP,FG,FGA,FGp,3P,3PA,3Pp,FT,FTA,FTp,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PM,TSp,eFGp,3PAr,FTr,ORBp,DRBp,TRBp,ASTp,STLp,BLKp,TOVp,USGp,ORtg,DRtg,BPM
0,131410290001,ORL,240,36.0,93.0,0.387,9.0,19.0,0.474,6.0,10.0,0.600,13.0,26.0,39.0,17.0,10.0,6.0,17.0,26.0,87.0,,0.447,0.435,0.204,0.108,27.7,72.2,47.0,47.2,10.5,11.1,14.9,100.0,91.6,102.1,
1,131410290001,IND,240,34.0,71.0,0.479,7.0,17.0,0.412,22.0,32.0,0.688,10.0,34.0,44.0,17.0,4.0,18.0,20.0,13.0,97.0,,0.570,0.528,0.239,0.451,27.8,72.3,53.0,50.0,4.2,24.3,19.0,100.0,102.1,91.6,
2,131410290002,CHI,240,35.0,83.0,0.422,7.0,26.0,0.269,18.0,23.0,0.783,11.0,30.0,41.0,23.0,11.0,4.0,18.0,27.0,95.0,,0.510,0.464,0.313,0.277,23.9,85.7,50.6,65.7,11.3,7.7,16.2,100.0,97.9,110.2,
3,131410290002,MIA,240,37.0,72.0,0.514,11.0,20.0,0.550,22.0,29.0,0.759,5.0,35.0,40.0,26.0,10.0,7.0,18.0,21.0,107.0,,0.631,0.590,0.278,0.403,14.3,76.1,49.4,70.3,10.3,12.3,17.5,100.0,110.2,97.9,
4,131410290003,LAC,240,41.0,83.0,0.494,8.0,21.0,0.381,13.0,23.0,0.565,10.0,30.0,40.0,27.0,11.0,4.0,16.0,21.0,103.0,,0.553,0.542,0.253,0.277,22.7,62.5,43.5,65.9,11.0,6.3,14.7,100.0,102.6,115.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23953,222304091228,OKC,240,45.0,90.0,0.500,12.0,36.0,0.333,13.0,16.0,0.813,8.0,41.0,49.0,30.0,5.0,2.0,14.0,17.0,115.0,,0.593,0.567,0.400,0.178,20.0,78.8,53.3,66.7,5.0,3.6,12.6,100.0,114.9,99.9,
23954,222304091229,LAC,240,49.0,100.0,0.490,5.0,23.0,0.217,16.0,24.0,0.667,14.0,39.0,53.0,22.0,3.0,5.0,10.0,18.0,119.0,,0.538,0.515,0.230,0.240,28.6,76.5,53.0,44.9,2.9,8.1,8.3,100.0,116.4,111.5,
23955,222304091229,PHO,240,42.0,99.0,0.424,14.0,37.0,0.378,16.0,22.0,0.727,12.0,35.0,47.0,29.0,4.0,3.0,7.0,21.0,114.0,,0.524,0.495,0.374,0.222,23.5,71.4,47.0,69.0,3.9,3.9,6.1,100.0,111.5,116.4,
23956,222304091230,GSW,240,58.0,96.0,0.604,27.0,49.0,0.551,14.0,16.0,0.875,9.0,49.0,58.0,47.0,13.0,6.0,15.0,18.0,157.0,,0.762,0.745,0.510,0.167,27.3,89.1,65.9,81.0,11.9,10.0,12.7,100.0,143.9,92.6,


In [5]:
# Load the CSV (first column is the index), drop every row with a missing
# value, and renumber the remaining rows from 0.
team_stats_full_10 = (
    pd.read_csv('team_stats_full_10.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
)
team_stats_full_10

# team_stats_full_20 = pd.read_csv('team_stats_full_20.csv', index_col=0)
# team_stats_full_20 = team_stats_full_20.dropna().reset_index(drop=True)
# team_stats_full_20

# team_stats_full_30 = pd.read_csv('team_stats_full_30.csv', index_col=0)
# team_stats_full_30 = team_stats_full_30.dropna().reset_index(drop=True)
# team_stats_full_30

# csv_dfs = [team_stats_full_10, team_stats_full_20, team_stats_full_30]

Unnamed: 0,game_id,season,date,away_team,away_score,home_team,home_score,result,a_FG,a_FGA,a_FGp,a_3P,a_3PA,a_3Pp,a_FT,a_FTA,a_FTp,a_ORB,a_DRB,a_TRB,a_AST,a_STL,a_BLK,a_TOV,a_PF,a_PTS,a_TSp,a_eFGp,a_3PAr,a_FTr,a_ORBp,a_DRBp,a_TRBp,a_ASTp,a_STLp,a_BLKp,a_TOVp,h_FG,h_FGA,h_FGp,h_3P,h_3PA,h_3Pp,h_FT,h_FTA,h_FTp,h_ORB,h_DRB,h_TRB,h_AST,h_STL,h_BLK,h_TOV,h_PF,h_PTS,h_TSp,h_eFGp,h_3PAr,h_FTr,h_ORBp,h_DRBp,h_TRBp,h_ASTp,h_STLp,h_BLKp,h_TOVp
0,131410310018,1314,2013-10-31,NYK,81,CHI,82,1,36.0,71.0,0.5070,3.0,13.0,0.2310,15.0,17.0,0.8820,6.0,28.0,34.0,17.0,12.0,6.0,22.0,16.0,90.0,0.5730,0.5280,0.1830,0.2390,18.80,84.80,52.30,47.20,12.80,10.30,21.90,35.0,83.0,0.4220,7.0,26.0,0.2690,18.0,23.0,0.7830,11.0,30.0,41.0,23.0,11.0,4.0,18.0,27.0,95.0,0.5100,0.4640,0.3130,0.2770,23.90,85.70,50.60,65.70,11.30,7.70,16.20
1,131410310019,1314,2013-10-31,GSW,115,LAC,126,1,46.0,86.0,0.5350,15.0,27.0,0.5560,18.0,23.0,0.7830,7.0,41.0,48.0,34.0,8.0,9.0,15.0,22.0,125.0,0.6500,0.6220,0.3140,0.2670,18.90,82.00,55.20,73.90,7.80,12.70,13.50,41.0,83.0,0.4940,8.0,21.0,0.3810,13.0,23.0,0.5650,10.0,30.0,40.0,27.0,11.0,4.0,16.0,21.0,103.0,0.5530,0.5420,0.2530,0.2770,22.70,62.50,43.50,65.90,11.00,6.30,14.70
2,131411010020,1314,2013-11-01,CLE,84,CHA,90,1,35.0,84.0,0.4170,5.0,15.0,0.3330,23.0,34.0,0.6760,16.0,32.0,48.0,21.0,7.0,3.0,11.0,20.0,98.0,0.4950,0.4460,0.1790,0.4050,36.40,78.00,56.50,60.00,7.60,5.20,10.00,33.0,90.0,0.3670,4.0,10.0,0.4000,13.0,20.0,0.6500,12.0,25.0,37.0,14.0,8.0,6.0,7.0,21.0,83.0,0.4200,0.3890,0.1110,0.2220,23.10,64.10,40.70,42.40,8.90,11.80,6.60
3,131411010021,1314,2013-11-01,NOP,90,ORL,110,1,34.0,85.0,0.4000,5.0,11.0,0.4550,17.0,21.0,0.8100,15.0,28.0,43.0,17.0,7.0,5.0,15.0,29.0,90.0,0.4780,0.4290,0.1290,0.2470,32.60,82.40,53.80,50.00,7.70,10.60,13.70,41.5,96.5,0.4285,9.5,23.0,0.4220,8.5,14.0,0.6055,13.5,33.0,46.5,19.0,8.5,6.0,17.5,27.5,101.0,0.4900,0.4775,0.2370,0.1440,28.15,73.15,49.70,45.95,8.40,9.55,14.60
4,131411010022,1314,2013-11-01,PHI,109,WAS,102,0,43.0,80.0,0.5380,8.0,21.0,0.3810,20.0,24.0,0.8330,8.0,32.0,40.0,24.0,16.0,1.0,18.0,21.0,114.0,0.6290,0.5880,0.2630,0.3000,25.00,82.10,56.30,55.80,16.10,2.20,16.60,34.0,78.0,0.4360,10.0,25.0,0.4000,24.0,32.0,0.7500,13.0,25.0,38.0,23.0,5.0,3.0,16.0,30.0,102.0,0.5540,0.5000,0.3210,0.4100,28.90,67.60,46.30,67.60,5.30,5.60,14.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11956,222304091226,2223,2023-04-09,UTA,117,LAL,128,1,43.1,91.3,0.4738,10.0,33.0,0.3037,18.6,24.7,0.7599,11.0,37.4,48.4,26.5,4.6,4.5,14.9,19.5,114.8,0.5634,0.5287,0.3627,0.2721,24.17,78.07,52.34,61.66,4.43,8.47,12.79,43.5,86.0,0.5065,11.3,29.0,0.3917,22.6,29.8,0.7542,10.8,36.2,47.0,26.1,6.3,5.1,13.3,16.1,120.9,0.6107,0.5724,0.3378,0.3551,25.69,78.39,53.25,60.16,6.31,8.55,11.87
11957,222304091227,2223,2023-04-09,NOP,108,MIN,113,1,43.5,86.8,0.5018,12.4,27.8,0.4370,18.7,22.1,0.8614,9.0,36.0,45.0,27.9,7.2,4.0,13.3,18.0,118.1,0.6124,0.5727,0.3194,0.2584,23.13,80.34,54.14,63.78,7.29,7.78,12.06,42.9,87.6,0.4892,12.4,30.4,0.4013,18.2,24.5,0.7421,10.0,33.6,43.6,28.7,6.9,4.1,13.3,20.8,116.4,0.5902,0.5595,0.3459,0.2829,23.20,76.90,50.52,66.79,6.89,6.75,11.89
11958,222304091228,2223,2023-04-09,MEM,100,OKC,115,1,46.8,91.6,0.5108,15.1,37.9,0.3918,16.4,20.8,0.7866,9.8,34.6,44.4,29.9,8.3,5.5,12.8,19.2,125.1,0.6218,0.5937,0.4166,0.2273,22.85,77.40,50.85,64.00,8.11,10.24,11.31,41.1,92.1,0.4461,11.1,34.5,0.3201,21.7,26.2,0.8401,12.1,30.4,42.5,22.5,7.8,2.6,10.6,20.4,115.0,0.5543,0.5062,0.3739,0.2859,25.87,72.62,47.92,54.95,7.81,4.92,9.29
11959,222304091229,2223,2023-04-09,LAC,119,PHO,114,0,43.4,84.8,0.5145,14.3,34.1,0.4139,17.7,23.0,0.7577,9.0,32.3,41.3,28.3,8.2,4.5,13.8,17.8,118.8,0.6282,0.6003,0.4051,0.2725,21.85,73.97,49.42,65.28,8.43,7.54,12.77,43.0,91.0,0.4757,11.2,31.4,0.3549,18.4,23.1,0.8009,12.3,32.7,45.0,27.8,6.1,7.3,11.1,20.0,115.6,0.5747,0.5385,0.3478,0.2579,26.86,77.08,51.34,64.58,6.27,13.56,9.91


In [None]:
# Box-score features shared by the away (`a_`) and home (`h_`) column sets.
feature_list = list(team_stats.drop(['game_id', 'team', 'MP', 'PM', 'USGp', 'ORtg', 'DRtg', 'BPM'], axis=1).columns)

# Only the 10-game frame is loaded in this notebook; the 20/30 loads are
# commented out, so they are not listed here.
csv_dfs = [team_stats_full_10]

# One aggregated frame per input frame, in the same order as `csv_dfs`.
team_agg_dfs = []
for df in csv_dfs:
    # Build a fresh frame per input so the iterations never share state.
    feature_diffs = pd.DataFrame({
        feature: df[f'a_{feature}'] - df[f'h_{feature}']
        for feature in feature_list
    })

    # Game-identifying columns ('game_id' through 'result') go first.
    gi = df.loc[:, 'game_id':'result']
    team_agg_df = pd.concat([gi, feature_diffs], axis=1)
    team_agg_dfs.append(team_agg_df)

In [6]:
# Feature names: every `team_stats` column except identifiers and the
# columns excluded below.
feature_list = team_stats.drop(
    ['game_id', 'team', 'MP', 'PM', 'USGp', 'ORtg', 'DRtg', 'BPM'],
    axis=1,
).columns.tolist()

# Away-minus-home difference for each feature, one column per feature.
team_10_agg = pd.DataFrame({
    feature: team_stats_full_10[f'a_{feature}'] - team_stats_full_10[f'h_{feature}']
    for feature in feature_list
})
team_10_agg

Unnamed: 0,FG,FGA,FGp,3P,3PA,3Pp,FT,FTA,FTp,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,TSp,eFGp,3PAr,FTr,ORBp,DRBp,TRBp,ASTp,STLp,BLKp,TOVp
0,1.0,-12.0,0.0850,-4.0,-13.0,-0.0380,-3.0,-6.0,0.0990,-5.0,-2.0,-7.0,-6.0,1.0,2.0,4.0,-11.0,-5.0,0.0630,0.0640,-0.1300,-0.0380,-5.10,-0.90,1.70,-18.50,1.50,2.60,5.70
1,5.0,3.0,0.0410,7.0,6.0,0.1750,5.0,0.0,0.2180,-3.0,11.0,8.0,7.0,-3.0,5.0,-1.0,1.0,22.0,0.0970,0.0800,0.0610,-0.0100,-3.80,19.50,11.70,8.00,-3.20,6.40,-1.20
2,2.0,-6.0,0.0500,1.0,5.0,-0.0670,10.0,14.0,0.0260,4.0,7.0,11.0,7.0,-1.0,-3.0,4.0,-1.0,15.0,0.0750,0.0570,0.0680,0.1830,13.30,13.90,15.80,17.60,-1.30,-6.60,3.40
3,-7.5,-11.5,-0.0285,-4.5,-12.0,0.0330,8.5,7.0,0.2045,1.5,-5.0,-3.5,-2.0,-1.5,-1.0,-2.5,1.5,-11.0,-0.0120,-0.0485,-0.1080,0.1030,4.45,9.25,4.10,4.05,-0.70,1.05,-0.90
4,9.0,2.0,0.1020,-2.0,-4.0,-0.0190,-4.0,-8.0,0.0830,-5.0,7.0,2.0,1.0,11.0,-2.0,2.0,-9.0,12.0,0.0750,0.0880,-0.0580,-0.1100,-3.90,14.50,10.00,-11.80,10.80,-3.40,1.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11956,-0.4,5.3,-0.0327,-1.3,4.0,-0.0880,-4.0,-5.1,0.0057,0.2,1.2,1.4,0.4,-1.7,-0.6,1.6,3.4,-6.1,-0.0473,-0.0437,0.0249,-0.0830,-1.52,-0.32,-0.91,1.50,-1.88,-0.08,0.92
11957,0.6,-0.8,0.0126,0.0,-2.6,0.0357,0.5,-2.4,0.1193,-1.0,2.4,1.4,-0.8,0.3,-0.1,0.0,-2.8,1.7,0.0222,0.0132,-0.0265,-0.0245,-0.07,3.44,3.62,-3.01,0.40,1.03,0.17
11958,5.7,-0.5,0.0647,4.0,3.4,0.0717,-5.3,-5.4,-0.0535,-2.3,4.2,1.9,7.4,0.5,2.9,2.2,-1.2,10.1,0.0675,0.0875,0.0427,-0.0586,-3.02,4.78,2.93,9.05,0.30,5.32,2.02
11959,0.4,-6.2,0.0388,3.1,2.7,0.0590,-0.7,-0.1,-0.0432,-3.3,-0.4,-3.7,0.5,2.1,-2.8,2.7,-2.2,3.2,0.0535,0.0618,0.0573,0.0146,-5.01,-3.11,-1.92,0.70,2.16,-6.02,2.86


In [None]:
# Put the game-identifying columns ('game_id' through 'result') in front of
# the feature differences, aligned on the row index.
gi = team_stats_full_10.loc[:, 'game_id':'result']
team_full_10_agg = pd.concat(objs=[gi, team_10_agg], axis='columns')
team_full_10_agg

In [None]:
# Features are the away-minus-home differences; the target is 'result'.
X = team_10_agg
y = team_full_10_agg.loc[:, 'result']

# 80% of the rows go to training; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=seed,
)

In [None]:
# Scale -> PCA -> logistic regression, bundled so each step is fitted only
# on the data passed to `pipeline.fit`.
scaler = StandardScaler()
# Seeded so the projection is reproducible if a randomized solver is chosen.
pca = PCA(n_components=10, random_state=seed)
logreg = LogisticRegression()
pipeline = Pipeline([
    ('scaler', scaler),  # Scaling is done within the pipeline
    ('pca', pca),
    ('logreg', logreg)
])

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Project the scaled features onto 10 principal components. The PCA is
# fitted on the training split only and seeded so the result is reproducible
# if a randomized solver is chosen.
pca = PCA(n_components=10, random_state=seed)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Share of variance captured by each component, then the total retained.
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)
sum(explained_variance)

In [None]:
# Fit a default logistic regression on the PCA-projected training data;
# `fit` returns the estimator itself, which is then displayed.
model = LogisticRegression().fit(X_train_pca, y_train)
model

In [None]:
# Class predictions for the PCA-projected test split.
y_pred = model.predict(X=X_test_pca)

In [None]:
# Test-split metrics for the logistic regression predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

# Print each metric after its heading, in the same order as computed.
for label, value in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", classification_rep),
):
    print(label, value)

In [None]:

# Step 3: Standardize the data
# NOTE(review): the template this cell came from read an undefined `data`
# frame and a placeholder 'target_column_name'. It now uses the feature
# frame built earlier in the notebook — confirm this is the intended input.
X = team_10_agg
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Apply PCA
n_components = 2
# Seeded so the projection is reproducible if a randomized solver is chosen.
pca = PCA(n_components=n_components, random_state=seed)
X_pca = pca.fit_transform(X_scaled)

# Step 5: Explained Variance
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)

# Step 6: Interpret the Results
# Use X_pca for further analysis, visualization, or modeling