In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector,make_column_transformer
from sklearn.pipeline import make_pipeline,Pipeline 
from sklearn.impute import SimpleImputer
from df_after_transform import df_after_transform
from sklearn.model_selection import KFold, cross_validate, GridSearchCV, cross_val_score,train_test_split
from sklearn.linear_model import Lasso,Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_classif,f_regression
from sklearn.metrics import r2_score,make_scorer
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

## Inital, simple looking at teams, no matchup

In [44]:
# create list of dates:
# Define the start and end dates
start_date = '2023-11-01'
#testing with smaller set
#end_date = '2023-12-01'
end_date = '2024-03-31'

# Generate the range of dates
date_range = pd.date_range(start=start_date, end=end_date)

# Convert to list
date_list = list(date_range)

# format dates:
formatted_date_list = [date.strftime('%Y%m%d') for date in date_range]

# Print the formatted list of dates
print(formatted_date_list)
# Print the list of dates

['20231101', '20231102', '20231103', '20231104', '20231105', '20231106', '20231107', '20231108', '20231109', '20231110', '20231111', '20231112', '20231113', '20231114', '20231115', '20231116', '20231117', '20231118', '20231119', '20231120', '20231121', '20231122', '20231123', '20231124', '20231125', '20231126', '20231127', '20231128', '20231129', '20231130', '20231201', '20231202', '20231203', '20231204', '20231205', '20231206', '20231207', '20231208', '20231209', '20231210', '20231211', '20231212', '20231213', '20231214', '20231215', '20231216', '20231217', '20231218', '20231219', '20231220', '20231221', '20231222', '20231223', '20231224', '20231225', '20231226', '20231227', '20231228', '20231229', '20231230', '20231231', '20240101', '20240102', '20240103', '20240104', '20240105', '20240106', '20240107', '20240108', '20240109', '20240110', '20240111', '20240112', '20240113', '20240114', '20240115', '20240116', '20240117', '20240118', '20240119', '20240120', '20240121', '20240122', '20

In [21]:
## initally look at teams and find what makes them score

# y variables are the amount of points put up
# X varaibles is box score data before the game

y_df = pd.read_csv("full_scedule.csv")

y_df['f_Date'] = y_df['f_Date'].astype(str)
y_df.dtypes

Date                object
Start (ET)          object
Visitor/Neutral     object
PTS                  int64
Home/Neutral        object
PTS.1                int64
Unnamed: 6          object
Unnamed: 7          object
Attend.            float64
Arena               object
Notes               object
f_Date              object
Home_s              object
Visitor_s           object
Home_abbr           object
Away_abbr           object
dtype: object

In [74]:
#intialize df
new_df = pd.DataFrame(columns = ['Date','Team','Score','FG','FGA',
                                 'FG%','3P','3P%','FT','FTA','FT%','ORB','DRB','TRB',
                                'AST','STL','BLK','TOV','PF','PTS','TS%','eFG%'])

In [75]:
## y variables: score

for date in formatted_date_list:
    filtered_df = y_df[y_df['f_Date'] == date]
    indices = filtered_df.index.tolist()
    for index in indices:
        row = y_df.loc[index]
        home = {'Date':row['f_Date'],'Team' : row['Home_abbr'],'Score':row['PTS.1']}
        away = {'Date':row['f_Date'],'Team' : row['Away_abbr'],'Score':row['PTS']}
        new_df = pd.concat([new_df, pd.DataFrame([home]), pd.DataFrame([away])], ignore_index=True)




In [76]:
## x variable: box score from day before.
main_folder = 'NBA_Team_Statsheet'
date_prev = '20231031'
for date in formatted_date_list:
    box_score_df = pd.read_csv(f"{main_folder}/{date_prev}/statsheet.csv")
    box_score_df.set_index('Unnamed: 0', inplace=True)
    cols_to_add = ['FG','FGA','FG%','3P','3P%','FT','FTA','FT%','ORB','DRB','TRB',
                   'AST','STL','BLK','TOV','PF','PTS','TS%','eFG%']
    filtered_df = new_df[new_df['Date'] == date]
    for index, row in filtered_df.iterrows():
        team_name = row['Team']

        for col in cols_to_add:
            new_df.at[index, col] = box_score_df.at[team_name, col]
            
    date_prev = date



    

In [78]:
# make right things nums
exclude_columns = ['Date','Team']
for column in new_df.columns:
    if column not in exclude_columns:
        new_df[column] = pd.to_numeric(new_df[column], errors='coerce')

In [80]:


# split train and test data
split_ratio = 0.8  # 80% train, 20% test
split_index = int(len(new_df) * split_ratio)

# make all objects nums
new_df = new_df.apply(pd.to_numeric, errors='coerce')
train_df = new_df[:split_index]
test_df = new_df[split_index:]

In [79]:
new_df.dtypes

Date      object
Team      object
Score      int64
FG       float64
FGA      float64
FG%      float64
3P       float64
3P%      float64
FT       float64
FTA      float64
FT%      float64
ORB      float64
DRB      float64
TRB      float64
AST      float64
STL      float64
BLK      float64
TOV      float64
PF       float64
PTS      float64
TS%      float64
eFG%     float64
dtype: object

In [81]:
#Initialize X,y train,test
y_train = train_df['Score']
X_train = train_df.drop('Score',axis = 1)

y_test = test_df['Score']
X_test = test_df.drop('Score',axis = 1)

In [66]:
#preprocessing
numer_pipe = make_pipeline(SimpleImputer(strategy = 'mean'),
                        StandardScaler())
num_pipe_features = X_train.select_dtypes(include=['number']).columns
preproc_pipe = make_column_transformer(
                (numer_pipe, num_pipe_features),
                remainder = 'drop',)

In [83]:
processed_features = preproc_pipe.get_feature_names_out()
print("Features processed by the pipeline:", processed_features)

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## xgboost regression

In [87]:
# create pipeline
xg_pipe = Pipeline([('preproc', preproc_pipe),
                    ('feature_select', SelectKBest(f_regression, k =10)),
                    ('regressor', xgb.XGBRegressor(objective = 'reg:squarederror'))
                ])
xg_pipe

In [93]:
xg_pipe.fit(X_train,y_train)
if 'preproc' in xg_pipe.named_steps:
    feature_names = xg_pipe.named_steps['preproc'].get_feature_names_out()
#print(feature_names)
#xg_pipe.get_feature_names_out()

In [94]:
parameters = {
    'regressor__n_estimators': [200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5],
    'regressor__colsample_bytree': [0.6,0.7, 0.8],
    'regressor__subsample': [0.7, 0.8, 1.0],
    'feature_select__k' : [6,8,10,12]

    
}
grid_search = GridSearchCV(estimator = xg_pipe, param_grid=parameters, cv=5, scoring='r2')
results = grid_search.fit(X_train, y_train)


KeyboardInterrupt



In [None]:
results_df = pd.DataFrame(results.cv_results_)
sorted_df = results_df.sort_values(by='rank_test_score')
display(sorted_df.head(10))

In [72]:
y_pred = optimal_model.predict(X_test)
results_df = pd.DataFrame(


-0.03008508384951991
