# Data Preprocessing

## Importing the libraries

In [29]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import model.preprocess_games as ppg
pd.options.display.max_columns = None
pd.set_option("display.max_colwidth", -1)
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"
pd.set_option("display.max_rows", 101)


  


## Importing the dataset

In [30]:
DATA_PATH = 'data'

SEASONS_PROCESSED_DS = f"{DATA_PATH}/seasons.processed.feather"

TEAMS_DS = f"{DATA_PATH}/teams.processed.feather"
TEAMS_PROCESSED_DS = f"{DATA_PATH}/teams.processed.feather"

GAMES_DS = f"{DATA_PATH}/games.csv"
GAMES_PROCESSED_DS = f"{DATA_PATH}/games.processed.feather"

In [31]:
games = pd.read_csv(GAMES_DS, parse_dates=["GAME_DATE_EST"], usecols=["GAME_ID",'GAME_DATE_EST', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID',
       'SEASON', 'PTS_home', 'FG_PCT_home', 'FT_PCT_home',
       'FG3_PCT_home', 'AST_home', 'REB_home', 'PTS_away',
       'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away',
       'HOME_TEAM_WINS']
                    , infer_datetime_format=True, index_col="GAME_ID")
games = games.sort_values(by=['GAME_DATE_EST', 'GAME_ID'])
teams = pd.read_feather(TEAMS_PROCESSED_DS)
seasons = pd.read_feather(SEASONS_PROCESSED_DS)
games_matchup = pd.read_feather(GAMES_PROCESSED_DS)
games_matchup = games_matchup.set_index(["GAME_ID"])
games_matchup = games_matchup.sort_values(by=['GAME_DATE_EST', 'GAME_ID'])
games_matchup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3689 entries, 21600001 to 21801229
Data columns (total 70 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   GAME_DATE_EST          3689 non-null   datetime64[ns]
 1   HOME_TEAM_NAME         3689 non-null   object        
 2   HOME_TEAM_ID           3689 non-null   int64         
 3   VISITOR_TEAM_NAME      3689 non-null   object        
 4   VISITOR_TEAM_ID        3689 non-null   int64         
 5   GAME_STATUS_TEXT       3689 non-null   object        
 6   SEASON                 3689 non-null   int64         
 7   HT_RANK                0 non-null      object        
 8   HT_CLASS               0 non-null      object        
 9   HT_HW                  3689 non-null   int64         
 10  HT_HL                  3689 non-null   int64         
 11  HT_VW                  3689 non-null   int64         
 12  HT_VL                  3689 non-null   int64       

In [32]:
games_matchup[["GAME_DATE_EST", "HOME_TEAM_NAME", "VISITOR_TEAM_NAME", "HOME_TEAM_WINS"]]
print(len(games_matchup["HOME_TEAM_NAME"].unique()))
print(len(games_matchup["VISITOR_TEAM_NAME"].unique()))

30
30


In [33]:
x_columns = [
"HOME_TEAM_NAME",
"VISITOR_TEAM_NAME",
"SEASON",
"HT_HW",
"HT_HL",
"HT_VW",
"HT_VL",
"HT_LAST10_W",
"HT_LAST10_L",
"HT_LAST10_MATCHUP_W",
"HT_LAST10_MATCHUP_L",
"HT_OVERALL_OFF_POINTS",
"HT_OVERALL_DEF_POINTS",
"HT_OVERALL_OFF_FG",
"HT_OVERALL_DEF_FG",
"HT_OVERALL_OFF_3P",
"HT_OVERALL_DEF_3P",
"HT_OVERALL_OFF_FT",
"HT_OVERALL_DEF_FT",
"HT_OVERALL_OFF_REB",
"HT_OVERALL_DEF_REB",
"HT_AWAY_POINTS",
"HT_AWAY_FG",
"HT_AWAY_3P",
"HT_AWAY_FT",
"HT_AWAY_REB",
"VT_HW",
"VT_HL",
"VT_VW",
"VT_VL",
"VT_LAST10_W",
"VT_LAST10_L",
"VT_LAST10_MATCHUP_W",
"VT_LAST10_MATCHUP_L",
"VT_OVERALL_OFF_POINTS",
"VT_OVERALL_DEF_POINTS",
"VT_OVERALL_OFF_FG",
"VT_OVERALL_DEF_FG",
"VT_OVERALL_OFF_3P",
"VT_OVERALL_DEF_3P",
"VT_OVERALL_OFF_FT",
"VT_OVERALL_DEF_FT",
"VT_OVERALL_OFF_REB",
"VT_OVERALL_DEF_REB",
"VT_AWAY_POINTS",
"VT_AWAY_FG",
"VT_AWAY_3P",
"VT_AWAY_FT",
"VT_AWAY_REB"
]

In [50]:
gm_df = games_matchup.loc[:, x_columns]
gm_df = pd.get_dummies(gm_df, dtype=float)
X = gm_df.values
len(X)
#X

3689

In [35]:
y_columns = [
"PTS_home",
"FG_PCT_home",
"FT_PCT_home",
"FG3_PCT_home",
"AST_home",
"REB_home",
"PTS_away",
"FG_PCT_away",
"FT_PCT_away",
"FG3_PCT_away",
"AST_away",
"REB_away",
"HOME_TEAM_WINS"
]

In [49]:
y = games_matchup.loc[:, ["HOME_TEAM_WINS"]].values
len(y)

3689

## Taking care of missing data

In [37]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [38]:
games_matchup.isna().sum()

GAME_DATE_EST            0   
HOME_TEAM_NAME           0   
HOME_TEAM_ID             0   
VISITOR_TEAM_NAME        0   
VISITOR_TEAM_ID          0   
GAME_STATUS_TEXT         0   
SEASON                   0   
HT_RANK                  3689
HT_CLASS                 3689
HT_HW                    0   
HT_HL                    0   
HT_VW                    0   
HT_VL                    0   
HT_LAST10_W              0   
HT_LAST10_L              0   
HT_LAST10_MATCHUP_W      0   
HT_LAST10_MATCHUP_L      0   
HT_OVERALL_OFF_POINTS    0   
HT_OVERALL_DEF_POINTS    0   
HT_OVERALL_OFF_FG        0   
HT_OVERALL_DEF_FG        0   
HT_OVERALL_OFF_3P        0   
HT_OVERALL_DEF_3P        0   
HT_OVERALL_OFF_FT        0   
HT_OVERALL_DEF_FT        0   
HT_OVERALL_OFF_REB       0   
HT_OVERALL_DEF_REB       0   
HT_AWAY_POINTS           0   
HT_AWAY_FG               0   
HT_AWAY_3P               0   
HT_AWAY_FT               0   
HT_AWAY_REB              0   
VT_RANK                  3689
VT_CLASS  

In [39]:
games_matchup = games_matchup.drop(columns=["HT_RANK", "HT_CLASS", "VT_RANK", "VT_CLASS"])


In [40]:
games_matchup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3689 entries, 21600001 to 21801229
Data columns (total 66 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   GAME_DATE_EST          3689 non-null   datetime64[ns]
 1   HOME_TEAM_NAME         3689 non-null   object        
 2   HOME_TEAM_ID           3689 non-null   int64         
 3   VISITOR_TEAM_NAME      3689 non-null   object        
 4   VISITOR_TEAM_ID        3689 non-null   int64         
 5   GAME_STATUS_TEXT       3689 non-null   object        
 6   SEASON                 3689 non-null   int64         
 7   HT_HW                  3689 non-null   int64         
 8   HT_HL                  3689 non-null   int64         
 9   HT_VW                  3689 non-null   int64         
 10  HT_VL                  3689 non-null   int64         
 11  HT_LAST10_W            3689 non-null   int64         
 12  HT_LAST10_L            3689 non-null   int64       

## Encoding categorical data

In [41]:
np.size(X,1)

107

In [42]:
#from sklearn.compose import ColumnTransformer
#from sklearn.preprocessing import OneHotEncoder
#ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,1])], remainder='passthrough')
#X = np.array(ct.fit_transform(X))

In [43]:
np.size(X,1)

107

## Splitting the dataset into the Training set and Test set

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1)

## Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs" ,random_state=42)
score = cross_val_score(log_clf, X=X_train, y=y_train, cv=3, verbose=3)
score.mean()

In [None]:
from sklearn.metrics import precision_score, recall_score

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
log_clf.fit(X_train, y_train)

y_pred = log_clf.predict(X_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

In [51]:
from sklearn.metrics import precision_score, recall_score

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
log_clf.fit(X_train, y_train)

y_pred = log_clf.predict(X_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

[CV]  ................................................................
[CV] .................................... , score=0.658, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.651, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.606, total=   0.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
  return f(**kwargs)
STOP: TOTAL NO. of I

0.6384167707394266

In [57]:
from sklearn.metrics import precision_score, recall_score

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
log_clf.fit(X_train, y_train)

y_pred = log_clf.predict(X_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 67.24%
Recall: 79.86%


  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
