In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import seaborn as sns
import feature_selection as fs
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
import prep_utils as pu
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pointbiserialr
from sklearn.feature_selection import RFE

PAST_YEARS = 1
TEST_YEAR = 10

In [3]:
db = sqlite3.connect("db/ac.db")
db_cur = db.cursor()

[df_awards, df_coaches, df_players_teams, df_players, df_series_post, df_teams_post, df_teams] = pu.db_to_pandas(db)

In [4]:
# Transform all possible attributes into percentages. (Made / Attempted) & (Offensive & Defensive Rebound %)
df_new_teams = pu.prepare_teams(df_teams,df_teams_post,PAST_YEARS)
df_new_teams = fs.fs_teams(df_new_teams)
df_team_results = df_new_teams[["year","tmID","confID","playoff","rank","team_playoffs_count"]]

Dropping divID in [1mTeams[0m...


In [5]:
df_new_coaches = pu.prepare_coaches(df_coaches, df_awards,PAST_YEARS)
df_new_coaches = pu.group_coaches(df_new_coaches)
df_new_coaches.drop("coachID", axis = 1, inplace = True)

df_final_coaches = df_new_coaches.copy()
df_final_coaches.columns = df_final_coaches.columns.str.lower()

Dropping Attribute lgID in [1mCoaches[0m...
Creating attribute coach previous regular season win ratio...
Creating attribute coach playoffs win ratio...
Creating attribute coach playoffs count...
Creating attribute coach awards count...
Dropping attribute post_wins..
Dropping attribute post_losses..
Dropping attribute won..
Dropping attribute lost..

[1mCoaches Null Verification:[0m
year                    0
tmID                    0
coachID                 0
coach_reg_wr            0
coach_po_wr             0
coach_playoffs_count    0
coach_awards            0
dtype: int64


In [6]:
df_new_players_teams = pu.prepare_player_teams(df_players_teams,df_awards,PAST_YEARS)
df_players = df_new_players_teams.copy()
df_players = fs.fs_players(df_players,0.2)
df_players = df_players[df_players['year'] != 1]

df_team_results.columns = df_team_results.columns.str.lower()
merged_data = pd.merge(df_players, df_team_results, on=['tmid', 'year'], how='left')
merged_data = pd.merge(merged_data, df_final_coaches, on=['tmid', 'year'], how='left')

#print(merged_data.to_string())


Dropping Attribute lgID in [1mPlayers_Teams[0m...


#### Point Bisserial Correlation 
We will use this to check correlation between continuous attributes & target


In [7]:
fs.bisserial_corr(merged_data)

total_assists: 36.39% correlation
total_gs: 32.71% correlation
total_points: 31.77% correlation
coach_po_wr: 31.26% correlation
total_minutes: 31.16% correlation
coach_reg_wr: 30.47% correlation
total_turnovers: 28.44% correlation
player_awards: 27.79% correlation
total_blocks: 26.30% correlation
total_steals: 24.90% correlation
coach_playoffs_count: 24.50% correlation
total_pf: 23.19% correlation
team_playoffs_count: 19.10% correlation
rank: 18.01% correlation
total_drebounds_pct: 12.93% correlation
total_orebounds_pct: 12.93% correlation
coach_awards: 12.93% correlation
total_dq: 12.35% correlation
total_fg_pct: 10.58% correlation
total_gp: 10.20% correlation
total_ft_pct: 4.27% correlation
total_three_pct: 3.71% correlation


### Dividing the dataset in both train & test
We will be using year 10 and test and the remaining ones to train the model

In [8]:
label_encoder = LabelEncoder()
merged_data['tmid'] = label_encoder.fit_transform(merged_data['tmid'])
merged_data['confid'] = label_encoder.fit_transform(merged_data['confid'])

x = merged_data.drop('playoff', axis=1)
y = merged_data['playoff']

x_train = merged_data[merged_data['year'].between(0, TEST_YEAR - 1)].drop('playoff', axis=1)
y_train = merged_data[merged_data['year'].between(0, TEST_YEAR - 1)]['playoff']

x_test = merged_data[merged_data['year'] == TEST_YEAR].drop('playoff', axis=1)
y_test = merged_data[merged_data['year'] == TEST_YEAR]['playoff']

### RFE
We will running RFE on the different models to find out which features produce the best results

In [9]:
min_features = 5

rfe_classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Support Vector Machine': SVC(random_state=42, kernel='linear'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

classifiers_features = {}

total_features = len(x_train.columns)
for model_name, model in rfe_classifiers.items():
    print(f"\033[1mModel: {model_name}\033[0m")
    
    results = []

    for i in range(min_features, total_features):
        rfe = RFE(model, n_features_to_select=i)
        rfe.fit(x_train, y_train)
        
        selected_features = set(x_train.columns[rfe.support_])
        selected_features.add("tmid")
        selected_features.add("year")
        selected_features = list(selected_features)

        model.fit(x_train[selected_features], y_train)

        accuracy = model.score(x_test[selected_features], y_test)

        results.append((selected_features, accuracy))

    # Sort the results based on accuracy in descending order
    results = sorted(results, key=lambda x: x[1], reverse=True)

    classifiers_features[model_name] = results[0][0]
    # Print the results
    for features, accuracy in results[:3]:
        print("Selected Features:", features)
        print("Accuracy:" + str(accuracy) + '\n')

[1mModel: Random Forest[0m


Selected Features: ['total_gs', 'total_three_pct', 'total_fg_pct', 'total_orebounds_pct', 'total_gp', 'total_assists', 'total_minutes', 'year', 'total_steals', 'coach_reg_wr', 'total_ft_pct', 'tmid', 'total_points']
Accuracy:0.8461538461538461

Selected Features: ['total_gs', 'total_three_pct', 'total_turnovers', 'total_fg_pct', 'total_orebounds_pct', 'total_blocks', 'total_assists', 'total_minutes', 'total_gp', 'year', 'total_steals', 'total_drebounds_pct', 'coach_reg_wr', 'total_ft_pct', 'tmid', 'total_points']
Accuracy:0.8461538461538461

Selected Features: ['total_gs', 'total_fg_pct', 'total_orebounds_pct', 'year', 'total_assists', 'tmid', 'total_points']
Accuracy:0.7692307692307693

[1mModel: Logistic Regression[0m


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected Features: ['confid', 'team_playoffs_count', 'total_fg_pct', 'coach_po_wr', 'total_dq', 'total_gp', 'rank', 'player_awards', 'coach_playoffs_count', 'total_assists', 'total_drebounds_pct', 'coach_reg_wr', 'coach_awards', 'total_ft_pct', 'year', 'tmid']
Accuracy:0.6923076923076923

Selected Features: ['total_gs', 'total_gp', 'rank', 'total_drebounds_pct', 'coach_reg_wr', 'tmid', 'team_playoffs_count', 'total_steals', 'coach_po_wr', 'total_dq', 'total_blocks', 'coach_playoffs_count', 'coach_awards', 'total_ft_pct', 'confid', 'total_fg_pct', 'player_awards', 'total_assists', 'year']
Accuracy:0.6923076923076923

Selected Features: ['total_gs', 'total_gp', 'rank', 'total_drebounds_pct', 'coach_reg_wr', 'tmid', 'team_playoffs_count', 'total_minutes', 'total_steals', 'coach_po_wr', 'total_pf', 'total_dq', 'total_blocks', 'coach_playoffs_count', 'coach_awards', 'total_ft_pct', 'total_points', 'confid', 'total_fg_pct', 'player_awards', 'total_assists', 'year']
Accuracy:0.692307692307692

Since RFE doesn't work with KNN, we will be using SelectKBest which produces the same process

In [10]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
results = []

for i in range(min_features, total_features):
    knn = KNeighborsClassifier()

    selector = SelectKBest(score_func=mutual_info_classif, k=i)
    selector.fit(x_train, y_train)


    selected_features = set(x_train.columns[selector.get_support()])
    selected_features.add("tmid")
    selected_features.add("year")
    selected_features = list(selected_features)

    knn.fit(x_train[selected_features], y_train)

    accuracy = knn.score(x_test[selected_features], y_test)

    results.append((selected_features, accuracy))

results = sorted(results, key=lambda x: x[1], reverse=True)

classifiers_features["K-Nearest Neighbors"] = results[0][0]

# Print the best 3 results
for features, accuracy in results[:3]:
    print("Selected Features:", features)
    print("Accuracy:" + str(accuracy) + '\n')

Selected Features: ['confid', 'rank', 'player_awards', 'year', 'total_assists', 'tmid']
Accuracy:0.6923076923076923

Selected Features: ['team_playoffs_count', 'rank', 'player_awards', 'year', 'coach_playoffs_count', 'total_assists', 'coach_po_wr', 'tmid']
Accuracy:0.6923076923076923

Selected Features: ['total_gs', 'total_steals', 'total_blocks', 'player_awards', 'total_minutes', 'year', 'coach_playoffs_count', 'total_assists', 'coach_reg_wr', 'coach_po_wr', 'tmid', 'coach_awards', 'total_points']
Accuracy:0.6923076923076923



#### GridSearch
Now that we know the best features for each model, we will use gridsearch to fine tune its parameters.

In [11]:
fs.grid_search(classifiers_features,x_train,x_test,y_train,y_test)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters for Random Forest: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation score for Random Forest: 0.6550
Test set accuracy for Random Forest: 0.8462



  warn(
