# Project IART - 2023

In [1]:
import pandas
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import warnings

from IPython.display import Image
from subprocess import call

from sklearn.exceptions import ConvergenceWarning

from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

import sklearn.tree as tree  # 1. Decision Tree
from sklearn.neural_network import MLPClassifier  # 2. Neural Network
from sklearn.neighbors import KNeighborsClassifier # 3. KNN (K-Nearest Neighbors)
from sklearn.svm import SVC  # 4. Support Vector Machine (SVM)
from sklearn.ensemble import RandomForestClassifier  # 5. Random Forest
from sklearn.linear_model import LogisticRegression  # 6. Logistic Regression


from sklearn.metrics import confusion_matrix as conf_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# visualization
from sklearn.manifold import TSNE


from sklearn.mixture import GaussianMixture
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# data
import seaborn as sns

modelResults = {}


In [2]:
# Load the datasets
df_teams = pandas.read_csv("dataset/teams.csv")
df_teams_post = pandas.read_csv("dataset/teams_post.csv")
df_series_post = pandas.read_csv("dataset/series_post.csv")
df_players = pandas.read_csv("dataset/players.csv")
df_players_teams = pandas.read_csv("dataset/players_teams.csv")
df_coaches = pandas.read_csv("dataset/coaches.csv")
df_awards_players = pandas.read_csv("dataset/awards_players.csv")

dfs = [df_teams, df_teams_post, df_series_post, df_players, df_players_teams, df_coaches, df_awards_players]
dfs_names = ["teams", "teams_post", "series_post", "players", "players_teams", "coaches", "awards_players"]

In [3]:
for i in range(len(dfs)):
    print(
        f"{dfs_names[i]}:\nRows: {dfs[i].shape[0]} \tColumns: {dfs[i].shape[1]}\n")

teams:
Rows: 142 	Columns: 61

teams_post:
Rows: 80 	Columns: 5

series_post:
Rows: 70 	Columns: 9

players:
Rows: 893 	Columns: 10

players_teams:
Rows: 1876 	Columns: 43

coaches:
Rows: 162 	Columns: 9

awards_players:
Rows: 95 	Columns: 4



In [4]:
df_teams.head()

Unnamed: 0,year,lgID,tmID,franchID,confID,divID,rank,playoff,seeded,firstRound,...,GP,homeW,homeL,awayW,awayL,confW,confL,min,attend,arena
0,9,WNBA,ATL,ATL,EA,,7,N,0,,...,34,1,16,3,14,2,18,6825,141379,Philips Arena
1,10,WNBA,ATL,ATL,EA,,2,Y,0,L,...,34,12,5,6,11,10,12,6950,120737,Philips Arena
2,1,WNBA,CHA,CHA,EA,,8,N,0,,...,32,5,11,3,13,5,16,6475,90963,Charlotte Coliseum
3,2,WNBA,CHA,CHA,EA,,4,Y,0,W,...,32,11,5,7,9,15,6,6500,105525,Charlotte Coliseum
4,3,WNBA,CHA,CHA,EA,,2,Y,0,L,...,32,11,5,7,9,12,9,6450,106670,Charlotte Coliseum


In [5]:
df_teams = df_teams.drop(columns=['lgID', 'franchID', 'divID', 'rank', 'name', "arena", "GP"], axis=1) 
# Replace confID with 0 and 1
# Playoff is target (switch to binary)

In [6]:
# replace df_teams features firstRound, semis, final with a single feature named RoundReached with values 0, 1, 2, 3
# 0 - not reached
# 1 - first round
# 2 - semi finals
# 3 - finals
df_teams['firstRound'] = df_teams['firstRound'].replace('W', 2)
df_teams['firstRound'] = df_teams['firstRound'].replace('L', 1)
df_teams['firstRound'] = df_teams['firstRound'].fillna(0)

df_teams['semis'] = df_teams['semis'].replace('W', 1)
df_teams['semis'] = df_teams['semis'].replace('L', 0)
df_teams['semis'] = df_teams['semis'].fillna(0)


df_teams['finals'] = df_teams['finals'].replace('W', 1)
df_teams['finals'] = df_teams['finals'].replace('L', 0)
df_teams['finals'] = df_teams['finals'].fillna(0)

df_teams['RoundReached'] = df_teams['firstRound'] + df_teams['semis'] + df_teams['finals']

df_teams = df_teams.drop(columns=['firstRound', 'semis', 'finals'], axis=1)

# now we can drop the playoff feature ... however we will need to use it to classify the teams
df_teams = df_teams.drop(columns=['playoff'], axis=1)

df_teams.head(10)

# ATL was founded in the end of 2007
# https://en.wikipedia.org/wiki/2007_WNBA_season
# https://en.wikipedia.org/wiki/2008_WNBA_season
# https://en.wikipedia.org/wiki/2009_WNBA_season
# it seams that by season 9 only 4 teams from each conference would pass ... and not the 8 best teams .... this might be a problem to be aware




Unnamed: 0,year,tmID,confID,seeded,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,...,lost,homeW,homeL,awayW,awayL,confW,confL,min,attend,RoundReached
0,9,ATL,EA,0,895,2258,542,725,202,598,...,30,1,16,3,14,2,18,6825,141379,0.0
1,10,ATL,EA,0,1089,2428,569,755,114,374,...,16,12,5,6,11,10,12,6950,120737,1.0
2,1,CHA,EA,0,812,1903,431,577,131,386,...,24,5,11,3,13,5,16,6475,90963,0.0
3,2,CHA,EA,0,746,1780,410,528,153,428,...,14,11,5,7,9,15,6,6500,105525,3.0
4,3,CHA,EA,0,770,1790,490,663,211,527,...,14,11,5,7,9,12,9,6450,106670,1.0
5,4,CHA,EA,0,787,1881,456,590,187,517,...,16,13,4,5,12,12,12,6850,120061,1.0
6,5,CHA,EA,0,745,1744,436,590,166,459,...,18,10,7,6,11,8,12,6900,116383,0.0
7,6,CHA,EA,0,772,1913,447,624,104,301,...,28,5,12,1,16,4,16,6945,98054,0.0
8,7,CHA,EA,0,864,2178,552,777,176,544,...,23,7,10,4,13,6,14,6825,106942,0.0
9,7,CHI,EA,0,858,2175,449,643,157,536,...,29,3,14,2,15,4,16,6800,57635,0.0


In [7]:
# add a new feature that indicates the pecentage of games won
df_teams['winPercentage'] = (df_teams['homeW'] + df_teams['awayW'])/ (df_teams['homeW'] + df_teams['awayW'] + df_teams['awayL'] + df_teams['homeL'])

# 
# 
# Talvez dei bias ao adicionar esta feature ... porém parece que as vitorias em casa são indicadoras de sucesso
#
# add a new feature that indicates the pecentage of home games won
df_teams['homeWinPercentage'] = df_teams['homeW'] / (df_teams['homeW'] + df_teams['homeL'])
# add a new feature that indicates the pecentage of away games won
df_teams['awayWinPercentage'] = df_teams['awayW'] / (df_teams['awayW'] + df_teams['awayL'])

# drop the W and L features
df_teams = df_teams.drop(columns=['homeW', 'homeL', 'awayW', 'awayL', 'won', 'lost'], axis=1)


# Conf League wins and losses n parece influenciar o acesso a playoffs mas convém ver se tem algo a ver
# remove confW and confL
df_teams = df_teams.drop(columns=['confW', 'confL'], axis=1)


df_teams.head(10)

Unnamed: 0,year,tmID,confID,seeded,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,...,tmTRB,opptmORB,opptmDRB,opptmTRB,min,attend,RoundReached,winPercentage,homeWinPercentage,awayWinPercentage
0,9,ATL,EA,0,895,2258,542,725,202,598,...,0,0,0,0,6825,141379,0.0,0.117647,0.058824,0.176471
1,10,ATL,EA,0,1089,2428,569,755,114,374,...,0,0,0,0,6950,120737,1.0,0.529412,0.705882,0.352941
2,1,CHA,EA,0,812,1903,431,577,131,386,...,0,0,0,0,6475,90963,0.0,0.25,0.3125,0.1875
3,2,CHA,EA,0,746,1780,410,528,153,428,...,0,0,0,0,6500,105525,3.0,0.5625,0.6875,0.4375
4,3,CHA,EA,0,770,1790,490,663,211,527,...,0,0,0,0,6450,106670,1.0,0.5625,0.6875,0.4375
5,4,CHA,EA,0,787,1881,456,590,187,517,...,0,0,0,0,6850,120061,1.0,0.529412,0.764706,0.294118
6,5,CHA,EA,0,745,1744,436,590,166,459,...,0,0,0,0,6900,116383,0.0,0.470588,0.588235,0.352941
7,6,CHA,EA,0,772,1913,447,624,104,301,...,0,0,0,0,6945,98054,0.0,0.176471,0.294118,0.058824
8,7,CHA,EA,0,864,2178,552,777,176,544,...,0,0,0,0,6825,106942,0.0,0.323529,0.411765,0.235294
9,7,CHI,EA,0,858,2175,449,643,157,536,...,0,0,0,0,6800,57635,0.0,0.147059,0.176471,0.117647


In [8]:
#check if any feature has only a unique value and if so drop it
for column in df_teams.columns:
    if len(df_teams[column].unique()) == 1:
        print(f"Column {column} has only one value: {df_teams[column].unique()[0]}")
        df_teams = df_teams.drop(columns=[column], axis=1)

df_teams.head(10)

Column seeded has only one value: 0
Column tmORB has only one value: 0
Column tmDRB has only one value: 0
Column tmTRB has only one value: 0
Column opptmORB has only one value: 0
Column opptmDRB has only one value: 0
Column opptmTRB has only one value: 0


Unnamed: 0,year,tmID,confID,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,o_oreb,...,d_stl,d_to,d_blk,d_pts,min,attend,RoundReached,winPercentage,homeWinPercentage,awayWinPercentage
0,9,ATL,EA,895,2258,542,725,202,598,340,...,310,561,134,2879,6825,141379,0.0,0.117647,0.058824,0.176471
1,10,ATL,EA,1089,2428,569,755,114,374,404,...,347,601,133,2797,6950,120737,1.0,0.529412,0.705882,0.352941
2,1,CHA,EA,812,1903,431,577,131,386,305,...,259,426,123,2429,6475,90963,0.0,0.25,0.3125,0.1875
3,2,CHA,EA,746,1780,410,528,153,428,309,...,257,447,124,2009,6500,105525,3.0,0.5625,0.6875,0.4375
4,3,CHA,EA,770,1790,490,663,211,527,302,...,208,424,103,2133,6450,106670,1.0,0.5625,0.6875,0.4375
5,4,CHA,EA,787,1881,456,590,187,517,342,...,264,469,104,2195,6850,120061,1.0,0.529412,0.764706,0.294118
6,5,CHA,EA,745,1744,436,590,166,459,256,...,243,437,114,2168,6900,116383,0.0,0.470588,0.588235,0.352941
7,6,CHA,EA,772,1913,447,624,104,301,316,...,269,534,146,2335,6945,98054,0.0,0.176471,0.294118,0.058824
8,7,CHA,EA,864,2178,552,777,176,544,347,...,286,619,125,2571,6825,106942,0.0,0.323529,0.411765,0.235294
9,7,CHI,EA,858,2175,449,643,157,536,357,...,262,535,120,2687,6800,57635,0.0,0.147059,0.176471,0.117647


In [9]:
# remaning features
print(f"Number of features: {len(df_teams.columns)}")
print(f"Features: {df_teams.columns}")

Number of features: 39
Features: Index(['year', 'tmID', 'confID', 'o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_3pm',
       'o_3pa', 'o_oreb', 'o_dreb', 'o_reb', 'o_asts', 'o_pf', 'o_stl', 'o_to',
       'o_blk', 'o_pts', 'd_fgm', 'd_fga', 'd_ftm', 'd_fta', 'd_3pm', 'd_3pa',
       'd_oreb', 'd_dreb', 'd_reb', 'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk',
       'd_pts', 'min', 'attend', 'RoundReached', 'winPercentage',
       'homeWinPercentage', 'awayWinPercentage'],
      dtype='object')


# Talvez n seja uma boa ideia ...

In [10]:

## # Create a decision tree classifier
## clf = tree.DecisionTreeClassifier()
## 
## df = df_teams.copy()
## df = df.drop(['tmID', 'confID'], axis=1)
## 
## df['playoff'] = df['playoff'].map(
##     {'N': 0, 'Y': 1})  # Swapping B and M for 0 and 1
## 
## X = df.drop(['playoff'], axis=1)
## y = df['playoff']
## 
## 
## # Create the RFE object and rank each pixel
## rfe = RFE(estimator=clf, n_features_to_select=10, step=1)
## rfe.fit(X, y)
## 
## # Get the feature importances
## feature_importances = rfe.ranking_
## 
## best_features = []
## 
## # Print the name of the most important features
## for index, feature in enumerate(feature_importances):
##     if feature == 1:
##         print(X.columns[index])
##         best_features.append(X.columns[index])
## 

### dps vemos estaticas de rebounds, steals, blocks, turnovers, etc para prever as estatisticas da equipa no ano seguinte

In [11]:
# save year 2010
df_teams_2010 = df_teams[df_teams.year == 10]
# 
# 
# Vai depender muito do que eles nos dão para testarmos.... provavelmente só nos vão dar a composição das equipas e não estatisiticas de jogos
# 
# 

#drop where year equals 2010
df_teams = df_teams[df_teams.year != 10]
tmids = df_teams_2010['tmID']
confIds = df_teams_2010['confID']
df_teams = df_teams.drop(columns=['tmID', 'year', 'confID'], axis=1)

df_teams_2010 = df_teams_2010.drop(columns=['tmID', 'year', 'confID'], axis=1)
# remove year and tmID

X, Y = df_teams.drop(columns=['winPercentage'], axis=1), df_teams['winPercentage']

best_rf = RandomForestRegressor(n_estimators = 600,min_samples_split = 2,min_samples_leaf = 4,max_features = 'auto',max_depth = 40,bootstrap = True)
base_rf = RandomForestRegressor()
best_model = best_rf.fit(X, Y)
base_model = base_rf.fit(X, Y)


X_TEST = df_teams_2010.drop(columns=['winPercentage'], axis=1)
Y_TEST = df_teams_2010['winPercentage']


def mae(test,pred):
    err = np.mean(abs(test-pred))
    return err
y_pred_base = base_model.predict(X_TEST)
y_pred_best = best_model.predict(X_TEST)
print('Mean absolute error (base rf model): {} '.format(mae(Y_TEST,y_pred_base)))
print('Mean absolute error (best rf model): {} '.format(mae(Y_TEST,y_pred_best)))
print(y_pred_base)
print(y_pred_best)

#add tmid again to df_teams_2010
df_teams_2010['tmID'] = tmids




# for every objet show the predicted winPercentage and the actual winPercentage
i=0
for index, row in df_teams_2010.iterrows():
    print(f"Team: {row['tmID']}\tPredicted winPercentage: {y_pred_best[i]}\tActual winPercentage: {row['winPercentage']}")
    i+=1


  warn(


Mean absolute error (base rf model): 0.022159219457013588 
Mean absolute error (best rf model): 0.022104159392772858 
[0.53143382 0.45040441 0.45772059 0.54773897 0.62512868 0.53766544
 0.46707721 0.38889706 0.67540441 0.33400735 0.50705882 0.56534926
 0.50444853]
[0.52975872 0.45133315 0.45795562 0.54162682 0.62384178 0.53283585
 0.46669717 0.38778628 0.66489683 0.33335143 0.50896862 0.56915411
 0.50844881]
Team: ATL	Predicted winPercentage: 0.5297587168676928	Actual winPercentage: 0.5294117647058824
Team: CHI	Predicted winPercentage: 0.4513331487849965	Actual winPercentage: 0.47058823529411764
Team: CON	Predicted winPercentage: 0.4579556152984327	Actual winPercentage: 0.47058823529411764
Team: DET	Predicted winPercentage: 0.5416268242752623	Actual winPercentage: 0.5294117647058824
Team: IND	Predicted winPercentage: 0.6238417786700858	Actual winPercentage: 0.6470588235294118
Team: LAS	Predicted winPercentage: 0.5328358486757058	Actual winPercentage: 0.5294117647058824
Team: MIN	Predic

In [12]:
#
#
# NOTE: This is tested against year 2009. It is able to predict the exact teams that made it to the playoffs because the dataset contains the playoff statistics features for that year.
#      However, the teacher won't probably give us the playoff statistics for the year 2010, so we will have to predict the teams that will make it to the playoffs based on other features.
#
#
#
# Make a prediction for the 2010 season
# say which teams will make it to the playoffs ... only four teams from each conference make it to the playoffs

df_teams_2010['confID'] = confIds

# add predWinPercentage to df_teams_2010
df_teams_2010['predWinPercentage'] = y_pred_best

ea_conf = df_teams_2010[df_teams_2010.confID == 'EA']
we_conf = df_teams_2010[df_teams_2010.confID == 'WE']


# get the top 4 teams from each conference
ea_conf = ea_conf.sort_values(by=['predWinPercentage'], ascending=False)
we_conf = we_conf.sort_values(by=['predWinPercentage'], ascending=False)


# print the teams that will make it to the playoffs

print("Eastern Conference")
for index, row in ea_conf.head(4).iterrows():
    print(f"{row['tmID']} with probality of {row['predWinPercentage']}")
print("Western Conference")
for index, row in we_conf.head(4).iterrows():
    print(f"{row['tmID']} with probality of {row['predWinPercentage']}")



Eastern Conference
IND with probality of 0.6238417786700858
DET with probality of 0.5416268242752623
ATL with probality of 0.5297587168676928
WAS with probality of 0.5084488110524418
Western Conference
PHO with probality of 0.6648968271947157
SEA with probality of 0.5691541132313992
LAS with probality of 0.5328358486757058
SAS with probality of 0.5089686221497287
