In [1]:
#USING XGBOSST MODEL ..........

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import xgboost as xgb

# DATASET
data = pd.read_csv(r"C:\Users\nh013\Desktop\NBA DATASET\GAMES.csv")

#DROP UNNEED COLUMN
columns_to_drop = ['SEASON_ID', 'TEAM_ID', 'GAME_ID', 'GAME_DATE', 'MATCHUP']
data = data.drop(columns_to_drop, axis=1)

#FEATURE
categorical_features = ['TEAM_ABBREVIATION', 'TEAM_NAME', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

#LABEL ENCODER
label_encoder = LabelEncoder()

# CONVERT CATEGORICAL FEATURE TO NUMERIC
for feature in categorical_features:
    data[feature] = label_encoder.fit_transform(data[feature])

    
# CONVERT NUMERICALTO FLOAT
data = data.astype(float)

# FILLED MISSING VALUE WITH CONVERTED NUMERIC FORM
categorical_features_to_fill = ['TEAM_ABBREVIATION', 'TEAM_NAME', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

# SIMPLEIMPUTER IN CATEGORICAL FEATURE
categorical_imputer = SimpleImputer(strategy='most_frequent')

data[categorical_features_to_fill] = categorical_imputer.fit_transform(data[categorical_features_to_fill])

# IDENTIFY MISSING VALUES
print(data.isnull().sum())

# DROP ROWS WITH MISSING VALUES
data.dropna(subset=categorical_features, inplace=True)

# FILL MISSING VALUES WITH FORWARD FILL
data.fillna(method='ffill', inplace=True)

# FILL MISSING VALUES WITH BACKWARD FILL
data.fillna(method='bfill', inplace=True)

# CONVERT NUMERICAL COLUMNS TO NUMERIC
data[categorical_features] = data[categorical_features].apply(pd.to_numeric, errors='coerce')

# FILL MISSING VALUES WITH MEAN
mean = data[categorical_features].mean()
data[categorical_features].fillna(mean, inplace=True)

# NORMALIZE AND SCALE 
scaler = MinMaxScaler()
data[categorical_features] = scaler.fit_transform(data[categorical_features])

# FEATURE ENGINEERING: REBUNDING PERFORMANCE
data['OREB/G'] = data['OREB'] / data['MIN']
data['DREB/G'] = data['DREB'] / data['MIN']

# FEATURE ENGINEERING: ASSIST TURN OVER RASIO
data['AST_TOV_RATIO'] = data['AST'] / data['TOV']

# FEATURE ENGINEERING: FREE THROW EFFICENCY
data['FT_PCT'] = data['FTM'] / data['FTA']

# FEATURE ENGINEERING: POINT DIFFERENTIAL
data['POINT_DIFF'] = data['PTS'] - data['PLUS_MINUS']

# FEATURE ENGINEERING:STEALS AND BLOCK
data['STL/G'] = data['STL'] / data['MIN']
data['BLK/G'] = data['BLK'] / data['MIN']

# FEATURE ENGINEERING: FOUL RATE
data['FOUL_RATE'] = data['PF'] / data['MIN']

data.dropna(inplace=True)  
data.replace([np.inf, -np.inf], np.nan, inplace=True)  
data.dropna(inplace=True)  

# FIX TARGET VARIABLE
X = data.drop('WL', axis=1)
y = data['WL']

# TRAIN AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CONVERT TARGET VARIABLE TO INTEGER
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# MODEL
model = xgb.XGBClassifier()

# TRAIN MODEL
model.fit(X_train, y_train)


#CATEGORICAL  LABELS INTO NUMERIC LEBELS

y_test_encoded = label_encoder.transform(y_test)

# MODEL ACURACY
accuracy = model.score(X_test, y_test_encoded)
print('Test Accuracy:', accuracy)

# PREDICTION ON TEST SET
predictions_test = model.predict(X_test)

# CONVERT THE ENCODED PREDICTED LABELS BACK TO THEIR ORGINAL STRING FORM
predicted_labels_test = label_encoder.inverse_transform(predictions_test)

# CONVERT PREDICTED VALUE TO PERCENT OVER 100
predicted_labels_test = (predicted_labels_test * 100).round(2)

# CONVERTS THE TEAM NAME TO THEIR ORGINAL STRING FORM
team_names_test = label_encoder.inverse_transform(X_test['TEAM_NAME'].astype(int))

# CREATE SEPERATE LIST FOR TEAM NAMES aAND WIN/LOSS PREDICTION
team_list = []
win_percentage_list = []
loss_percentage_list = []

for team, prediction in zip(team_names_test, predicted_labels_test):
    win_percentage = prediction
    loss_percentage = 100 - prediction
    team_list.append(team)
    win_percentage_list.append(win_percentage)
    loss_percentage_list.append(loss_percentage)

# OUTPUT
for team, win_percentage, loss_percentage in zip(team_list, win_percentage_list, loss_percentage_list):
    print("Team Name:", team)
    print("Win Percentage:", win_percentage, "%")
    print("Loss Percentage:", loss_percentage, "%")
    print()


TEAM_ABBREVIATION    0
TEAM_NAME            0
WL                   0
MIN                  0
PTS                  0
FGM                  0
FGA                  0
FG_PCT               0
FG3M                 0
FG3A                 0
FG3_PCT              0
FTM                  0
FTA                  0
FT_PCT               0
OREB                 0
DREB                 0
REB                  0
AST                  0
STL                  0
BLK                  0
TOV                  0
PF                   0
PLUS_MINUS           0
dtype: int64
Test Accuracy: 0.9913950933723911
Team Name: 0.0
Win Percentage: 0.0 %
Loss Percentage: 100.0 %

Team Name: 0.0
Win Percentage: 50.0 %
Loss Percentage: 50.0 %

Team Name: 0.0
Win Percentage: 0.0 %
Loss Percentage: 100.0 %

Team Name: 0.0
Win Percentage: 0.0 %
Loss Percentage: 100.0 %

Team Name: 0.0
Win Percentage: 50.0 %
Loss Percentage: 50.0 %

Team Name: 0.0
Win Percentage: 0.0 %
Loss Percentage: 100.0 %

Team Name: 0.0
Win Percentage: 0.0 %
Loss Perc

In [2]:
#USING XGBOOST MODEL TO PREDICT  W/L PERCENTS..


import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import xgboost as xgb

# DATASET
data = pd.read_csv(r"C:\Users\nh013\Desktop\NBA DATASET\GAMES.csv")

# DROP UNNEEDED COLUMNS
columns_to_drop = ['SEASON_ID', 'TEAM_ID', 'GAME_ID', 'GAME_DATE', 'MATCHUP']
data = data.drop(columns_to_drop, axis=1)

# SEPARATE THE TEAM NAMES
team_names = data['TEAM_NAME']

# FEATURE
categorical_features = ['TEAM_ABBREVIATION', 'TEAM_NAME', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

# LABEL ENCODER
label_encoder = LabelEncoder()

# CONVERT CATEGORIAL COLUMN TO NUMERICAL
for feature in categorical_features:
    data[feature] = label_encoder.fit_transform(data[feature])

# CONVERT NUMERICAL   TO FLOAT
data = data.astype(float)

# FILLED MISSING VALUE WITH CONVERTED NUMERIC FORM
categorical_features_to_fill = ['TEAM_ABBREVIATION', 'TEAM_NAME', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

# SIMPLEIMPUTER IN CATEGORICAL FEATURE
categorical_imputer = SimpleImputer(strategy='most_frequent')

data[categorical_features_to_fill] = categorical_imputer.fit_transform(data[categorical_features_to_fill])

# IDENTIFY MISSING VALUES
print(data.isnull().sum())

# DROP ROWS WITH MISSING VALUES
data.dropna(subset=categorical_features, inplace=True)

# FILL MISSING VALUES WITH FORWARD FILL
data.fillna(method='ffill', inplace=True)

# FILL MISSING VALUES WITH BACKWARD FILL
data.fillna(method='bfill', inplace=True)

# CONVERT NUMERICAL COLUMNS TO NUMERIC
data[categorical_features] = data[categorical_features].apply(pd.to_numeric, errors='coerce')

# FILL MISSING VALUES WITH MEAN
mean = data[categorical_features].mean()
data[categorical_features].fillna(mean, inplace=True)

# NORMALIZE AND SCALE 
scaler = MinMaxScaler()
data[categorical_features] = scaler.fit_transform(data[categorical_features])

# FEATURE ENGINEERING: REBUNDING PERFORMANCE
data['OREB/G'] = data['OREB'] / data['MIN']
data['DREB/G'] = data['DREB'] / data['MIN']

# FEATURE ENGINEERING: ASSIST TURN OVER RASIO
data['AST_TOV_RATIO'] = data['AST'] / data['TOV']

# FEATURE ENGINEERING: FREE THROW EFFICENCY
data['FT_PCT'] = data['FTM'] / data['FTA']

# FEATURE ENGINEERING: POINT DIFFERENTIAL
data['POINT_DIFF'] = data['PTS'] - data['PLUS_MINUS']

# FEATURE ENGINEERING:STEALS AND BLOCK
data['STL/G'] = data['STL'] / data['MIN']
data['BLK/G'] = data['BLK'] / data['MIN']

# FEATURE ENGINEERING: FOUL RATE
data['FOUL_RATE'] = data['PF'] / data['MIN']

#DROP ROWS WITH NAN VALUES
data.dropna(inplace=True)  
data.replace([np.inf, -np.inf], np.nan, inplace=True)  # CONVERT INF TO NAN
data.dropna(inplace=True)  

# FIX TARGET VARIABLE
X = data.drop('WL', axis=1)
y = data['WL']

# SPLIT DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#CONVERT TARGET VARIABLE TO NTEGER
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# BXGBOOST CLASSIFIER
model = xgb.XGBClassifier()

#FIT MODEL
model.fit(X_train, y_train)

# CONVERT TARGET VARIABLE FOR TESTING SET
y_test_encoded = label_encoder.transform(y_test)

# EVALUATE MODEL
accuracy = model.score(X_test, y_test_encoded)
print('Test Accuracy:', accuracy)

# PREDICTION
predictions_test = model.predict(X_test)

# CONVERT THE ENCODED PREDICTED LABELS BACK TO THEIR ORGINAL STRING FORM
predicted_labels_test = label_encoder.inverse_transform(predictions_test)

# CONVERT PREDICTED VALUE TO PERCENT OVER 100
predicted_labels_test = (predicted_labels_test * 100).round(2)

# CREATE SEPERATE LIST FOR TEAM NAMES aAND WIN/LOSS PREDICTION
team_list = []
win_percentage_list = []
loss_percentage_list = []

for team, prediction in zip(team_names[X_test.index], predicted_labels_test):
    win_percentage = prediction
    loss_percentage = 100 - prediction
    team_list.append(team)
    win_percentage_list.append(win_percentage)
    loss_percentage_list.append(loss_percentage)

# OUTPUT
for team, win_percentage, loss_percentage in zip(team_list, win_percentage_list, loss_percentage_list):
    print("Team Name:", team)
    print("Win Percentage:", win_percentage, "%")
    print("Loss Percentage:", loss_percentage, "%")
    print()


TEAM_ABBREVIATION    0
TEAM_NAME            0
WL                   0
MIN                  0
PTS                  0
FGM                  0
FGA                  0
FG_PCT               0
FG3M                 0
FG3A                 0
FG3_PCT              0
FTM                  0
FTA                  0
FT_PCT               0
OREB                 0
DREB                 0
REB                  0
AST                  0
STL                  0
BLK                  0
TOV                  0
PF                   0
PLUS_MINUS           0
dtype: int64
Test Accuracy: 0.9913950933723911
Team Name: Portland Trail Blazers
Win Percentage: 0.0 %
Loss Percentage: 100.0 %

Team Name: Iowa Wolves
Win Percentage: 50.0 %
Loss Percentage: 50.0 %

Team Name: Boys India
Win Percentage: 0.0 %
Loss Percentage: 100.0 %

Team Name:  T-Wolves Gaming
Win Percentage: 0.0 %
Loss Percentage: 100.0 %

Team Name: Chicago Bulls
Win Percentage: 50.0 %
Loss Percentage: 50.0 %

Team Name:  Jazz Gaming
Win Percentage: 0.0 %
Loss P