In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [1]:
games_details = pd.read_csv('games_details.csv')
games_general = pd.read_csv('games_general.csv')

games_details.head(), games_general.head()


(  GAME_DATE_EST   GAME_ID GAME_STATUS_TEXT  HOME_TEAM_ID  VISITOR_TEAM_ID  \
 0    22/12/2022  22200477            Final    1610612740       1610612759   
 1    22/12/2022  22200478            Final    1610612762       1610612764   
 2    21/12/2022  22200466            Final    1610612739       1610612749   
 3    21/12/2022  22200467            Final    1610612755       1610612765   
 4    21/12/2022  22200468            Final    1610612737       1610612741   
 
    SEASON  TEAM_ID_home  PTS_home FG_PCT_home FT_PCT_home  ... AST_home  \
 0    2022    1610612740       126       0,484       0,926  ...       25   
 1    2022    1610612762       120       0,488       0,952  ...       16   
 2    2022    1610612739       114       0,482       0,786  ...       22   
 3    2022    1610612755       113       0,441       0,909  ...       27   
 4    2022    1610612737       108       0,429           1  ...       22   
 
    REB_home  TEAM_ID_away  PTS_away  FG_PCT_away FT_PCT_away FG3_PCT_aw

In [4]:
# Convert the columns to float
games_details["FG_PCT_home"] = games_details["FG_PCT_home"].str.replace(",", ".").astype(float)
games_details["FT_PCT_home"] = games_details["FT_PCT_home"].str.replace(",", ".").astype(float)
games_details["FG3_PCT_home"] = games_details["FG3_PCT_home"].str.replace(",", ".").astype(float)
games_details["FG_PCT_away"] = games_details["FG_PCT_away"].str.replace(",", ".").astype(float)
games_details["FT_PCT_away"] = games_details["FT_PCT_away"].str.replace(",", ".").astype(float)
games_details["FG3_PCT_away"] = games_details["FG3_PCT_away"].str.replace(",", ".").astype(float)

# Filter the data for seasons later than 2017
games_details_filtered = games_details[games_details["SEASON"] > 2017]
games_general_filtered = games_general[games_general["SEASON_ID"] > 2017]

# Merge the tables on TEAM_ID
merged_table = games_details_filtered.merge(games_general_filtered, left_on="TEAM_ID_home", right_on="TEAM_ID", how="left")


In [5]:
merged_table.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,CONFERENCE,TEAM,G,W,L,W_PCT,Home Win,Home Lose,Away Win,Away Lose
0,22/12/2022,22200477,Final,1610612740,1610612759,2022,1610612740,126,0.484,0.926,...,West,New Orleans,73,31,42,425,17,20,14,22
1,22/12/2022,22200477,Final,1610612740,1610612759,2022,1610612740,126,0.484,0.926,...,West,New Orleans,72,30,42,417,16,20,14,22
2,22/12/2022,22200477,Final,1610612740,1610612759,2022,1610612740,126,0.484,0.926,...,West,New Orleans,72,30,42,417,16,20,14,22
3,22/12/2022,22200477,Final,1610612740,1610612759,2022,1610612740,126,0.484,0.926,...,West,New Orleans,72,30,42,417,16,20,14,22
4,22/12/2022,22200477,Final,1610612740,1610612759,2022,1610612740,126,0.484,0.926,...,West,New Orleans,71,30,41,423,16,20,14,21


In [None]:
# Label encoder and select features

label_encoder = LabelEncoder()
merged_table['TEAM_home'] = label_encoder.fit_transform(merged_table['TEAM'])
merged_table['TEAM_away'] = label_encoder.transform(merged_table['TEAM'])
merged_table['HOME_TEAM_WINS'] = merged_table['HOME_TEAM_WINS'].astype(int)
features = ['PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home',
            'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away',
            'TEAM_home', 'TEAM_away']
label = 'HOME_TEAM_WINS'

In [None]:
X = merged_table[features]
y = merged_table[label]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

In [17]:
# Function to predict win probability
def predict_win_probability(team1, team2, side):
    team1_encoded = label_encoder.transform([team1])[0]
    team2_encoded = label_encoder.transform([team2])[0]
    
    if side.lower() == 'home':
        data = {
            'TEAM_home': team1_encoded,
            'TEAM_away': team2_encoded,
        }
    else:
        data = {
            'TEAM_home': team2_encoded,
            'TEAM_away': team1_encoded,
        }
    
    df = pd.DataFrame(data, index=[0])
    # TODO: Add other necessary features (e.g., average stats)
    # For simplicity, use mean values from training data for missing stats
    for feature in features:
        if feature not in data:
            df[feature] = X_train[feature].mean()
    
    # Use the best model for prediction (current: best model was chosen) TODO: choose 3 models => tuning to get best results
    best_model = clf.models_['RandomForestClassifier']
    win_probability = best_model.predict_proba(df)[:, 1][0]
    
    return win_probability

# Example usage
team1 = "Utah"
team2 = "Dallas"
side = "home"
win_prob = predict_win_probability(team1, team2, side)
print(f"The win probability of {team1} against {team2} when playing at {side} is {win_prob:.2f}")


 58%|█████▊    | 18/31 [3:15:21<1:07:53, 313.32s/it] 