# NBA Production Predictor - AI Model
This program is designed to take in 5, 7, or 9 players on each team and compare them based on historical stats and our analysis on player production. Essentially the model will determine which team is stronger based past production. Our hopes is that the model can take into account how each player performed against certaint teams and other players in their perspective position, which may be also be a future goal of our model. Thank you for viewing and I hope you enjoy using our NBA Game Predictor Model. 

https://www.kaggle.com/datasets/gonzalogigena/nba-all-time-stats

## Import project dependencies

In [1]:
# Import pandas to read and manipulate data
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from utils import *
import os
from dotenv import load_dotenv

In [2]:
# Using pandas read in each csv project file
regular_season_data = pd.read_csv('data/Regular_Season.csv')
playoff_data = pd.read_csv('data/Playoffs.csv')

# Read in the player positions dataframe

In [3]:
player_position = pd.read_csv('data/advanced_stats.csv')
player_position.head()

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Player Reference,Season,League
0,Precious Achiuwa,PF-C,24.0,TOT,53,1191.0,14.3,0.536,0.223,0.231,...,1.5,2.3,0.094,-1.7,-0.1,-1.8,0.1,achiupr01,2023-2024,NBA
1,Precious Achiuwa,C,24.0,TOR,25,437.0,15.0,0.512,0.276,0.247,...,0.5,0.5,0.056,-1.4,-0.1,-1.5,0.1,achiupr01,2023-2024,NBA
2,Precious Achiuwa,PF,24.0,NYK,28,754.0,14.0,0.558,0.175,0.217,...,1.0,1.8,0.117,-1.8,-0.2,-2.0,0.0,achiupr01,2023-2024,NBA
3,Bam Adebayo,C,26.0,MIA,49,1697.0,19.6,0.571,0.016,0.415,...,2.9,4.7,0.132,0.4,1.6,2.0,1.7,adebaba01,2023-2024,NBA
4,Ochai Agbaji,SG,23.0,TOT,58,1105.0,8.1,0.536,0.565,0.095,...,0.5,0.6,0.024,-2.7,-0.4,-3.2,-0.3,agbajoc01,2023-2024,NBA


In [4]:
player_position = player_position[['Player', 'Pos']]
player_position.head()

Unnamed: 0,Player,Pos
0,Precious Achiuwa,PF-C
1,Precious Achiuwa,C
2,Precious Achiuwa,PF
3,Bam Adebayo,C
4,Ochai Agbaji,SG


# Function to reduce the amount of positions

#### Drop Forward and Guards from 'Position' column

In [5]:
# Split the column after the - to get the primary position of each player
player_position['Pos'] = player_position['Pos'].str.split('-').str[0]
# Remove the forward and guard position from the Pos row
player_position = player_position[~player_position['Pos'].isin(['F', 'G'])]
# Rename the player column to all caps for merge
player_position = player_position.rename(columns={'Player' : 'PLAYER'})

player_position.head()  

Unnamed: 0,PLAYER,Pos
0,Precious Achiuwa,PF
1,Precious Achiuwa,C
2,Precious Achiuwa,PF
3,Bam Adebayo,C
4,Ochai Agbaji,SG


### 1. Combine the two dataframes 'regular_season_data' and 'playoff_data'

In [6]:
"""
Function Name: combine_dataframes

Parameters: Takes in a [list of dataframes]

Output: A combined dataframe

Description: This function takes a list of DataFrames, reformats each, and 
combines them into one DataFrame along axis 0.

"""
# Put dataframes into a list
list_of_dataframes = [regular_season_data, playoff_data]

# Call 'combine_dataframes' function to combine the list of dataframes
combined_stats = combine_dataframes(list_of_dataframes)
combined_stats.head()

Unnamed: 0,year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,AST_TOV,STL_TOV
0,2012,Regular Season,201142,1,Kevin Durant,1610612760,OKC,81,3119,731,...,594,640,374,116,105,280,143,2280,1.34,0.41
1,2012,Regular Season,977,2,Kobe Bryant,1610612747,LAL,78,3013,738,...,367,433,469,106,25,287,173,2133,1.63,0.37
2,2012,Regular Season,2544,3,LeBron James,1610612748,MIA,76,2877,765,...,513,610,551,129,67,226,110,2036,2.44,0.57
3,2012,Regular Season,201935,4,James Harden,1610612745,HOU,78,2985,585,...,317,379,455,142,38,295,178,2023,1.54,0.48
4,2012,Regular Season,2546,5,Carmelo Anthony,1610612752,NYK,67,2482,669,...,326,460,171,52,32,175,205,1920,0.98,0.3


# Merge the combined_stats dataframe with the player_position dataframe

In [7]:
merged_df = pd.merge(player_position, combined_stats, on='PLAYER')
merged_df

Unnamed: 0,PLAYER,Pos,year,Season_type,PLAYER_ID,RANK,TEAM_ID,TEAM,GP,MIN,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,AST_TOV,STL_TOV
0,Precious Achiuwa,PF,2020,Regular Season,1630173,292,1610612748,MIA,61,737,...,135,208,29,20,28,43,91,304,0.67,0.47
1,Precious Achiuwa,PF,2021,Regular Season,1630173,161,1610612761,TOR,73,1725,...,327,473,82,37,41,84,151,664,0.98,0.44
2,Precious Achiuwa,PF,2022,Regular Season,1630173,226,1610612761,TOR,55,1141,...,228,328,50,31,30,59,102,508,0.85,0.53
3,Precious Achiuwa,PF,2023,Regular Season,1630173,193,1610612752,NYK,74,1624,...,296,487,97,46,68,83,143,565,1.17,0.55
4,Precious Achiuwa,PF,2020,Playoffs,1630173,176,1610612748,MIA,3,12,...,6,6,0,0,2,4,1,7,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84933,George King,PG,2021,Regular Season,1628994,580,1610612742,DAL,4,19,...,5,5,0,0,0,1,4,1,0.00,0.00
84934,George King,PG,2018,Regular Season,1628994,521,1610612756,PHX,1,6,...,1,1,0,0,0,0,0,0,0.00,0.00
84935,George King,PG,2021,Regular Season,1628994,580,1610612742,DAL,4,19,...,5,5,0,0,0,1,4,1,0.00,0.00
84936,George King,PG,2018,Regular Season,1628994,521,1610612756,PHX,1,6,...,1,1,0,0,0,0,0,0,0.00,0.00


In [8]:
len(merged_df['PLAYER'].unique())

1430

### 2. Encode categorical columns

In [9]:
"""
Function Name: encode_categorical_columns

Parameters: Takes in a dataframe

Input: A dataframe
Output: Encodes the following columns using LabelEncoder() and OneHotEncoder():
    LabelEncoder():
        # Encode the `Season_type` columns, 0 for Regular Season games and 1 for Playoff games
        # Initialize LabelEncoder for players and teams column
        # Use LabelEncoder to encode the `PLAYER`, `TEAM`, `POSITION` columns
        # Save mappings to decode later
        # Drop original player, team, and position columns
        # Return the updated dataframe, player mappings, and  team mappings

Description: Encodes categorical columns, like Season_type, PLAYER, position, and TEAM, using 
LabelEncoder. Also, it returns mappings for decoding later.

"""

# Call the 'convert_to_single_game' function to convert dataframe
encoded_df, player_mapping, team_mapping, position_mapping = encode_categorical_columns(merged_df)
encoded_df.head()

Unnamed: 0,year,RANK,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,...,BLK,TOV,PF,PTS,AST_TOV,STL_TOV,season_type_encoded,player_encoded,team_encoded,position_encoded
0,2020,292,61,737,124,228,0.544,0,1,0.0,...,28,43,91,304,0.67,0.47,0,1120,15,1
1,2021,161,73,1725,265,603,0.439,56,156,0.359,...,41,84,151,664,0.98,0.44,0,1120,28,1
2,2022,226,55,1141,196,404,0.485,29,108,0.269,...,30,59,102,508,0.85,0.53,0,1120,28,1
3,2023,193,74,1624,235,469,0.501,26,97,0.268,...,68,83,143,565,1.17,0.55,0,1120,20,1
4,2020,176,3,12,3,4,0.75,0,0,0.0,...,2,4,1,7,0.0,0.0,1,1120,15,1


In [10]:
encoded_df.columns

Index(['year', 'RANK', 'GP', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'AST_TOV', 'STL_TOV', 'season_type_encoded',
       'player_encoded', 'team_encoded', 'position_encoded'],
      dtype='object')

#### 3. Rename Columns

In [11]:
# Rename columns to clean dataframe
renamed_df = encoded_df.rename(columns={
    'RANK': 'rank',
    'GP' : 'games_played',
    'MIN' : 'minutes',
    'FGM' : 'two_made',
    'FGA' : 'two_attempted',
    'FG_PCT' : 'two_percentage',
    'FG3M' : 'three_made',
    'FG3A' : 'three_attempted',
    'FG3_PCT' : 'three_percentage',
    'FTM' : 'freeThrow_made',
    'FTA' : 'freeThrow_attempted',
    'FT_PCT' : 'freeThrow_percentage',
    'OREB' : 'offensive_reb',
    'DREB' : 'defensive_reb',
    'REB' : 'total_rebounds',
    'AST' : 'assists',
    'STL' : 'steals',
    'BLK' : 'blocks',
    'TOV' : 'turnover',
    'PF' : 'personal_fouls',
    'PTS' : 'points',
    'AST_TOV' : 'assist_turnover',
    'STL_TOV' : 'steals_turnover',
})
renamed_df.head()

Unnamed: 0,year,rank,games_played,minutes,two_made,two_attempted,two_percentage,three_made,three_attempted,three_percentage,...,blocks,turnover,personal_fouls,points,assist_turnover,steals_turnover,season_type_encoded,player_encoded,team_encoded,position_encoded
0,2020,292,61,737,124,228,0.544,0,1,0.0,...,28,43,91,304,0.67,0.47,0,1120,15,1
1,2021,161,73,1725,265,603,0.439,56,156,0.359,...,41,84,151,664,0.98,0.44,0,1120,28,1
2,2022,226,55,1141,196,404,0.485,29,108,0.269,...,30,59,102,508,0.85,0.53,0,1120,28,1
3,2023,193,74,1624,235,469,0.501,26,97,0.268,...,68,83,143,565,1.17,0.55,0,1120,20,1
4,2020,176,3,12,3,4,0.75,0,0,0.0,...,2,4,1,7,0.0,0.0,1,1120,15,1


In [12]:
renamed_df.columns

Index(['year', 'rank', 'games_played', 'minutes', 'two_made', 'two_attempted',
       'two_percentage', 'three_made', 'three_attempted', 'three_percentage',
       'freeThrow_made', 'freeThrow_attempted', 'freeThrow_percentage',
       'offensive_reb', 'defensive_reb', 'total_rebounds', 'assists', 'steals',
       'blocks', 'turnover', 'personal_fouls', 'points', 'assist_turnover',
       'steals_turnover', 'season_type_encoded', 'player_encoded',
       'team_encoded', 'position_encoded'],
      dtype='object')

#### 4. Change Dataframe To Per Game Data

In [13]:
# Selet the statistical game data columns
season_game_data = renamed_df[['games_played', 'minutes', 'two_made', 'two_attempted', 'three_made', 'three_attempted', 'freeThrow_made', 'freeThrow_attempted', 'offensive_reb', 'defensive_reb', 'total_rebounds', 'assists', 'steals','blocks', 'turnover', 'personal_fouls', 'points']]
season_game_data.head()

Unnamed: 0,games_played,minutes,two_made,two_attempted,three_made,three_attempted,freeThrow_made,freeThrow_attempted,offensive_reb,defensive_reb,total_rebounds,assists,steals,blocks,turnover,personal_fouls,points
0,61,737,124,228,0,1,56,110,73,135,208,29,20,28,43,91,304
1,73,1725,265,603,56,156,78,131,146,327,473,82,37,41,84,151,664
2,55,1141,196,404,29,108,87,124,100,228,328,50,31,30,59,102,508
3,74,1624,235,469,26,97,69,112,191,296,487,97,46,68,83,143,565
4,3,12,3,4,0,0,1,4,0,6,6,0,0,2,4,1,7


In [14]:
single_game_data = divide_by_games_played(season_game_data)
single_game_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = round(df[col] / df['games_played'],1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].replace([np.inf, -np.inf], np.nan)


Unnamed: 0,minutes,two_made,two_attempted,three_made,three_attempted,freeThrow_made,freeThrow_attempted,offensive_reb,defensive_reb,total_rebounds,assists,steals,blocks,turnover,personal_fouls,points
0,12.1,2.0,3.7,0.0,0.0,0.9,1.8,1.2,2.2,3.4,0.5,0.3,0.5,0.7,1.5,5.0
1,23.6,3.6,8.3,0.8,2.1,1.1,1.8,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1
2,20.7,3.6,7.3,0.5,2.0,1.6,2.3,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2
3,21.9,3.2,6.3,0.4,1.3,0.9,1.5,2.6,4.0,6.6,1.3,0.6,0.9,1.1,1.9,7.6
4,4.0,1.0,1.3,0.0,0.0,0.3,1.3,0.0,2.0,2.0,0.0,0.0,0.7,1.3,0.3,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84933,4.8,0.0,1.2,0.0,1.0,0.2,0.5,0.0,1.2,1.2,0.0,0.0,0.0,0.2,1.0,0.2
84934,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
84935,4.8,0.0,1.2,0.0,1.0,0.2,0.5,0.0,1.2,1.2,0.0,0.0,0.0,0.2,1.0,0.2
84936,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Replace the orignal dataframe values with the updated single game data
renamed_df[single_game_data.columns] = single_game_data
renamed_df.head()

Unnamed: 0,year,rank,games_played,minutes,two_made,two_attempted,two_percentage,three_made,three_attempted,three_percentage,...,blocks,turnover,personal_fouls,points,assist_turnover,steals_turnover,season_type_encoded,player_encoded,team_encoded,position_encoded
0,2020,292,61,12.1,2.0,3.7,0.544,0.0,0.0,0.0,...,0.5,0.7,1.5,5.0,0.67,0.47,0,1120,15,1
1,2021,161,73,23.6,3.6,8.3,0.439,0.8,2.1,0.359,...,0.6,1.2,2.1,9.1,0.98,0.44,0,1120,28,1
2,2022,226,55,20.7,3.6,7.3,0.485,0.5,2.0,0.269,...,0.5,1.1,1.9,9.2,0.85,0.53,0,1120,28,1
3,2023,193,74,21.9,3.2,6.3,0.501,0.4,1.3,0.268,...,0.9,1.1,1.9,7.6,1.17,0.55,0,1120,20,1
4,2020,176,3,4.0,1.0,1.3,0.75,0.0,0.0,0.0,...,0.7,1.3,0.3,2.3,0.0,0.0,1,1120,15,1
