# NBA Production Predictor - AI Model
This program is designed to take in 5, 7, or 9 players on each team and compare them based on historical stats and our analysis on player production. Essentially the model will determine which team is stronger based past production. Our hopes is that the model can take into account how each player performed against certaint teams and other players in their perspective position, which may be also be a future goal of our model. Thank you for viewing and I hope you enjoy using our NBA Game Predictor Model. 

https://www.kaggle.com/datasets/gonzalogigena/nba-all-time-stats

## Import project dependencies

In [50]:
# Import pandas to read and manipulate data
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from utils import *

In [51]:
# Using pandas read in each csv project file
regular_season_data = pd.read_csv('data/Regular_Season.csv')
playoff_data = pd.read_csv('data/Playoffs.csv')

# Read in the player positions dataframe

In [52]:
player_position = pd.read_csv('data/advanced_stats.csv')
player_position.head()

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Player Reference,Season,League
0,Precious Achiuwa,PF-C,24.0,TOT,53,1191.0,14.3,0.536,0.223,0.231,...,1.5,2.3,0.094,-1.7,-0.1,-1.8,0.1,achiupr01,2023-2024,NBA
1,Precious Achiuwa,C,24.0,TOR,25,437.0,15.0,0.512,0.276,0.247,...,0.5,0.5,0.056,-1.4,-0.1,-1.5,0.1,achiupr01,2023-2024,NBA
2,Precious Achiuwa,PF,24.0,NYK,28,754.0,14.0,0.558,0.175,0.217,...,1.0,1.8,0.117,-1.8,-0.2,-2.0,0.0,achiupr01,2023-2024,NBA
3,Bam Adebayo,C,26.0,MIA,49,1697.0,19.6,0.571,0.016,0.415,...,2.9,4.7,0.132,0.4,1.6,2.0,1.7,adebaba01,2023-2024,NBA
4,Ochai Agbaji,SG,23.0,TOT,58,1105.0,8.1,0.536,0.565,0.095,...,0.5,0.6,0.024,-2.7,-0.4,-3.2,-0.3,agbajoc01,2023-2024,NBA


In [53]:
player_position = player_position[['Player', 'Pos']]
player_position.head()

Unnamed: 0,Player,Pos
0,Precious Achiuwa,PF-C
1,Precious Achiuwa,C
2,Precious Achiuwa,PF
3,Bam Adebayo,C
4,Ochai Agbaji,SG


# Function to reduce the amount of positions

#### Drop Forward and Guards from 'Position' column

In [54]:
# Split the column after the - to get the primary position of each player
player_position['Pos'] = player_position['Pos'].str.split('-').str[0]
# Remove the forward and guard position from the Pos row
player_position = player_position[~player_position['Pos'].isin(['F', 'G'])]
# Rename the player column to all caps for merge
player_position = player_position.rename(columns={'Player' : 'PLAYER'})

player_position.head()  

Unnamed: 0,PLAYER,Pos
0,Precious Achiuwa,PF
1,Precious Achiuwa,C
2,Precious Achiuwa,PF
3,Bam Adebayo,C
4,Ochai Agbaji,SG


### 1. Combine the two dataframes 'regular_season_data' and 'playoff_data'

In [55]:
"""
Function Name: combine_dataframes

Parameters: Takes in a [list of dataframes]

Output: A combined dataframe

Description: This function takes a list of DataFrames, reformats each, and 
combines them into one DataFrame along axis 0.

"""
# Put dataframes into a list
list_of_dataframes = [regular_season_data, playoff_data]

# Call 'combine_dataframes' function to combine the list of dataframes
combined_stats = combine_dataframes(list_of_dataframes)
combined_stats.head()

Unnamed: 0,year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,AST_TOV,STL_TOV
0,2012,Regular Season,201142,1,Kevin Durant,1610612760,OKC,81,3119,731,...,594,640,374,116,105,280,143,2280,1.34,0.41
1,2012,Regular Season,977,2,Kobe Bryant,1610612747,LAL,78,3013,738,...,367,433,469,106,25,287,173,2133,1.63,0.37
2,2012,Regular Season,2544,3,LeBron James,1610612748,MIA,76,2877,765,...,513,610,551,129,67,226,110,2036,2.44,0.57
3,2012,Regular Season,201935,4,James Harden,1610612745,HOU,78,2985,585,...,317,379,455,142,38,295,178,2023,1.54,0.48
4,2012,Regular Season,2546,5,Carmelo Anthony,1610612752,NYK,67,2482,669,...,326,460,171,52,32,175,205,1920,0.98,0.3


# Merge the combined_stats dataframe with the player_position dataframe

In [56]:
merged_df = pd.merge(player_position, combined_stats, on='PLAYER')
merged_df

Unnamed: 0,PLAYER,Pos,year,Season_type,PLAYER_ID,RANK,TEAM_ID,TEAM,GP,MIN,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,AST_TOV,STL_TOV
0,Precious Achiuwa,PF,2020,Regular Season,1630173,292,1610612748,MIA,61,737,...,135,208,29,20,28,43,91,304,0.67,0.47
1,Precious Achiuwa,PF,2021,Regular Season,1630173,161,1610612761,TOR,73,1725,...,327,473,82,37,41,84,151,664,0.98,0.44
2,Precious Achiuwa,PF,2022,Regular Season,1630173,226,1610612761,TOR,55,1141,...,228,328,50,31,30,59,102,508,0.85,0.53
3,Precious Achiuwa,PF,2023,Regular Season,1630173,193,1610612752,NYK,74,1624,...,296,487,97,46,68,83,143,565,1.17,0.55
4,Precious Achiuwa,PF,2020,Playoffs,1630173,176,1610612748,MIA,3,12,...,6,6,0,0,2,4,1,7,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84933,George King,PG,2021,Regular Season,1628994,580,1610612742,DAL,4,19,...,5,5,0,0,0,1,4,1,0.00,0.00
84934,George King,PG,2018,Regular Season,1628994,521,1610612756,PHX,1,6,...,1,1,0,0,0,0,0,0,0.00,0.00
84935,George King,PG,2021,Regular Season,1628994,580,1610612742,DAL,4,19,...,5,5,0,0,0,1,4,1,0.00,0.00
84936,George King,PG,2018,Regular Season,1628994,521,1610612756,PHX,1,6,...,1,1,0,0,0,0,0,0,0.00,0.00


In [57]:
len(merged_df['PLAYER'].unique())

1430

### 2. Encode categorical columns

In [58]:
"""
Function Name: encode_categorical_columns

Parameters: Takes in a dataframe

Input: A dataframe
Output: Encodes the following columns using LabelEncoder() and OneHotEncoder():
    LabelEncoder():
        # Encode the `Season_type` columns, 0 for Regular Season games and 1 for Playoff games
        # Initialize LabelEncoder for players and teams column
        # Use LabelEncoder to encode the `PLAYER`, `TEAM`, `POSITION` columns
        # Save mappings to decode later
        # Drop original player, team, and position columns
        # Return the updated dataframe, player mappings, and  team mappings

Description: Encodes categorical columns, like Season_type, PLAYER, position, and TEAM, using 
LabelEncoder. Also, it returns mappings for decoding later.

"""

# Call the 'convert_to_single_game' function to convert dataframe
encoded_df, player_mapping, team_mapping, position_mapping = encode_categorical_columns(merged_df)
encoded_df.head()

Unnamed: 0,year,RANK,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,...,AST,STL,BLK,TOV,PF,PTS,season_type_encoded,player_encoded,team_encoded,position_encoded
0,2020,292,61,737,124,228,0.544,0,1,0.0,...,29,20,28,43,91,304,0,1120,15,1
1,2021,161,73,1725,265,603,0.439,56,156,0.359,...,82,37,41,84,151,664,0,1120,28,1
2,2022,226,55,1141,196,404,0.485,29,108,0.269,...,50,31,30,59,102,508,0,1120,28,1
3,2023,193,74,1624,235,469,0.501,26,97,0.268,...,97,46,68,83,143,565,0,1120,20,1
4,2020,176,3,12,3,4,0.75,0,0,0.0,...,0,0,2,4,1,7,1,1120,15,1


In [59]:
encoded_df.columns

Index(['year', 'RANK', 'GP', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'season_type_encoded', 'player_encoded',
       'team_encoded', 'position_encoded'],
      dtype='object')

#### 3. Rename Columns

In [60]:
# Rename columns to clean dataframe
renamed_df = encoded_df.rename(columns={
    'RANK': 'rank',
    'GP' : 'games_played',
    'MIN' : 'minutes',
    'FGM' : 'two_made',
    'FGA' : 'two_attempted',
    'FG_PCT' : 'two_percentage',
    'FG3M' : 'three_made',
    'FG3A' : 'three_attempted',
    'FG3_PCT' : 'three_percentage',
    'FTM' : 'freeThrow_made',
    'FTA' : 'freeThrow_attempted',
    'FT_PCT' : 'freeThrow_percentage',
    'OREB' : 'offensive_reb',
    'DREB' : 'defensive_reb',
    'REB' : 'total_rebounds',
    'AST' : 'assists',
    'STL' : 'steals',
    'BLK' : 'blocks',
    'TOV' : 'turnover',
    'PF' : 'personal_fouls',
    'PTS' : 'points',
    'AST_TOV' : 'assist_turnover',
    'STL_TOV' : 'steals_turnover',
})
renamed_df.head()

Unnamed: 0,year,rank,games_played,minutes,two_made,two_attempted,two_percentage,three_made,three_attempted,three_percentage,...,assists,steals,blocks,turnover,personal_fouls,points,season_type_encoded,player_encoded,team_encoded,position_encoded
0,2020,292,61,737,124,228,0.544,0,1,0.0,...,29,20,28,43,91,304,0,1120,15,1
1,2021,161,73,1725,265,603,0.439,56,156,0.359,...,82,37,41,84,151,664,0,1120,28,1
2,2022,226,55,1141,196,404,0.485,29,108,0.269,...,50,31,30,59,102,508,0,1120,28,1
3,2023,193,74,1624,235,469,0.501,26,97,0.268,...,97,46,68,83,143,565,0,1120,20,1
4,2020,176,3,12,3,4,0.75,0,0,0.0,...,0,0,2,4,1,7,1,1120,15,1


In [61]:
renamed_df.columns

Index(['year', 'rank', 'games_played', 'minutes', 'two_made', 'two_attempted',
       'two_percentage', 'three_made', 'three_attempted', 'three_percentage',
       'freeThrow_made', 'freeThrow_attempted', 'freeThrow_percentage',
       'offensive_reb', 'defensive_reb', 'total_rebounds', 'assists', 'steals',
       'blocks', 'turnover', 'personal_fouls', 'points', 'season_type_encoded',
       'player_encoded', 'team_encoded', 'position_encoded'],
      dtype='object')

#### 4. Change Dataframe To Per Game Data

In [62]:
# Selet the statistical game data columns
season_game_data = renamed_df[['games_played', 'minutes', 'two_made', 'two_attempted', 'three_made', 'three_attempted', 'freeThrow_made', 'freeThrow_attempted', 'offensive_reb', 'defensive_reb', 'total_rebounds', 'assists', 'steals','blocks', 'turnover', 'personal_fouls', 'points']]
season_game_data.head()

Unnamed: 0,games_played,minutes,two_made,two_attempted,three_made,three_attempted,freeThrow_made,freeThrow_attempted,offensive_reb,defensive_reb,total_rebounds,assists,steals,blocks,turnover,personal_fouls,points
0,61,737,124,228,0,1,56,110,73,135,208,29,20,28,43,91,304
1,73,1725,265,603,56,156,78,131,146,327,473,82,37,41,84,151,664
2,55,1141,196,404,29,108,87,124,100,228,328,50,31,30,59,102,508
3,74,1624,235,469,26,97,69,112,191,296,487,97,46,68,83,143,565
4,3,12,3,4,0,0,1,4,0,6,6,0,0,2,4,1,7


In [63]:
single_game_data = divide_by_games_played(season_game_data)
single_game_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = round(df[col] / df['games_played'],1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].replace([np.inf, -np.inf], np.nan)


Unnamed: 0,minutes,two_made,two_attempted,three_made,three_attempted,freeThrow_made,freeThrow_attempted,offensive_reb,defensive_reb,total_rebounds,assists,steals,blocks,turnover,personal_fouls,points
0,12.1,2.0,3.7,0.0,0.0,0.9,1.8,1.2,2.2,3.4,0.5,0.3,0.5,0.7,1.5,5.0
1,23.6,3.6,8.3,0.8,2.1,1.1,1.8,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1
2,20.7,3.6,7.3,0.5,2.0,1.6,2.3,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2
3,21.9,3.2,6.3,0.4,1.3,0.9,1.5,2.6,4.0,6.6,1.3,0.6,0.9,1.1,1.9,7.6
4,4.0,1.0,1.3,0.0,0.0,0.3,1.3,0.0,2.0,2.0,0.0,0.0,0.7,1.3,0.3,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84933,4.8,0.0,1.2,0.0,1.0,0.2,0.5,0.0,1.2,1.2,0.0,0.0,0.0,0.2,1.0,0.2
84934,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
84935,4.8,0.0,1.2,0.0,1.0,0.2,0.5,0.0,1.2,1.2,0.0,0.0,0.0,0.2,1.0,0.2
84936,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# Replace the orignal dataframe values with the updated single game data
renamed_df[single_game_data.columns] = single_game_data
renamed_df.head()

final_df = renamed_df.copy()

# Connect To MongoDb and Create A Database

In [65]:
from pymongo import MongoClient

# Connect to MongoDB (adjust the URI as needed for your setup)
client = MongoClient("mongodb://localhost:27017/")

# Create or connect to the nba_analysis database
db = client["nba_stats"]


In [66]:
# Convert the DataFrame into a list of dictionaries (MongoDB Format)
data = final_df.to_dict("records")


#### Create collection for `player_stats` and `metadata`

In [67]:
player_stats_collection = db["player_stats"]  # Collection for player data
metadata_collection = db["metadata"]  # Collection for metadata (context)


#### Insert data into `player_stats_collection`

In [68]:
player_stats_collection.insert_many(data)
print("Data inserted into player_stats_collection")

Data inserted into player_stats_collection


#### Define and Insert Metadata For Context

In [69]:
metadata_entries = [
    {"field": "year", "description": "The year in which the stats were recorded", "example": 2023},
    {"field": "rank", "description": "Player's rank based on performance metrics", "example": 1},
    {"field": "games_played", "description": "Total games played by the player in the season", "example": 82},
    {"field": "minutes", "description": "Total minutes played by the player in the season", "example": 2400},
    {"field": "two_made", "description": "Total two-point field goals made by the player", "example": 300},
    {"field": "two_attempted", "description": "Total two-point field goals attempted by the player", "example": 600},
    {"field": "three_made", "description": "Total three-point field goals made by the player", "example": 150},
    {"field": "three_attempted", "description": "Total three-point field goals attempted by the player", "example": 400},
    {"field": "freeThrow_made", "description": "Total free throws made by the player", "example": 200},
    {"field": "freeThrow_attempted", "description": "Total free throws attempted by the player", "example": 250},
    {"field": "offensive_reb", "description": "Total offensive rebounds by the player", "example": 100},
    {"field": "defensive_reb", "description": "Total defensive rebounds by the player", "example": 300},
    {"field": "rebounds", "description": "Total rebounds (offensive + defensive) by the player", "example": 400},
    {"field": "assists", "description": "Total assists by the player in the season", "example": 250},
    {"field": "steals", "description": "Total steals by the player in the season", "example": 80},
    {"field": "blocks", "description": "Total blocks by the player in the season", "example": 50},
    {"field": "turnovers", "description": "Total turnovers by the player in the season", "example": 100},
    {"field": "personal_fouls", "description": "Total personal fouls committed by the player", "example": 150},
    {"field": "points", "description": "Total points scored by the player in the season", "example": 2000},
    {"field": "season_type", "description": "Type of season (e.g., Regular Season, Playoffs)", "example": "Regular Season"}
]


#### Insert metadata into `metdata_collection`

In [70]:
metadata_collection.insert_many(metadata_entries)
print("Metadata inserted into metadata_collection")


Metadata inserted into metadata_collection


#### Set up database 

In [71]:
# db = client["nba_database"]
# metadata_collection = db["metadata"]
# player_stats_collection = db["player_stats"]

#### Initialize the LLM Model using Llama

In [77]:
from transformers import cached_file
import os
import shutil

# Clear the Hugging Face model cache
cache_dir = os.path.expanduser('~/.cache/huggingface/transformers')  # Path to the cache directory
model_name = "meta-llama/Llama-3.2-3B"

# Check if the model folder exists in the cache directory and delete it
model_folder = os.path.join(cache_dir, model_name)
if os.path.exists(model_folder):
    shutil.rmtree(model_folder)
    print(f"Deleted model cache for {model_name}")
else:
    print(f"No cache found for {model_name}")


ImportError: cannot import name 'cached_file' from 'transformers' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/transformers/__init__.py)

In [None]:
import torch
from transformers import pipeline

access_token = ''
model_id = "meta-llama/Llama-3.2-3B"

llm_model = pipeline(
    "text-generation", 
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    use_auth_token = access_token
    )

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B.
403 Client Error. (Request ID: Root=1-673e7149-5d3bcd183f2168aa1eb97067;69436937-8564-4f04-b0cb-a78f3c5e3662)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.2-3B to ask for access.