In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

# Data preparation for the All-Star classification task.
# Loads the season statistics and the All-Star rosters, converts the kept
# counting stats to per-game values, labels every player-season with an
# All-Star status (0 / 1 / 2) and produces a reproducible 60:20:20
# train / validation / test split.

# Tunable settings
MIN_YEAR = 1980      # seasons before this year are discarded
MIN_MINUTES = 300    # rows with fewer total minutes ("MP") are discarded
TRAIN_FRAC = 0.6     # share of rows used for training
VAL_FRAC = 0.2       # share of rows used for validation; the remainder is the test set
RANDOM_SEED = 42     # fixes the shuffle so Restart & Run All yields the same split

df_als = pd.read_csv("nbaallstargames.csv")  # All-Star data
df_stat = pd.read_csv("correct_data.csv", sep=";")  # Season statistics

# Keep only seasons from MIN_YEAR onwards in which at least MIN_MINUTES were played
df_stat = df_stat[(df_stat["Year"] >= MIN_YEAR) & (df_stat["MP"] >= MIN_MINUTES)]

# Delete unnecessary statistics and express the remaining counting stats per game
# NOTE(review): "Tm" is dropped here. If the statistics file holds several rows for
# the same player and year (e.g. one per team), they will share one Season key below
# and may end up in different splits -- confirm against the data.
deletedColumns = ["Column1", "Tm", "GS", "PER", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%", "blanl","OWS", "DWS","WS", "WS/48", "blank2", "OBPM", "DBPM","BPM", "VORP", "FG", "FGA", "3P", "3PA", "2P", "2PA", "eFG%", "FT", "FTA", "ORB", "DRB"]
df_stat = df_stat.drop(columns=deletedColumns)
df_stat = df_stat.assign(MinutesPlayed=df_stat["MP"].astype(int))

# New per-game column -> season-total column it is derived from.
# Insertion order determines the column order of the resulting frame.
per_game_sources = {
    "PointsPerGame": "PTS",
    "Rebounds": "TRB",
    "Assists": "AST",
    "Steals": "STL",
    "Blocks": "BLK",
    "Turnovers": "TOV",
    "PersonalFouls": "PF",
}
for per_game_name, total_name in per_game_sources.items():
    df_stat[per_game_name] = (df_stat[total_name] / df_stat["G"]).round(1)

df_stat = df_stat.assign(Age=df_stat["Age"].astype(int))
# The season totals are no longer needed once the per-game values exist
df_stat = df_stat.drop(columns=["G", "MP", *per_game_sources.values()])

# Combine player and year into a single "Season" key ("<Player>-<Year>") for the
# Season Statistics and move it to the first column. Any "*" characters in the
# player name are removed so the key can match the All-Star data.
df_stat = df_stat.assign(Season = df_stat["Player"].str.replace("*", '', regex=False).astype(str) + "-" + df_stat["Year"].astype(int).astype(str))
df_stat = df_stat.drop(columns=["Year", "Player"])
player_column = df_stat.pop("Season")
df_stat.insert(0, "Season", player_column)

# Build the same "Season" key for the All-Star data and move it to the first column
df_als = df_als.assign(Season = df_als["Player"].astype(str) + "-" + df_als["Year"].astype(int).astype(str))
df_als = df_als.drop(columns=["Player", "Year"])
player_column = df_als.pop("Season")
df_als.insert(0, "Season", player_column)

# Merge the two dataframes on Season, bringing in Role as a new column.
# The lookup is de-duplicated first: a repeated Season key on the right-hand side
# of a left merge would otherwise duplicate the matching statistics rows.
allstar_roles = df_als[["Season", "Role"]].drop_duplicates(subset="Season")
df_merged = df_stat.merge(allstar_roles, on="Season", how="left")

# Mapping the "Role" to numerical values
role_mapping = {
    "Starter": 2,
    "Reserve": 1,
    "DNP": 1  # DNP should be treated the same as "Reserve"
}

# Map the roles onto the Seasons. Seasons without a match in the All-Star data
# (Role is NaN after the left merge) and roles missing from role_mapping get 0.
df_merged["AllStarStatus"] = df_merged["Role"].map(role_mapping).fillna(0).astype(int)

# Drop the column Role because it is now described by 0, 1 or 2
df_merged = df_merged.drop(columns=["Role"])

# Shuffle (seeded, hence reproducible) and split into train, validation and test
# sets with a 60:20:20 split
df_merged = df_merged.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
train_size = int(TRAIN_FRAC * len(df_merged))
val_size = int(VAL_FRAC * len(df_merged))

train_data = df_merged.iloc[:train_size]  # First 60%
val_data = df_merged.iloc[train_size:(train_size + val_size)]  # Next 20%
test_data = df_merged.iloc[(train_size + val_size):]  # Remaining ~20% (absorbs rounding)

# Every row must land in exactly one of the three sets
assert len(train_data) + len(val_data) + len(test_data) == len(df_merged)

# Checking the heads of each set to verify data
display(train_data.head())
display(val_data.head())
display(test_data.head())

Unnamed: 0,Season,Pos,Age,TS%,3PAr,USG%,FG%,3P%,2P%,FT%,MinutesPlayed,PointsPerGame,Rebounds,Assists,Steals,Blocks,Turnovers,PersonalFouls,AllStarStatus
0,Shannon Brown-2012,SG,26,0.507,0.314,22.8,0.42,0.362,0.446,0.808,1400,11.0,2.7,1.2,0.7,0.3,1.1,1.1,0
1,Maurice Cheeks-1983,PG,26,0.582,0.008,16.6,0.542,0.167,0.545,0.754,2465,12.5,2.6,6.9,2.3,0.4,2.3,2.3,2
2,Gary Neal-2014,SG,29,0.53,0.382,23.8,0.41,0.378,0.43,0.895,1114,10.5,1.7,1.6,0.3,0.0,1.2,1.2,0
3,Bill Cartwright-1980,C,22,0.608,0.0,20.6,0.547,,0.547,0.797,3150,21.7,8.9,2.0,0.6,1.2,2.7,3.4,1
4,Ed Davis-2013,C-PF,23,0.561,0.0,17.6,0.539,,0.539,0.617,1631,7.7,5.7,0.8,0.5,1.0,0.8,2.4,0


Unnamed: 0,Season,Pos,Age,TS%,3PAr,USG%,FG%,3P%,2P%,FT%,MinutesPlayed,PointsPerGame,Rebounds,Assists,Steals,Blocks,Turnovers,PersonalFouls,AllStarStatus
8586,Lynn Greer-2007,SG,27,0.538,0.369,18.1,0.433,0.346,0.483,0.844,432,4.1,0.7,1.3,0.4,0.0,0.4,0.8,0
8587,Purvis Short-1984,SF,26,0.529,0.048,25.6,0.473,0.306,0.482,0.793,2945,22.8,5.5,3.1,1.3,0.1,2.9,3.2,0
8588,Roger Phegley-1981,SG,24,0.545,0.029,22.1,0.491,0.286,0.497,0.839,2269,14.4,3.0,2.2,0.8,0.2,2.0,3.2,0
8589,Chris Bosh-2008,C,23,0.588,0.024,27.7,0.494,0.4,0.496,0.844,2425,22.3,8.7,2.6,0.9,1.0,2.3,2.3,2
8590,Bryon Russell-1995,SF,24,0.507,0.185,16.9,0.437,0.295,0.469,0.667,860,4.5,2.2,0.5,0.8,0.2,0.7,1.6,0


Unnamed: 0,Season,Pos,Age,TS%,3PAr,USG%,FG%,3P%,2P%,FT%,MinutesPlayed,PointsPerGame,Rebounds,Assists,Steals,Blocks,Turnovers,PersonalFouls,AllStarStatus
11448,Andrew Lang-1997,C,30,0.498,0.0,12.2,0.464,,0.464,0.721,1194,5.3,5.3,0.5,0.5,0.9,0.8,2.7,0
11449,John Wall-2014,PG,23,0.524,0.23,27.4,0.433,0.351,0.458,0.805,2980,19.3,4.1,8.8,1.8,0.5,3.6,2.7,1
11450,Kevin Johnson-1995,PG,28,0.561,0.05,24.2,0.47,0.154,0.487,0.81,1352,15.5,2.4,7.7,1.0,0.4,2.2,1.9,0
11451,Jerome Kersey-1989,SF,26,0.511,0.018,20.8,0.469,0.286,0.472,0.694,2716,17.5,8.3,3.2,1.8,1.1,2.2,3.6,0
11452,Pooh Richardson-1991,PG,24,0.492,0.095,21.8,0.47,0.328,0.485,0.539,3154,17.1,3.5,9.0,1.6,0.2,2.1,1.4,0
