In [5]:
import pandas as pd
import numpy as np

# Load the two inputs: the All-Star selections and the season statistics
# (the statistics file is semicolon-delimited).
df_als = pd.read_csv("nbaallstargames.csv")
df_stat = pd.read_csv("correct_data.csv", sep=";")

# Keep only rows from 1980 onwards with at least 300 minutes played
df_stat = df_stat[(df_stat["Year"] >= 1980) & (df_stat["MP"] >= 300)]

# Remove the statistics that are not needed.
# NOTE(review): "blanl" presumably mirrors the header spelling in the CSV - confirm before renaming.
deletedColumns = [
    "Column1", "Tm", "GS", "PER", "FTr",
    "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%",
    "blanl", "OWS", "DWS", "WS", "WS/48",
    "blank2", "OBPM", "DBPM", "BPM", "VORP",
    "FG", "FGA", "3P", "3PA", "2P", "2PA", "eFG%",
    "FT", "FTA", "ORB", "DRB",
]
df_stat = df_stat.drop(columns=deletedColumns)

# Minutes played are kept as a whole-number total under a descriptive name
df_stat = df_stat.assign(MinutesPlayed=df_stat["MP"].astype(int))

# Turn the totals into per-game values (total / games), rounded to one decimal.
# The mapping order determines the order of the new columns.
per_game_columns = {
    "PointsPerGame": "PTS",
    "Rebounds": "TRB",
    "Assists": "AST",
    "Steals": "STL",
    "Blocks": "BLK",
    "Turnovers": "TOV",
    "PersonalFouls": "PF",
}
for per_game_name, total_name in per_game_columns.items():
    df_stat[per_game_name] = (df_stat[total_name] / df_stat["G"]).round(1)

# Age is stored as an integer
df_stat = df_stat.assign(Age=df_stat["Age"].astype(int))

# The totals, the games count and the raw minutes column are no longer needed
df_stat = df_stat.drop(columns=["G", *per_game_columns.values(), "MP"])

# Combine player name and year into a single "<Player>-<Year>" key for the Season
# Statistics and place it in the first column. Any "*" in the player name is removed.
stat_names = df_stat["Player"].str.replace("*", '', regex=False).astype(str)
stat_years = df_stat["Year"].astype(int).astype(str)
player_column = (stat_names + "-" + stat_years).rename("Season")

# The key replaces the two columns it was built from
df_stat = df_stat.drop(columns=["Year", "Player"])
df_stat.insert(0, "Season", player_column)

# Combine player name and year into a single "<Player>-<Year>" key for the All-Star
# data and place it in the first column.
als_names = df_als["Player"].astype(str)
als_years = df_als["Year"].astype(int).astype(str)
player_column = (als_names + "-" + als_years).rename("Season")

# The key replaces the two columns it was built from
df_als = df_als.drop(columns=["Player", "Year"])
df_als.insert(0, "Season", player_column)

# Attach the All-Star role of each Season key to the statistics and encode it as the
# new column AllStarStatus: 2 = Starter, 1 = Reserve or DNP, 0 = not in the All-Star data.
role_mapping = {
    "Starter": 2,
    "Reserve": 1,
    "DNP": 1,  # DNP should be treated the same as "Reserve"
}

# Encode the roles on the All-Star side first and collapse to one row per Season key.
# A left merge repeats a statistics row once for every matching All-Star row, so a
# duplicated key in the All-Star data would otherwise duplicate rows in df_merged.
# When a key occurs more than once, the highest status is kept. Roles that are not
# in role_mapping count as 0.
als_status = df_als[["Season", "Role"]].assign(
    AllStarStatus=df_als["Role"].map(role_mapping).fillna(0).astype(int)
)
als_status = als_status.groupby("Season", as_index=False)["AllStarStatus"].max()

# Left merge keeps every statistics row; keys absent from the All-Star data get 0
df_merged = df_stat.merge(als_status, on="Season", how="left")
df_merged["AllStarStatus"] = df_merged["AllStarStatus"].fillna(0).astype(int)

df_merged.sample(10)

Unnamed: 0,Season,Pos,Age,TS%,3PAr,USG%,FG%,3P%,2P%,FT%,MinutesPlayed,PointsPerGame,Rebounds,Assists,Steals,Blocks,Turnovers,PersonalFouls,AllStarStatus
338,John Wall-2017,PG,26,0.541,0.19,30.6,0.451,0.327,0.48,0.801,2836,23.1,4.2,10.7,2.0,0.6,4.1,1.9,1
2462,Steve Novak-2012,PF,28,0.684,0.837,16.2,0.478,0.472,0.509,0.846,1020,8.8,1.9,0.2,0.3,0.2,0.4,1.1,0
438,Dejounte Murray-2017,PG,20,0.503,0.198,23.5,0.431,0.391,0.441,0.7,322,3.4,1.1,1.3,0.2,0.2,1.0,0.8,0
2420,Ray Allen-2012,SG,36,0.607,0.475,18.6,0.458,0.453,0.463,0.915,1565,14.2,3.1,2.4,1.1,0.2,1.5,1.8,0
13484,Ricky Sobers-1982,SG,29,0.515,0.095,24.4,0.453,0.25,0.474,0.768,1938,11.8,1.8,3.8,0.9,0.2,2.7,3.0,0
6970,Cherokee Parks-2001,C,28,0.506,0.025,14.7,0.492,0.0,0.504,0.704,876,4.8,3.6,0.8,0.4,0.5,0.6,2.0,0
5449,Matt Harpring-2005,SF,28,0.549,0.05,20.1,0.489,0.209,0.504,0.778,2584,14.0,6.2,1.8,0.9,0.2,1.7,3.2,0
10620,Tony Smith-1991,SG,22,0.477,0.032,19.7,0.441,0.0,0.455,0.702,695,3.7,1.1,2.1,0.4,0.2,1.1,1.2,0
12824,Stewart Granger-1984,PG,22,0.489,0.058,17.7,0.429,0.308,0.437,0.757,738,4.5,1.0,2.4,0.4,0.0,1.0,1.7,0
3789,Tyrus Thomas-2009,PF,22,0.525,0.013,19.4,0.451,0.333,0.453,0.783,2175,10.8,6.4,1.0,1.2,1.9,1.6,2.8,0
