In [1]:
import pandas as pd
from UtilFunctions import format_season

In [23]:
def pre_process(seasons, data_dir="DataCollection/Player_Stats", min_minutes=15, min_games=30):
    """Load, filter and rescale per-season player stats into one DataFrame.

    For every season the file ``player_stats_{first}-{second}.csv`` is read
    from ``data_dir`` and processed as follows:

    * missing values are replaced with 0;
    * rows are kept only when ``MP > min_minutes`` and ``G >= min_games``;
    * rows whose ``Tm`` is ``"TOT"`` are removed and, when a player still has
      several rows, only the one with the most games (``G``) is kept;
    * a ``Season`` column holding the season identifier is added;
    * the columns listed in ``cols_to_drop`` are removed;
    * the columns listed in ``per_min_cols`` are rescaled to
      ``value / MP * 36`` and rounded to 2 decimals.

    Parameters
    ----------
    seasons : iterable
        Season identifiers. Each one is passed to ``format_season``, whose two
        return values fill in the CSV file name, and is stored as-is in the
        ``Season`` column.
    data_dir : str, optional
        Directory containing the per-season CSV files.
    min_minutes : float, optional
        Exclusive lower bound on the ``MP`` column.
    min_games : int, optional
        Inclusive lower bound on the ``G`` column.

    Returns
    -------
    pandas.DataFrame
        All seasons stacked in iteration order, with a fresh ``0..n-1`` index
        and without the ``Tm`` column.

    Raises
    ------
    ValueError
        If ``seasons`` yields no season at all.
    """
    # Loop-invariant column lists, built once.
    per_min_cols = ['FGA', '3PA', '2PA', 'PF', 'PTS', 'OWS', 'DWS', 'OBPM', 'DBPM', 'BPM']
    cols_to_drop = ['TRB', 'DRB', 'ORB', 'AST', 'G', 'GS', 'FT', 'FG', 'FG%', 'BLK', 'STL',
                    'WS', 'FTA', 'TOV', 'Age', '2P', '3P', 'VORP']

    frames = []
    for season in seasons:
        first_year, second_year = format_season(season)
        raw = pd.read_csv(f"{data_dir}/player_stats_{first_year}-{second_year}.csv")

        # The thresholds are applied to every row individually, i.e. before
        # the "TOT" rows are removed and before de-duplication.
        filled = raw.fillna(0)
        eligible = filled[(filled["MP"] > min_minutes) & (filled["G"] >= min_games)]

        season_df = (
            eligible
            # Most games first, so keep="first" below retains that row.
            .sort_values(by=['G'], ascending=False)
            .loc[lambda frame: frame["Tm"] != "TOT"]
            .drop_duplicates(subset="Player", keep="first")
            .assign(Season=season)
            .drop(columns=cols_to_drop)
        )

        # MP is strictly greater than min_minutes here, so with the default
        # threshold the division cannot hit zero.
        season_df[per_min_cols] = (
            season_df[per_min_cols].div(season_df['MP'], axis=0) * 36
        ).round(2)

        frames.append(season_df)

    if not frames:
        raise ValueError("pre_process() needs at least one season to process")

    # A single concat avoids re-copying the accumulated frame on every season;
    # ignore_index=True yields the fresh 0..n-1 index directly.
    combined = pd.concat(frames, ignore_index=True)
    return combined.drop(columns=['Tm'])

In [24]:
# Build the combined table for seasons 1980 through 2022 (range() excludes its end value).
df = pre_process(range(1980, 2023))

In [25]:
# Preview the combined frame; pandas elides the middle rows and columns in the rendered output.
df

Unnamed: 0,Player,Pos,MP,FGA,3PA,3P%,2PA,2P%,eFG%,FT%,...,TOV%,USG%,OWS,DWS,WS/48,OBPM,DBPM,BPM,Salary,Season
0,Kareem Abdul-Jabbar*,C,38.3,15.89,0.00,0.000,15.89,0.604,0.604,0.765,...,15.7,24.1,8.93,4.98,0.227,4.51,2.26,6.77,0.0,1980
1,Larry Bird*,PF,36.0,17.80,1.70,0.406,16.10,0.481,0.494,0.836,...,14.0,25.3,5.60,5.60,0.182,3.00,1.50,4.50,0.0,1980
2,Otis Birdsong,SG,35.2,19.33,0.41,0.278,18.82,0.511,0.508,0.694,...,9.4,26.4,5.01,3.78,0.143,2.05,0.10,2.15,0.0,1980
3,Henry Bibby,PG,24.8,11.03,0.87,0.212,10.16,0.418,0.410,0.790,...,16.4,17.7,1.89,3.05,0.079,-2.47,-0.73,-3.19,0.0,1980
4,Greg Ballard,SF,29.7,16.24,0.73,0.340,15.64,0.502,0.502,0.753,...,10.0,21.6,4.97,3.39,0.136,2.30,0.12,2.42,0.0,1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10971,Andre Iguodala,SF,19.5,7.20,4.43,0.230,2.77,0.617,0.450,0.750,...,17.7,11.4,0.92,2.03,0.125,-1.66,6.83,4.98,1669178.0,2022
10972,Vit Krejci,SG,23.0,8.77,5.17,0.327,3.60,0.522,0.503,0.864,...,14.1,12.9,0.16,0.78,0.042,-6.73,-1.57,-8.30,925258.0,2022
10973,Marvin Bagley III,PF,21.9,13.48,3.45,0.242,10.03,0.538,0.494,0.745,...,7.3,18.8,0.82,0.66,0.068,-1.81,-3.95,-5.75,11312114.0,2022
10974,Terence Davis,SG,17.9,17.90,10.06,0.329,7.84,0.542,0.515,0.818,...,10.3,25.6,0.00,1.01,0.044,-1.21,-0.80,-1.81,4000000.0,2022


In [26]:
SAMPLE_SIZE = 100  # number of distinct players to export
SAMPLE_SEED = 1    # fixed seed so the exported sample is reproducible

# Shuffle every row once with a fixed seed; everything below is derived from
# this single ordering, so re-running the cell produces the same CSV.
df_shuffled = df.sample(frac=1, random_state=SAMPLE_SEED)

# Plain random draw of SAMPLE_SIZE rows (a player may appear more than once).
df_sample = df_shuffled.head(SAMPLE_SIZE)

# Keep the first shuffled row of each player, then take the first SAMPLE_SIZE
# players. This replaces the previous top-up loop, whose extra draws were
# unseeded (not reproducible) and which could not terminate when df held fewer
# than SAMPLE_SIZE distinct players. In that case this simply yields all of them.
df_sample_unique = (
    df_shuffled
    .drop_duplicates(subset='Player')
    .head(SAMPLE_SIZE)
    .reset_index()  # keeps each row's label in df as an 'index' column
)

df_sample_unique.to_csv("DataCollection/Player_Stats/player_stats_sample.csv", index=False)

In [22]:
# Preview the de-duplicated sample.
# NOTE(review): this cell is labelled In [22], i.e. it was last executed before
# the cells above (In [23]-[26]); its saved output shows unrounded per-36 values,
# so it looks stale. Restart the kernel and run all cells to refresh it.
df_sample_unique

Unnamed: 0,index,Player,Pos,MP,FGA,3PA,3P%,2PA,2P%,eFG%,...,TOV%,USG%,OWS,DWS,WS/48,OBPM,DBPM,BPM,Salary,Season
0,2333,Sedale Threatt,PG,25.8,14.511628,0.558140,0.286,13.953488,0.529,0.525,...,13.2,21.3,4.325581,1.953488,0.106,0.279070,0.000000,0.418605,0.0,1991
1,5768,Nick Collison,C,17.0,9.105882,0.000000,0.000,9.105882,0.541,0.537,...,13.3,15.5,5.717647,2.541176,0.135,-2.752941,-0.847059,-3.600000,0.0,2005
2,10908,OG Anunoby,SF,36.0,14.500000,6.600000,0.363,7.900000,0.511,0.526,...,9.7,20.5,1.800000,1.900000,0.104,0.400000,0.100000,0.500000,0.0,2022
3,8426,Jameer Nelson,PG,32.0,12.375000,6.525000,0.348,5.850000,0.444,0.485,...,17.3,20.1,1.800000,1.350000,0.062,0.675000,-1.912500,-1.237500,0.0,2014
4,1094,Robert Parish*,C,36.1,12.864266,0.000000,0.000,12.864266,0.542,0.542,...,13.5,19.7,4.986150,3.988920,0.151,1.096953,-0.299169,0.797784,0.0,1985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3669,Tyrone Corbin,SF,19.0,10.042105,0.378947,0.083,9.473684,0.470,0.454,...,14.3,16.5,1.326316,2.084211,0.092,-2.463158,1.705263,-0.757895,0.0,1996
96,8769,Ryan Kelly,PF,23.7,8.962025,3.949367,0.336,5.164557,0.337,0.409,...,9.5,14.0,0.607595,0.455696,0.027,-4.556962,-0.455696,-5.012658,1650000.0,2015
97,10769,Reggie Bullock,SF,28.0,9.514286,7.457143,0.360,2.057143,0.550,0.542,...,7.1,13.3,1.542857,2.571429,0.080,-1.928571,0.000000,-1.800000,9536000.0,2022
98,6342,Ben Gordon,SG,33.0,17.781818,5.018182,0.413,12.763636,0.471,0.513,...,14.0,29.3,5.127273,4.254545,0.152,2.727273,-0.109091,2.618182,3862080.0,2007
