In [1]:
import pandas as pd
import json


### Arranging the Players DataFrame

In [2]:
df_player = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/data_all_players.csv')

### Extracting players info from dictionnary into separate columns

In [4]:
data = df_player
df_player = pd.DataFrame(data)

# Apply lambda function to create a new 'id_column'
df_player['player_team_id'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))
df_player['player_team_abb'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('abbreviation'))
df_player['player_team_city'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('city'))
df_player['player_team_conf'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('conference'))
df_player['player_team_div'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('division'))
df_player['player_team_full_name'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('full_name'))
df_player['player_team_name'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('name'))

In [5]:
df_player.drop(columns=["team", "weight_pounds", "height_feet", "height_inches"], inplace=True)

### Arranging the Game DataFrame

In [6]:
df_games = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/games_data.csv')

In [7]:
### Extracting game info from dictionnary into separate columns

In [8]:
data_games = df_games
df_games = pd.DataFrame(data_games)

# Apply lambda function to create a new 'id_column'
df_games['new_home_team_id'] = df_games['id'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))
df_games['new_vis_team_id'] = df_games['time'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))


In [9]:
df_games.drop(columns=["id", "time"], inplace=True)

In [10]:
df_games.rename(columns={"date": "score_home",
                        "postseason": "score_vis"}, inplace=True)

In [11]:
# Managing a MultiIndex in the DataFrame
df_games = df_games.reset_index()
df_games.rename(columns={"level_0":"game_id",
                        "level_1": "game_date"}, inplace=True)

### Arranging the Stats DataFrame

In [12]:
import ast

In [13]:
df_stats = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/all_stats.csv')

In [14]:
# Transform columns with lists (dictionnaries looking like lists) into dictionaries
df_stats['game'] = df_stats['game'].apply(ast.literal_eval)
df_stats['team'] = df_stats['team'].apply(ast.literal_eval)
df_stats['player'] = df_stats['player'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else None)

In [15]:
# Extract game_id, player_id and team_id from the columns
df_stats['game_id']=df_stats['game'].apply(lambda x : x['id'] if 'id' in x else None)
df_stats['player_id'] = df_stats['player'].apply(lambda x: int(x.get('id')) if pd.notna(x) and 'id' in x and pd.notna(x.get('id')) else None).astype('Int64')
df_stats['team_id'] = df_stats['team'].apply(lambda x: int(x.get('id')) if pd.notna(x) and 'id' in x and pd.notna(x.get('id')) else None).astype('Int64')

### Extracting info from dictionnary into separate columns

In [16]:
df_stats['first_name'] = df_stats['player'].apply(lambda x: x.get('first_name') if pd.notna(x) and 'first_name' in x else None)
df_stats['last_name'] = df_stats['player'].apply(lambda x: x.get('last_name') if pd.notna(x) and 'last_name' in x else None)
df_stats['position'] = df_stats['player'].apply(lambda x: x.get('position') if pd.notna(x) and 'position' in x else None)
df_stats['abbreviation'] = df_stats['team'].apply(lambda x: x.get('abbreviation') if pd.notna(x) and 'abbreviation' in x else None)

### Remove ambiguous team id in player's data dictionnnary

In [17]:
def remove_key_from_dict(d, key):
    if isinstance(d, dict):
        d.pop(key, None)
    return d

key_to_remove = 'team_id'
df_stats['player'] = df_stats['player'].apply(lambda x: remove_key_from_dict(x, key_to_remove))

### Merging DataFrames together

In [18]:
df = df_games.merge(df_stats, how='left', on='game_id')

In [19]:
df_player.rename(columns={"id": "player_id"}, inplace=True)
df_player.player_id = df_player.player_id.astype('Int64')

In [20]:
df.rename(columns={"new_home_team_id":"home_team_id",
                  "new_vis_team_id": "vis_team_id",
                  })

Unnamed: 0,game_id,game_date,score_home,home_team_id,visitor_team_id,season,period,status,score_vis,home_team_id.1,...,reb,stl,team,turnover,player_id,team_id,first_name,last_name,position,abbreviation
0,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,2.0,2.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",3.0,33,4,Nicolas,Batum,F-G,CHA
1,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,2.0,1.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",0.0,482,4,Marvin,Williams,F,CHA
2,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,5.0,0.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",1.0,48,4,Bismack,Biyombo,C,CHA
3,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,5.0,0.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",2.0,267,4,Jeremy,Lamb,G,CHA
4,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,1.0,1.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",1.0,465,4,Kemba,Walker,G,CHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235362,1038294,2024-02-03,117,4,False,2023,Final,Final,129,7,...,0.0,0.0,"{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil...",0.0,284,17,Robin,Lopez,C,MIL
235363,1038294,2024-02-03,117,4,False,2023,Final,Final,129,7,...,7.0,2.0,"{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil...",1.0,38,17,Malik,Beasley,G,MIL
235364,1038294,2024-02-03,117,4,False,2023,Final,Final,129,7,...,3.0,0.0,"{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil...",6.0,278,17,Damian,Lillard,G,MIL
235365,1038294,2024-02-03,117,4,False,2023,Final,Final,129,7,...,2.0,2.0,"{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil...",0.0,112,17,Jae,Crowder,F,MIL


### Re-arranging columns in the Dataframe

In [21]:
best_column_order = ['game_id', 'game_date', "season", 'new_home_team_id', 'new_vis_team_id',
                    'score_home','score_vis', 'team_id', 'abbreviation',
                     'player_id', 'position', 'first_name', 'last_name',
                      'pts', 'reb', 'stl', 
         'ast', 'blk','oreb', 'dreb', 'fg3_pct', 'fg3a',
       'fg3m', 'fg_pct', 'fgm','fga',  'ft_pct', 'ftm', 'fta',  
                     'turnover', 'pf', 'min','home_team_id', 'visitor_team_id', 
                     'period', 'status', 'id', 'game', 'player', 'team']

df = df[best_column_order]

In [22]:
df.drop(columns=["status", "period"])

Unnamed: 0,game_id,game_date,season,new_home_team_id,new_vis_team_id,score_home,score_vis,team_id,abbreviation,player_id,...,fta,turnover,pf,min,home_team_id,visitor_team_id,id,game,player,team
0,47179,2019-01-30,2018,2,4,126,94,4,CHA,33,...,0.0,3.0,3.0,26:38,4,False,1076654.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 33, 'first_name': 'Nicolas', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
1,47179,2019-01-30,2018,2,4,126,94,4,CHA,482,...,0.0,0.0,0.0,27:23,4,False,1076655.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 482, 'first_name': 'Marvin', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
2,47179,2019-01-30,2018,2,4,126,94,4,CHA,48,...,0.0,1.0,1.0,13:34,4,False,1076656.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 48, 'first_name': 'Bismack', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
3,47179,2019-01-30,2018,2,4,126,94,4,CHA,267,...,0.0,2.0,2.0,23:10,4,False,1076657.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 267, 'first_name': 'Jeremy', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
4,47179,2019-01-30,2018,2,4,126,94,4,CHA,465,...,1.0,1.0,0.0,26:31,4,False,1076658.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 465, 'first_name': 'Kemba', 'height_fee...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235362,1038294,2024-02-03,2023,7,17,117,129,17,MIL,284,...,0.0,0.0,0.0,04,4,False,14391931.0,"{'id': 1038294, 'date': '2024-02-03', 'home_te...","{'id': 284, 'first_name': 'Robin', 'height_fee...","{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil..."
235363,1038294,2024-02-03,2023,7,17,117,129,17,MIL,38,...,2.0,1.0,1.0,33,4,False,14391932.0,"{'id': 1038294, 'date': '2024-02-03', 'home_te...","{'id': 38, 'first_name': 'Malik', 'height_feet...","{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil..."
235364,1038294,2024-02-03,2023,7,17,117,129,17,MIL,278,...,6.0,6.0,3.0,38,4,False,14391933.0,"{'id': 1038294, 'date': '2024-02-03', 'home_te...","{'id': 278, 'first_name': 'Damian', 'height_fe...","{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil..."
235365,1038294,2024-02-03,2023,7,17,117,129,17,MIL,112,...,0.0,0.0,3.0,21,4,False,14391937.0,"{'id': 1038294, 'date': '2024-02-03', 'home_te...","{'id': 112, 'first_name': 'Jae', 'height_feet'...","{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil..."


### Creating Fantasy score

In [24]:
df["fantasy_bonus"] = df.apply(lambda row: row.loc['pts'] + row.loc['reb'] +row.loc['ast']+row.loc['stl']+ row.loc['fgm'] + row.loc['fg3m'] + row.loc['ftm'], axis=1)
df["fantasy_malus"] = df.apply(lambda row: row.loc['turnover'] + (row.loc["fga"] - row.loc["fgm"]) + (row.loc["fg3a"] - row.loc["fg3m"]) + (row.loc["fta"] - row.loc["ftm"]), axis=1)
df["fantasy_score"] = df["fantasy_bonus"] - df["fantasy_malus"]

### Creating an impact table - In progress

In [42]:
pd.set_option('display.max_columns', None)

In [44]:
columns_to_keep = df.player_id,df.game_id,df.season df_game_date ?
intertable = df.iloc[1:100]

In [47]:
intertable.groupby(by="player_id").mean()

Unnamed: 0_level_0,game_id,season,new_home_team_id,new_vis_team_id,score_home,score_vis,team_id,pts,reb,stl,ast,blk,oreb,dreb,fg3_pct,fg3a,fg3m,fg_pct,fgm,fga,ft_pct,ftm,fta,turnover,pf,home_team_id,visitor_team_id,id,fantasy_bonus,fantasy_malus,fantasy_score
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
23,47179.0,2018.0,2.0,4.0,126.000000,94.000000,4.0,5.000000,4.000000,0.0,0.000000,1.0,1.000000,3.000000,0.500,2.0,1.0,0.500000,2.000000,4.000000,0.000,0.0,0.0,0.000000,2.0,4.0,0.0,1.076664e+06,12.0,3.000000,9.000000
31,48739.0,2018.0,23.0,8.0,117.000000,110.000000,8.0,14.000000,8.000000,0.0,7.000000,0.0,2.000000,6.000000,0.000,1.0,0.0,0.375000,6.000000,16.000000,1.000,2.0,2.0,2.000000,4.0,4.0,0.0,1.115469e+06,37.0,13.000000,24.000000
35,47179.0,2018.0,2.0,4.0,126.000000,94.000000,2.0,2.000000,1.000000,0.0,1.000000,0.0,0.000000,1.000000,0.000,0.0,0.0,0.500000,1.000000,2.000000,0.000,0.0,0.0,0.000000,0.0,4.0,0.0,1.076674e+06,5.0,1.000000,4.000000
38,48739.0,2018.0,23.0,8.0,117.000000,110.000000,8.0,12.000000,3.000000,0.0,0.000000,0.0,0.000000,3.000000,0.333,6.0,2.0,0.455000,5.000000,11.000000,,0.0,0.0,2.000000,2.0,4.0,0.0,1.115472e+06,22.0,12.000000,10.000000
45,48751.0,2018.0,2.0,13.0,112.000000,123.000000,13.0,12.000000,9.000000,2.0,7.000000,0.0,1.000000,8.000000,0.500,4.0,2.0,0.429000,3.000000,7.000000,1.000,4.0,4.0,2.000000,4.0,4.0,0.0,1.121376e+06,39.0,8.000000,31.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,48751.0,2018.0,2.0,13.0,112.000000,123.000000,13.0,15.000000,3.000000,0.0,6.000000,1.0,1.000000,2.000000,0.250,4.0,1.0,0.267000,4.000000,15.000000,0.600,6.0,10.0,3.000000,1.0,4.0,0.0,1.121381e+06,35.0,21.000000,14.000000
482,47179.0,2018.0,2.0,4.0,126.000000,94.000000,4.0,10.000000,2.000000,1.0,0.000000,0.0,0.000000,2.000000,1.000,2.0,2.0,1.000000,4.000000,4.000000,0.000,0.0,0.0,0.000000,0.0,4.0,0.0,1.076655e+06,19.0,0.000000,19.000000
488,48227.0,2018.0,2.0,10.0,116.666667,113.333333,2.0,0.666667,0.666667,0.0,0.333333,0.0,0.333333,0.333333,0.000,0.0,0.0,0.333333,0.333333,0.333333,0.000,0.0,0.0,0.333333,0.0,4.0,0.0,1.106491e+06,2.0,0.333333,1.666667
493,48751.0,2018.0,2.0,13.0,112.000000,123.000000,13.0,12.000000,9.000000,0.0,0.000000,3.0,1.000000,8.000000,,0.0,0.0,0.500000,5.000000,10.000000,0.667,2.0,3.0,2.000000,3.0,4.0,0.0,1.121375e+06,28.0,8.000000,20.000000


In [43]:
impact = df.pts.mean().groupby(by="player_id").mean

Unnamed: 0,game_id,game_date,season,new_home_team_id,new_vis_team_id,score_home,score_vis,team_id,abbreviation,player_id,position,first_name,last_name,pts,reb,stl,ast,blk,oreb,dreb,fg3_pct,fg3a,fg3m,fg_pct,fgm,fga,ft_pct,ftm,fta,turnover,pf,min,home_team_id,visitor_team_id,period,status,id,game,player,team,fantasy_bonus,fantasy_malus,fantasy_score
0,47179,2019-01-30,2018,2,4,126,94,4,CHA,33,F-G,Nicolas,Batum,13.0,2.0,2.0,4.0,0.0,2.0,0.0,0.25,4.0,1.0,0.545,6.0,11.0,0.0,0.0,0.0,3.0,3.0,26:38,4,False,Final,,1076654.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 33, 'first_name': 'Nicolas', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",28.0,11.0,17.0
1,47179,2019-01-30,2018,2,4,126,94,4,CHA,482,F,Marvin,Williams,10.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,27:23,4,False,Final,,1076655.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 482, 'first_name': 'Marvin', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",19.0,0.0,19.0
2,47179,2019-01-30,2018,2,4,126,94,4,CHA,48,C,Bismack,Biyombo,2.0,5.0,0.0,2.0,1.0,1.0,4.0,0.0,0.0,0.0,0.333,1.0,3.0,0.0,0.0,0.0,1.0,1.0,13:34,4,False,Final,,1076656.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 48, 'first_name': 'Bismack', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",10.0,3.0,7.0
3,47179,2019-01-30,2018,2,4,126,94,4,CHA,267,G,Jeremy,Lamb,6.0,5.0,0.0,1.0,0.0,0.0,5.0,0.4,5.0,2.0,0.286,2.0,7.0,0.0,0.0,0.0,2.0,2.0,23:10,4,False,Final,,1076657.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 267, 'first_name': 'Jeremy', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",16.0,10.0,6.0
4,47179,2019-01-30,2018,2,4,126,94,4,CHA,465,G,Kemba,Walker,21.0,1.0,1.0,2.0,1.0,0.0,1.0,0.286,7.0,2.0,0.45,9.0,20.0,1.0,1.0,1.0,1.0,0.0,26:31,4,False,Final,,1076658.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 465, 'first_name': 'Kemba', 'height_fee...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",37.0,17.0,20.0
