In [29]:
import pandas as pd
import json

# Reset display options to default
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')

# Optionally, you can set the maximum columns to display
pd.set_option('display.max_columns', None)

### Arranging the Players DataFrame

In [2]:
df_player = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/data_all_players.csv')

### Extracting players info from dictionnary into separate columns

In [3]:
data = df_player
df_player = pd.DataFrame(data)

# Apply lambda function to create a new 'id_column'
df_player['player_team_id'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))
df_player['player_team_abb'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('abbreviation'))
df_player['player_team_city'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('city'))
df_player['player_team_conf'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('conference'))
df_player['player_team_div'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('division'))
df_player['player_team_full_name'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('full_name'))
df_player['player_team_name'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('name'))

In [4]:
df_player.drop(columns=["team", "weight_pounds", "height_feet", "height_inches"], inplace=True)

### Arranging the Game DataFrame

In [5]:
df_games = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/games_data.csv')

In [6]:
### Extracting game info from dictionnary into separate columns

In [7]:
data_games = df_games
df_games = pd.DataFrame(data_games)

# Apply lambda function to create a new 'id_column'
df_games['new_home_team_id'] = df_games['id'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))
df_games['new_vis_team_id'] = df_games['time'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))


In [8]:
df_games.drop(columns=["id", "time"], inplace=True)

In [9]:
df_games.rename(columns={"date": "score_home",
                        "postseason": "score_vis"}, inplace=True)

In [10]:
# Managing a MultiIndex in the DataFrame
df_games = df_games.reset_index()
df_games.rename(columns={"level_0":"game_id",
                        "level_1": "game_date"}, inplace=True)
df_games["game_date"] = pd.to_datetime(df_games["game_date"])

### Arranging the Stats DataFrame

In [11]:
import ast

In [12]:
df_stats = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/all_stats.csv')

In [13]:
# Transform columns with lists (dictionnaries looking like lists) into dictionaries
df_stats['game'] = df_stats['game'].apply(ast.literal_eval)
df_stats['team'] = df_stats['team'].apply(ast.literal_eval)
df_stats['player'] = df_stats['player'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else None)

In [14]:
# Extract game_id, player_id and team_id from the columns
df_stats['game_id']=df_stats['game'].apply(lambda x : x['id'] if 'id' in x else None)
df_stats['player_id'] = df_stats['player'].apply(lambda x: int(x.get('id')) if pd.notna(x) and 'id' in x and pd.notna(x.get('id')) else None).astype('Int64')
df_stats['team_id'] = df_stats['team'].apply(lambda x: int(x.get('id')) if pd.notna(x) and 'id' in x and pd.notna(x.get('id')) else None).astype('Int64')

### Extracting info from dictionnary into separate columns

In [15]:
df_stats['first_name'] = df_stats['player'].apply(lambda x: x.get('first_name') if pd.notna(x) and 'first_name' in x else None)
df_stats['last_name'] = df_stats['player'].apply(lambda x: x.get('last_name') if pd.notna(x) and 'last_name' in x else None)
df_stats['position'] = df_stats['player'].apply(lambda x: x.get('position') if pd.notna(x) and 'position' in x else None)
df_stats['abbreviation'] = df_stats['team'].apply(lambda x: x.get('abbreviation') if pd.notna(x) and 'abbreviation' in x else None)

### Remove ambiguous team id in player's data dictionnnary

In [16]:
def remove_key_from_dict(d, key):
    if isinstance(d, dict):
        d.pop(key, None)
    return d

key_to_remove = 'team_id'
df_stats['player'] = df_stats['player'].apply(lambda x: remove_key_from_dict(x, key_to_remove))

### Merging DataFrames together

In [17]:
# Merging the games and stats dataframes

In [18]:
merged_df = df_games.merge(df_stats, how='left', on='game_id')

In [35]:
len(merged_df)

Unnamed: 0,game_id,game_date,season,new_home_team_id,new_vis_team_id,score_home,score_vis,team_id,abbreviation,player_id,position,first_name,last_name,pts,reb,stl,ast,blk,oreb,dreb,fg3_pct,fg3a,fg3m,fg_pct,fgm,fga,ft_pct,ftm,fta,turnover,pf,min,home_team_id,visitor_team_id,period,status,id,game,player,team,fantasy_bonus,fantasy_malus,fantasy_score
0,47179,2019-01-30,2018,2,4,126,94,4,CHA,33,F-G,Nicolas,Batum,13.0,2.0,2.0,4.0,0.0,2.0,0.0,0.25,4.0,1.0,0.545,6.0,11.0,0.0,0.0,0.0,3.0,3.0,26:38,4,False,Final,,1076654.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 33, 'first_name': 'Nicolas', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",28.0,11.0,17.0
1,47179,2019-01-30,2018,2,4,126,94,4,CHA,482,F,Marvin,Williams,10.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,27:23,4,False,Final,,1076655.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 482, 'first_name': 'Marvin', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",19.0,0.0,19.0
2,47179,2019-01-30,2018,2,4,126,94,4,CHA,48,C,Bismack,Biyombo,2.0,5.0,0.0,2.0,1.0,1.0,4.0,0.0,0.0,0.0,0.333,1.0,3.0,0.0,0.0,0.0,1.0,1.0,13:34,4,False,Final,,1076656.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 48, 'first_name': 'Bismack', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",10.0,3.0,7.0
3,47179,2019-01-30,2018,2,4,126,94,4,CHA,267,G,Jeremy,Lamb,6.0,5.0,0.0,1.0,0.0,0.0,5.0,0.4,5.0,2.0,0.286,2.0,7.0,0.0,0.0,0.0,2.0,2.0,23:10,4,False,Final,,1076657.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 267, 'first_name': 'Jeremy', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",16.0,10.0,6.0
4,47179,2019-01-30,2018,2,4,126,94,4,CHA,465,G,Kemba,Walker,21.0,1.0,1.0,2.0,1.0,0.0,1.0,0.286,7.0,2.0,0.45,9.0,20.0,1.0,1.0,1.0,1.0,0.0,26:31,4,False,Final,,1076658.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 465, 'first_name': 'Kemba', 'height_fee...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",37.0,17.0,20.0


In [19]:
df_player.rename(columns={"id": "player_id"}, inplace=True)
df_player.player_id = df_player.player_id.astype('Int64')

In [20]:
merged_df.rename(columns={"new_home_team_id":"home_team_id",
                  "new_vis_team_id": "vis_team_id",
                  })

Unnamed: 0,game_id,game_date,score_home,home_team_id,visitor_team_id,season,period,status,score_vis,home_team_id.1,...,reb,stl,team,turnover,player_id,team_id,first_name,last_name,position,abbreviation
0,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,2.0,2.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",3.0,33,4,Nicolas,Batum,F-G,CHA
1,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,2.0,1.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",0.0,482,4,Marvin,Williams,F,CHA
2,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,5.0,0.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",1.0,48,4,Bismack,Biyombo,C,CHA
3,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,5.0,0.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",2.0,267,4,Jeremy,Lamb,G,CHA
4,47179,2019-01-30,126,4,False,2018,Final,,94,2,...,1.0,1.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",1.0,465,4,Kemba,Walker,G,CHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235567,1038345,2024-02-10,113,4,False,2023,Final,Final,119,30,...,,,,,,,,,,
235568,1038346,2024-02-10,122,4,False,2023,Final,Final,113,1,...,,,,,,,,,,
235569,1038348,2024-02-10,95,4,False,2023,Final,Final,119,28,...,,,,,,,,,,
235570,1038349,2024-02-10,113,4,False,2023,Final,Final,112,10,...,,,,,,,,,,


### Re-arranging columns in the Dataframe

In [21]:
best_column_order = ['game_id', 'game_date', "season", 'new_home_team_id', 'new_vis_team_id',
                    'score_home','score_vis', 'team_id', 'abbreviation',
                     'player_id', 'position', 'first_name', 'last_name',
                      'pts', 'reb', 'stl',
         'ast', 'blk','oreb', 'dreb', 'fg3_pct', 'fg3a',
       'fg3m', 'fg_pct', 'fgm','fga',  'ft_pct', 'ftm', 'fta',
                     'turnover', 'pf', 'min','home_team_id', 'visitor_team_id',
                     'period', 'status', 'id', 'game', 'player', 'team']

merged_df = merged_df[best_column_order]

In [22]:
merged_df.drop(columns=["status", "period"])

Unnamed: 0,game_id,game_date,season,new_home_team_id,new_vis_team_id,score_home,score_vis,team_id,abbreviation,player_id,...,fta,turnover,pf,min,home_team_id,visitor_team_id,id,game,player,team
0,47179,2019-01-30,2018,2,4,126,94,4,CHA,33,...,0.0,3.0,3.0,26:38,4,False,1076654.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 33, 'first_name': 'Nicolas', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
1,47179,2019-01-30,2018,2,4,126,94,4,CHA,482,...,0.0,0.0,0.0,27:23,4,False,1076655.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 482, 'first_name': 'Marvin', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
2,47179,2019-01-30,2018,2,4,126,94,4,CHA,48,...,0.0,1.0,1.0,13:34,4,False,1076656.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 48, 'first_name': 'Bismack', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
3,47179,2019-01-30,2018,2,4,126,94,4,CHA,267,...,0.0,2.0,2.0,23:10,4,False,1076657.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 267, 'first_name': 'Jeremy', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
4,47179,2019-01-30,2018,2,4,126,94,4,CHA,465,...,1.0,1.0,0.0,26:31,4,False,1076658.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 465, 'first_name': 'Kemba', 'height_fee...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235567,1038345,2024-02-10,2023,30,23,113,119,,,,...,,,,,4,False,,,,
235568,1038346,2024-02-10,2023,1,11,122,113,,,,...,,,,,4,False,,,,
235569,1038348,2024-02-10,2023,28,6,95,119,,,,...,,,,,4,False,,,,
235570,1038349,2024-02-10,2023,10,24,113,112,,,,...,,,,,4,False,,,,


### Creating Fantasy score

In [23]:
merged_df["fantasy_bonus"] = merged_df.apply(lambda row: row.loc['pts'] + row.loc['reb'] +row.loc['ast']+row.loc['stl']+ row.loc['fgm'] + row.loc['fg3m'] + row.loc['ftm'], axis=1)
merged_df["fantasy_malus"] = merged_df.apply(lambda row: row.loc['turnover'] + (row.loc["fga"] - row.loc["fgm"]) + (row.loc["fg3a"] - row.loc["fg3m"]) + (row.loc["fta"] - row.loc["ftm"]), axis=1)
merged_df["fantasy_score"] = merged_df["fantasy_bonus"] - merged_df["fantasy_malus"]

### Creating an impact table - In progress

In [24]:
pd.set_option('display.max_columns', None)

In [26]:
def create_impact_df(merged_df, given_date):

    # Filter for the last 30 days
    date_threshold = pd.to_datetime(given_date) - pd.Timedelta(days=1)
    last_30_days_df = merged_df[(merged_df['game_date'] >= date_threshold) & (merged_df['game_date'] <= given_date)]

    # Group by player_id and aggregate the statistics
    impact_table = last_30_days_df.groupby('last_name').agg({
        'pts': 'sum',
        'ast': 'sum',
        'reb': 'sum',
        'fantasy_score': 'sum',
        # add other stats as needed
    }).reset_index()

    # Add player information from players_df if needed
    #impact_table = pd.merge(impact_table, df_player[['player_id', 'last_name', 'team_id']], on='player_id', how='left')

    #sort by descending fantasy_score
    impact_table = impact_table.sort_values('fantasy_score', ascending=False)
    #round pts ast reb and fantasy_score to 1 decimal places
    impact_table = impact_table.round({'pts': 1, 'ast': 1, 'reb': 1, 'fantasy_score': 1})
    return impact_table

In [27]:
# Example usage:
given_date = '2024-01-31'
impact_table_df = create_impact_df(merged_df, given_date)

In [31]:
impact_table_df.head(50)

Unnamed: 0,last_name,pts,ast,reb,fantasy_score
248,White,87.0,28.0,10.0,137.0
164,Murray,73.0,15.0,12.0,113.0
28,Brown,63.0,8.0,20.0,109.0
259,Young,58.0,25.0,14.0,98.0
63,Dosunmu,63.0,6.0,15.0,96.0
222,Tatum,60.0,14.0,14.0,96.0
56,DeRozan,75.0,15.0,6.0,90.0
99,Harris,52.0,4.0,20.0,88.0
238,Vucevic,42.0,9.0,27.0,81.0
210,Siakam,46.0,4.0,12.0,76.0
