In [1]:
import pandas as pd
import json


### Arranging the Players DataFrame

In [2]:
df_player = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/data_all_players.csv')

### Extracting players info from dictionnary into separate columns

In [3]:
data = df_player
df_player = pd.DataFrame(data)

# Apply lambda function to create a new 'id_column'
df_player['player_team_id'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))
df_player['player_team_abb'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('abbreviation'))
df_player['player_team_city'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('city'))
df_player['player_team_conf'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('conference'))
df_player['player_team_div'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('division'))
df_player['player_team_full_name'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('full_name'))
df_player['player_team_name'] = df_player['team'].apply(lambda x: json.loads(x.replace("'", "\"")).get('name'))

In [4]:
df_player.drop(columns=["team", "weight_pounds", "height_feet", "height_inches"], inplace=True)

### Arranging the Game DataFrame

In [5]:
df_games = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/games_data.csv')

In [6]:
### Extracting game info from dictionnary into separate columns

In [7]:
data_games = df_games
df_games = pd.DataFrame(data_games)

# Apply lambda function to create a new 'id_column'
df_games['new_home_team_id'] = df_games['id'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))
df_games['new_vis_team_id'] = df_games['time'].apply(lambda x: json.loads(x.replace("'", "\"")).get('id'))


In [8]:
df_games.drop(columns=["id", "time"], inplace=True)

In [9]:
df_games.rename(columns={"date": "score_home",
                        "postseason": "score_vis"}, inplace=True)

In [10]:
# Managing a MultiIndex in the DataFrame
df_games = df_games.reset_index()
df_games.rename(columns={"level_0":"game_id",
                        "level_1": "game_date"}, inplace=True)
df_games["game_date"] = pd.to_datetime(df_games["game_date"])

### Arranging the Stats DataFrame

In [11]:
import ast

In [12]:
df_stats = pd.read_csv('/Users/pierrecanadas/code/pirroux/personal_projects/nbadata/01-Project-Setup/data/all_stats.csv')

In [13]:
# Transform columns with lists (dictionnaries looking like lists) into dictionaries
df_stats['game'] = df_stats['game'].apply(ast.literal_eval)
df_stats['team'] = df_stats['team'].apply(ast.literal_eval)
df_stats['player'] = df_stats['player'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else None)

In [14]:
# Extract game_id, player_id and team_id from the columns
df_stats['game_id']=df_stats['game'].apply(lambda x : x['id'] if 'id' in x else None)
df_stats['player_id'] = df_stats['player'].apply(lambda x: int(x.get('id')) if pd.notna(x) and 'id' in x and pd.notna(x.get('id')) else None).astype('Int64')
df_stats['team_id'] = df_stats['team'].apply(lambda x: int(x.get('id')) if pd.notna(x) and 'id' in x and pd.notna(x.get('id')) else None).astype('Int64')

### Extracting info from dictionnary into separate columns

In [15]:
df_stats['first_name'] = df_stats['player'].apply(lambda x: x.get('first_name') if pd.notna(x) and 'first_name' in x else None)
df_stats['last_name'] = df_stats['player'].apply(lambda x: x.get('last_name') if pd.notna(x) and 'last_name' in x else None)
df_stats['position'] = df_stats['player'].apply(lambda x: x.get('position') if pd.notna(x) and 'position' in x else None)
df_stats['abbreviation'] = df_stats['team'].apply(lambda x: x.get('abbreviation') if pd.notna(x) and 'abbreviation' in x else None)

### Remove ambiguous team id in player's data dictionnnary

In [16]:
def remove_key_from_dict(d, key):
    if isinstance(d, dict):
        d.pop(key, None)
    return d

key_to_remove = 'team_id'
df_stats['player'] = df_stats['player'].apply(lambda x: remove_key_from_dict(x, key_to_remove))

### Merging DataFrames together

In [17]:
# Merging the games and stats dataframes

In [205]:
merged_df = df_games.merge(df_stats, how='inner', on='game_id')

In [206]:
#merged_df.duplicated().sum()

In [207]:
df_player.rename(columns={"id": "player_id"}, inplace=True)
df_player.player_id = df_player.player_id.astype('Int64')

In [208]:
merged_df.rename(columns={"new_home_team_id":"home_team_id",
                  "new_vis_team_id": "vis_team_id",
                  })

Unnamed: 0,game_id,game_date,score_home,home_team_id,visitor_team_id,season,period,status,score_vis,home_team_id.1,vis_team_id,id,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,game,min,oreb,pf,player,pts,reb,stl,team,turnover,player_id,team_id,first_name,last_name,position,abbreviation
0,47179,2019-01-30,126,4,False,2018,Final,,94,2,4,1076654,4.0,0.0,0.0,0.250,4.0,1.0,0.545000,11.0,6.0,0.000000,0.0,0.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...",26:38,2.0,3.0,"{'id': 33, 'first_name': 'Nicolas', 'height_fe...",13.0,2.0,2.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",3.0,33,4,Nicolas,Batum,F-G,CHA
1,47179,2019-01-30,126,4,False,2018,Final,,94,2,4,1076655,0.0,0.0,2.0,1.000,2.0,2.0,1.000000,4.0,4.0,0.000000,0.0,0.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...",27:23,0.0,0.0,"{'id': 482, 'first_name': 'Marvin', 'height_fe...",10.0,2.0,1.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",0.0,482,4,Marvin,Williams,F,CHA
2,47179,2019-01-30,126,4,False,2018,Final,,94,2,4,1076656,2.0,1.0,4.0,0.000,0.0,0.0,0.333000,3.0,1.0,0.000000,0.0,0.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...",13:34,1.0,1.0,"{'id': 48, 'first_name': 'Bismack', 'height_fe...",2.0,5.0,0.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",1.0,48,4,Bismack,Biyombo,C,CHA
3,47179,2019-01-30,126,4,False,2018,Final,,94,2,4,1076657,1.0,0.0,5.0,0.400,5.0,2.0,0.286000,7.0,2.0,0.000000,0.0,0.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...",23:10,0.0,2.0,"{'id': 267, 'first_name': 'Jeremy', 'height_fe...",6.0,5.0,0.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",2.0,267,4,Jeremy,Lamb,G,CHA
4,47179,2019-01-30,126,4,False,2018,Final,,94,2,4,1076658,2.0,1.0,1.0,0.286,7.0,2.0,0.450000,20.0,9.0,1.000000,1.0,1.0,"{'id': 47179, 'date': '2019-01-30', 'home_team...",26:31,0.0,0.0,"{'id': 465, 'first_name': 'Kemba', 'height_fee...",21.0,1.0,1.0,"{'id': 4, 'abbreviation': 'CHA', 'city': 'Char...",1.0,465,4,Kemba,Walker,G,CHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234533,1038294,2024-02-03,117,4,False,2023,Final,Final,129,7,17,14391931,0.0,0.0,0.0,0.000,0.0,0.0,1.000000,1.0,1.0,0.000000,0.0,0.0,"{'id': 1038294, 'date': '2024-02-03', 'home_te...",04,0.0,0.0,"{'id': 284, 'first_name': 'Robin', 'height_fee...",2.0,0.0,0.0,"{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil...",0.0,284,17,Robin,Lopez,C,MIL
234534,1038294,2024-02-03,117,4,False,2023,Final,Final,129,7,17,14391932,0.0,0.0,7.0,0.375,8.0,3.0,0.333333,9.0,3.0,0.000000,2.0,0.0,"{'id': 1038294, 'date': '2024-02-03', 'home_te...",33,0.0,1.0,"{'id': 38, 'first_name': 'Malik', 'height_feet...",9.0,7.0,2.0,"{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil...",1.0,38,17,Malik,Beasley,G,MIL
234535,1038294,2024-02-03,117,4,False,2023,Final,Final,129,7,17,14391933,8.0,1.0,3.0,1.000,5.0,5.0,0.909091,11.0,10.0,0.833333,6.0,5.0,"{'id': 1038294, 'date': '2024-02-03', 'home_te...",38,0.0,3.0,"{'id': 278, 'first_name': 'Damian', 'height_fe...",30.0,3.0,0.0,"{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil...",6.0,278,17,Damian,Lillard,G,MIL
234536,1038294,2024-02-03,117,4,False,2023,Final,Final,129,7,17,14391937,2.0,0.0,2.0,0.000,1.0,0.0,0.666667,6.0,4.0,0.000000,0.0,0.0,"{'id': 1038294, 'date': '2024-02-03', 'home_te...",21,0.0,3.0,"{'id': 112, 'first_name': 'Jae', 'height_feet'...",8.0,2.0,2.0,"{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil...",0.0,112,17,Jae,Crowder,F,MIL


### Re-arranging columns in the Dataframe

In [209]:
best_column_order = ['game_id', 'game_date', "season", 'new_home_team_id', 'new_vis_team_id',
                    'score_home','score_vis', 'team_id', 'abbreviation',
                     'player_id', 'position', 'first_name', 'last_name',
                      'pts', 'reb', 'stl',
         'ast', 'blk','oreb', 'dreb', 'fg3_pct', 'fg3a',
       'fg3m', 'fg_pct', 'fgm','fga',  'ft_pct', 'ftm', 'fta',
                     'turnover', 'pf', 'min','home_team_id', 'visitor_team_id',
                     'period', 'status', 'id', 'game', 'player', 'team']

merged_df = merged_df[best_column_order]

In [210]:
merged_df.drop(columns=["status", "period"])

Unnamed: 0,game_id,game_date,season,new_home_team_id,new_vis_team_id,score_home,score_vis,team_id,abbreviation,player_id,position,first_name,last_name,pts,reb,stl,ast,blk,oreb,dreb,fg3_pct,fg3a,fg3m,fg_pct,fgm,fga,ft_pct,ftm,fta,turnover,pf,min,home_team_id,visitor_team_id,id,game,player,team
0,47179,2019-01-30,2018,2,4,126,94,4,CHA,33,F-G,Nicolas,Batum,13.0,2.0,2.0,4.0,0.0,2.0,0.0,0.250,4.0,1.0,0.545000,6.0,11.0,0.000000,0.0,0.0,3.0,3.0,26:38,4,False,1076654,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 33, 'first_name': 'Nicolas', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
1,47179,2019-01-30,2018,2,4,126,94,4,CHA,482,F,Marvin,Williams,10.0,2.0,1.0,0.0,0.0,0.0,2.0,1.000,2.0,2.0,1.000000,4.0,4.0,0.000000,0.0,0.0,0.0,0.0,27:23,4,False,1076655,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 482, 'first_name': 'Marvin', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
2,47179,2019-01-30,2018,2,4,126,94,4,CHA,48,C,Bismack,Biyombo,2.0,5.0,0.0,2.0,1.0,1.0,4.0,0.000,0.0,0.0,0.333000,1.0,3.0,0.000000,0.0,0.0,1.0,1.0,13:34,4,False,1076656,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 48, 'first_name': 'Bismack', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
3,47179,2019-01-30,2018,2,4,126,94,4,CHA,267,G,Jeremy,Lamb,6.0,5.0,0.0,1.0,0.0,0.0,5.0,0.400,5.0,2.0,0.286000,2.0,7.0,0.000000,0.0,0.0,2.0,2.0,23:10,4,False,1076657,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 267, 'first_name': 'Jeremy', 'height_fe...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
4,47179,2019-01-30,2018,2,4,126,94,4,CHA,465,G,Kemba,Walker,21.0,1.0,1.0,2.0,1.0,0.0,1.0,0.286,7.0,2.0,0.450000,9.0,20.0,1.000000,1.0,1.0,1.0,0.0,26:31,4,False,1076658,"{'id': 47179, 'date': '2019-01-30', 'home_team...","{'id': 465, 'first_name': 'Kemba', 'height_fee...","{'id': 4, 'abbreviation': 'CHA', 'city': 'Char..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234533,1038294,2024-02-03,2023,7,17,117,129,17,MIL,284,C,Robin,Lopez,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,1.000000,1.0,1.0,0.000000,0.0,0.0,0.0,0.0,04,4,False,14391931,"{'id': 1038294, 'date': '2024-02-03', 'home_te...","{'id': 284, 'first_name': 'Robin', 'height_fee...","{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil..."
234534,1038294,2024-02-03,2023,7,17,117,129,17,MIL,38,G,Malik,Beasley,9.0,7.0,2.0,0.0,0.0,0.0,7.0,0.375,8.0,3.0,0.333333,3.0,9.0,0.000000,0.0,2.0,1.0,1.0,33,4,False,14391932,"{'id': 1038294, 'date': '2024-02-03', 'home_te...","{'id': 38, 'first_name': 'Malik', 'height_feet...","{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil..."
234535,1038294,2024-02-03,2023,7,17,117,129,17,MIL,278,G,Damian,Lillard,30.0,3.0,0.0,8.0,1.0,0.0,3.0,1.000,5.0,5.0,0.909091,10.0,11.0,0.833333,5.0,6.0,6.0,3.0,38,4,False,14391933,"{'id': 1038294, 'date': '2024-02-03', 'home_te...","{'id': 278, 'first_name': 'Damian', 'height_fe...","{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil..."
234536,1038294,2024-02-03,2023,7,17,117,129,17,MIL,112,F,Jae,Crowder,8.0,2.0,2.0,2.0,0.0,0.0,2.0,0.000,1.0,0.0,0.666667,4.0,6.0,0.000000,0.0,0.0,0.0,3.0,21,4,False,14391937,"{'id': 1038294, 'date': '2024-02-03', 'home_te...","{'id': 112, 'first_name': 'Jae', 'height_feet'...","{'id': 17, 'abbreviation': 'MIL', 'city': 'Mil..."


### Creating Fantasy score

In [211]:
merged_df["fantasy_bonus"] = merged_df.apply(lambda row: row.loc['pts'] + row.loc['reb'] +row.loc['ast']+row.loc['stl']+ row.loc['fgm'] + row.loc['fg3m'] + row.loc['ftm'], axis=1)
merged_df["fantasy_malus"] = merged_df.apply(lambda row: row.loc['turnover'] + (row.loc["fga"] - row.loc["fgm"]) + (row.loc["fg3a"] - row.loc["fg3m"]) + (row.loc["fta"] - row.loc["ftm"]), axis=1)
merged_df["fantasy_score"] = merged_df["fantasy_bonus"] - merged_df["fantasy_malus"]

### Creating an impact table - In progress

In [212]:
pd.set_option('display.max_columns', None)

In [239]:
def create_impact_df(merged_df, given_date):

    # Filter for the last 30 days
    date_threshold = pd.to_datetime(given_date) - pd.Timedelta(days=15)
    last_30_days_df = merged_df[(merged_df['game_date'] >= date_threshold) & (merged_df['game_date'] <= given_date)]

    # Remove duplicates based on 'player_id' and 'game_date'
    last_30_days_df = last_30_days_df.drop_duplicates(subset=['player_id', 'game_id'], keep='first')

    # Group by player_id and aggregate the statistics
    impact_table = last_30_days_df.groupby('last_name').agg({
        'pts': 'sum',
        'ast': 'sum',
        'reb': 'sum',
        'fantasy_score': 'sum',
        # add other stats as needed
    })

    # Add player information from players_df if needed
    #impact_table = pd.merge(impact_table, df_player[['player_id', 'last_name', 'team_id']], on='player_id', how='left')

    #sort by descending fantasy_score
    impact_table = impact_table.sort_values('pts', ascending=False)
    #round pts ast reb and fantasy_score to 1 decimal places
    impact_table = impact_table.round({'pts': 1, 'ast': 1, 'reb': 1, 'fantasy_score': 1})
    return impact_table

In [240]:
# Example usage:
given_date = '2024-01-15'
last_name='Gilgeous-Alexander'
impact_table = create_impact_df(merged_df, given_date)

In [242]:
impact_table.head(20)

Unnamed: 0_level_0,pts,ast,reb,fantasy_score
last_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Murray,431.0,84.0,99.0,613.0
Williams,281.0,85.0,104.0,446.0
Green,271.0,50.0,79.0,377.0
White,251.0,64.0,52.0,351.0
Jones,238.0,82.0,70.0,381.0
Bridges,232.0,34.0,71.0,314.0
Johnson,224.0,40.0,90.0,293.0
George,219.0,41.0,40.0,316.0
Gilgeous-Alexander,218.0,44.0,44.0,352.0
Brown,211.0,37.0,58.0,318.0


# Prochaines étapes :

In [245]:
#duplicates have been removed
#matches manquants dans données initiales - est-ce dans un merge ou dans les données initiales?
#duplicates pour noms de famille - faire concat nom et prenom pour éviter les doublons
#faire différents fantasy scores pour pour 30, 15 et 7 jours


In [1]:
!git pull origin main

From github.com:pirroux/Personal_project
 * branch            main       -> FETCH_HEAD
Already up to date.


In [6]:
!git status

On branch main
Your branch is ahead of 'origin/main' by 4 commits.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
  (commit or discard the untracked or modified content in submodules)
	[31mmodified:   ../../../../02-Data-Toolkit/01-Data-Analysis/data-optional-nba_shot_log[m (new commits)
	[31mmodified:   ../../../../06-Deep-Learning/03-Convolutional-Neural-Networks/.DS_Store[m
	[31mmodified:   ../../../../06-Deep-Learning/03-Convolutional-Neural-Networks/data-intuition-on-convolutions[m (modified content)
	[31mmodified:   ../../../../06-Deep-Learning/03-Convolutional-Neural-Networks/data-mnist-classification[m (modified content)
	[31mmodified:   ../../../../06-Deep-Learning/03-Convolutional-Neural-Networks/data-mnist-classification_redo[m (untracked content)
	[31mmodified:   ../../../../06-Deep-Learning/03-C