In [2]:
import pandas as pd

In [3]:

# Load the raw season rows. A (Player_id, Season) pair may appear on several
# rows (presumably one per competition -- confirm against the data source).
df = pd.read_csv('season_data_all.csv')

# Collapse to one row per (Player_id, Season):
#   * counting stats are summed,
#   * PPG and Minutes_per_goal are averaged (unweighted) across the rows,
#   * Club_id keeps the first non-null value seen in the group.
# The built-in 'mean' aggregation is used instead of a lambda calling
# `x.mean().round(n)`: for an all-NaN group `Series.mean()` returns a plain
# Python float NaN, which has no `.round` method and would raise
# AttributeError. Rounding is applied afterwards, column-wise, and leaves
# NaN untouched.
grouped_df = (
    df.groupby(['Player_id', 'Season'])
    .agg({
        'Games': 'sum',
        'PPG': 'mean',
        'Goals': 'sum',
        'Assists': 'sum',
        'Yellow_cards': 'sum',
        'Yellow_red_cards': 'sum',
        'Red_cards': 'sum',
        'Penalty_goals': 'sum',
        'Minutes_per_goal': 'mean',
        'Minutes_played': 'sum',
        'Club_id': 'first',
    })
    # Only the two averaged columns are rounded; all others pass through as-is.
    .round({'PPG': 2, 'Minutes_per_goal': 0})
    .reset_index()
)


In [29]:
# Helper used to fill the 'Competition' column: picks the 'wettbewerb'
# competition in which the group accumulated the most games.
def get_most_played_wettbewerb(group):
    """Return the 'wettbewerb' competition with the highest total of games.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to inspect; the columns 'League_type', 'Competition' and
        'Games' are read.

    Returns
    -------
    The 'Competition' value whose summed 'Games' is largest among rows with
    ``League_type == 'wettbewerb'``, or the empty string ``''`` when the
    group contains no such rows.
    """
    league_rows = group[group['League_type'] == 'wettbewerb']
    games_per_competition = league_rows.groupby('Competition')['Games'].sum()

    if not games_per_competition.empty:
        # idxmax yields the index label (the competition) of the largest sum.
        return games_per_competition.idxmax()
    return ''

# Columns that identify one player-season; used for both grouping and joining.
join_keys = ['Player_id', 'Season']

# One row per player-season holding the most-played 'wettbewerb' competition
# ('' when the player-season has no 'wettbewerb' rows).
competition_df = (
    df.groupby(join_keys)
    .apply(get_most_played_wettbewerb)
    .rename('Competition')
    .reset_index()
)

# Attach the chosen competition to the aggregated statistics.
result_df = grouped_df.merge(competition_df, on=join_keys, how='inner')

# Discard player-seasons with zero minutes played or without a competition.
drop_row = (result_df['Minutes_played'] == 0) | (result_df['Competition'] == '')
result_df = result_df[~drop_row]

# Persist the cleaned, grouped table (the index is not written).
result_df.to_csv('cleaned_player_season_data_grouped.csv', index=False)