In [64]:
import pandas as pd
import json

In [65]:
# Loading data :
df = pd.read_csv('data/csv/int_matchs_results_1872_2021.csv',index_col = 0)
country_codes = pd.read_csv('data/csv/countries_codes.csv',dtype="str")

# Loading dictionaries where replacement and additional values are stored.
# Each file is opened in a `with` block so its handle is closed once parsed :
with open("data/json/dict_replace_country_code.json",'rb') as json_file:
    dictio_replace = json.load(json_file)
with open("data/json/dict_replace_foot_data.json",'rb') as json_file:
    dict_replace_foot_data = json.load(json_file)
with open("data/json/uk_countries.json","rb") as json_file:
    uk_countries = json.load(json_file)

# Make index to datetime :
df.index = pd.to_datetime(df.index)

# Replace country names so both data sets use the same spelling :
country_codes['name'] = country_codes['name'].replace(dictio_replace)
df['home_team'] = df['home_team'].replace(dict_replace_foot_data)
df['away_team'] = df['away_team'].replace(dict_replace_foot_data)

# Remove rows containing nans
df = df.dropna()

# Remove matches where both sides ended up being the same country
# (e.g. Germany vs Germany) because of the replacement above.
# A boolean mask is used instead of `df.drop(<index labels>)`: the index holds
# match dates, which may repeat (several matches on one day), and dropping by
# label would also discard every other match sharing the date of a removed row.
df = df[df['away_team'] != df['home_team']]

# Keep only country code and name :
country_codes = country_codes[['name','country-code']]

# Add UK sub-countries to country_codes in a single concat
# (uk_countries maps a country name to its code) :
uk_rows = pd.DataFrame(
    list(uk_countries.items()),
    columns = ['name','country-code']
)
country_codes = pd.concat([country_codes, uk_rows], ignore_index=True)
country_codes.to_csv('data/csv/countries_codes_with_uk_subcountries.csv')

# Keep only matches where both teams are in the country codes list :
known_names = country_codes['name'].unique()
df = df[df['home_team'].isin(known_names) & df['away_team'].isin(known_names)]

In [66]:
# Calculate number of scored goals per team and per year.

# Calendar year of every match, used as a grouping key. Its name is taken from
# df's index, which is expected to be 'date' (the column grouped on further down).
year_index = df.index.to_period("Y")

# The score column is selected *before* summing: aggregating the whole frame
# would also try to sum every other column of df only to throw the result away.
yearly_home_scored_goals = (
    df.groupby([year_index,'home_team'])['home_score']
    .sum()
    .reset_index()
    .rename(
        columns = {
            'home_team':'team',
            'home_score':'scored_goals'
        }
    )
)
yearly_away_scored_goals = (
    df.groupby([year_index,'away_team'])['away_score']
    .sum()
    .reset_index()
    .rename(
        columns = {
            'away_team':'team',
            'away_score':'scored_goals'
        }
    )
)

# Stack home and away tallies, then add them up per (year, team).
yearly_total_scored_goals = pd.concat(
    (yearly_home_scored_goals, yearly_away_scored_goals),
    ignore_index = True
)
yearly_total_scored_goals = (
    yearly_total_scored_goals
    .groupby(['date','team'])
    .sum()
    .reset_index()
)

yearly_total_scored_goals.to_csv('data/csv/yearly_scored_goals_by_team.csv',index=False)

In [67]:
# Calculate number of conceded goals per team and per year.
# A team concedes what its opponent scores: the home side concedes
# 'away_score' and the away side concedes 'home_score'.

# The score column is selected *before* summing: aggregating the whole frame
# would also try to sum every other column of df only to throw the result away.
yearly_home_conceded_goals = (
    df.groupby([year_index,'home_team'])['away_score']
    .sum()
    .reset_index()
    .rename(
        columns = {
            'home_team':'team',
            'away_score':'conceded_goals'
        }
    )
)
yearly_away_conceded_goals = (
    df.groupby([year_index,'away_team'])['home_score']
    .sum()
    .reset_index()
    .rename(
        columns = {
            'away_team':'team',
            'home_score':'conceded_goals'
        }
    )
)

# Stack home and away tallies, then add them up per (year, team).
yearly_total_conceded_goals = pd.concat(
    (yearly_home_conceded_goals, yearly_away_conceded_goals),
    ignore_index = True
)
yearly_total_conceded_goals = (
    yearly_total_conceded_goals
    .groupby(['date','team'])
    .sum()
    .reset_index()
)

yearly_total_conceded_goals.to_csv('data/csv/yearly_conceded_goals_by_team.csv',index=False)

In [68]:
# Count wins, losses and draws per team and per year.

# Results seen from the home side: one boolean flag per outcome, summed per
# (year, team) so that each True counts as one match.
home_wins_df = (
    df[['home_team']]
    .rename(columns = {'home_team':'team'})
    .assign(
        wins = df['home_score'] > df['away_score'],
        lost = df['home_score'] < df['away_score'],
        draws = df['home_score'] == df['away_score'],
    )
    .groupby([year_index,'team'])
    .sum()
    .reset_index()
)

# Same tally seen from the away side: win and loss comparisons are mirrored.
away_wins_df = (
    df[['away_team']]
    .rename(columns = {'away_team':'team'})
    .assign(
        wins = df['home_score'] < df['away_score'],
        lost = df['home_score'] > df['away_score'],
        draws = df['home_score'] == df['away_score'],
    )
    .groupby([year_index,'team'])
    .sum()
    .reset_index()
)

# Stack both views and add them up to get the yearly record of every team.
stacked_results = pd.concat((home_wins_df,away_wins_df),ignore_index=True)
yearly_total_wins = stacked_results.groupby(['date','team']).sum().reset_index()
yearly_total_wins.to_csv('data/csv/yearly_wins_by_team.csv',index=False)