In [1]:
import pandas as pd
from typing import List, Dict

## We are going to obtain our football dataset, delete the irrelevant features, and normalize it.

#### First, we will get our data from FBref. To do that, we generate a CSV for each stats category.

In [2]:
def generate_csv(url: str, csv_name: str, stats_to_drop: List[str]=['Rk', 'Born', 'Matches', '90s', 'Nation', 'Pos', 'Squad', 'Comp', 'Age']) -> None:
    """Scrape the first HTML table at ``url`` and save a cleaned copy as a CSV.

    The table is cleaned in this order: rows that only repeat the column
    header are removed, one row per player name is kept, the columns in
    ``stats_to_drop`` are removed, and 'Player' becomes the index.

    Args:
        url: Page to scrape; the first table returned by ``pd.read_html`` is used.
        csv_name: Path of the CSV file to write.
        stats_to_drop: Columns removed before saving. The default list is only
            read, never modified, so sharing it between calls is harmless.

    Raises:
        KeyError: If 'Rk', 'Player' or a column in ``stats_to_drop`` is missing.
    """
    # header=1 uses the second header row of the HTML table as the column names.
    raw = pd.read_html(url, header=1)[0]

    # Header rows repeated inside the table body carry the literal text 'Rk' in
    # the 'Rk' column. Filtering on that value removes them even when only one
    # such row exists (duplicate detection would miss a single occurrence).
    is_header_row = raw['Rk'].astype(str) == 'Rk'

    cleaned = (
        raw[~is_header_row]
        # Keep only the first row per player name (players who changed teams
        # during the season are listed more than once).
        # NOTE(review): this also collapses distinct players sharing a name.
        .drop_duplicates(subset=['Player'])
        .drop(columns=stats_to_drop)
        .set_index('Player')
    )

    cleaned.to_csv(csv_name)



In [3]:
# URL template shared by the per-category tables; only the category slug changes.
url = 'https://fbref.com/en/comps/Big5/2022-2023/{category}/players/2022-2023-Big-5-European-Leagues-Stats'

# (category slug used in the URL, CSV file to write)
categories = [
    ('shooting', 'shooting.csv'),
    ('passing', 'passing.csv'),
    ('passing_types', 'passtypes.csv'),
    ('gca', 'gca.csv'),
    ('defense', 'def.csv'),
    ('possession', 'pos.csv'),
    ('misc', 'mis.csv'),
]

# The standard-stats table is fetched with a shorter drop list, so it keeps the
# descriptive columns (Nation, Pos, Squad, Comp, Age) that the default list removes.
generate_csv('https://fbref.com/en/comps/Big5/2022-2023/stats/players/2022-2023-Big-5-European-Leagues-Stats', 'std.csv', ['Rk', 'Born','Matches','90s'])

for category, csv_file in categories:
    generate_csv(url.format(category=category), csv_file)


### Then, we merge the CSVs into a single file that contains all the information.

In [4]:
def merge_csvs(csv_left: str, csv_right: str) -> str:
    """Join two player CSVs on 'Player' and write the result to a new CSV.

    Only players present in both files are kept (pandas' default inner join).
    Columns that exist in both files are disambiguated with a suffix built
    from each file name without its last four characters (the '.csv' part).

    Args:
        csv_left: Path of the left-hand CSV.
        csv_right: Path of the right-hand CSV.

    Returns:
        The name of the written file, '<left>_<right>.csv'.
    """
    left_tag, right_tag = csv_left[:-4], csv_right[:-4]

    # Different stat tables reuse column names, hence the per-file suffixes.
    merged = pd.read_csv(csv_left).merge(
        pd.read_csv(csv_right),
        on='Player',
        suffixes=(f'_{left_tag}', f'_{right_tag}'),
    )

    output_name = f'{left_tag}_{right_tag}.csv'
    merged.set_index('Player').to_csv(output_name)

    return output_name


In [5]:
# Files are merged left to right, so this order also fixes the final file name.
csv_names = ['std.csv', 'shooting.csv', 'passing.csv', 'passtypes.csv', 'gca.csv', 'def.csv', 'pos.csv', 'mis.csv']

# Fold the list: each merge result becomes the left-hand side of the next merge.
first_csv, *remaining_csvs = csv_names
merged_file = first_csv
for next_csv in remaining_csvs:
    merged_file = merge_csvs(merged_file, next_csv)


### By looking at the merged csv, we notice that there are many stats that are repeated or irrelevant. We now delete them.

In [6]:
def columns_drop(csv_name: str, stats: List[str]) -> None:
    """Remove the given columns from a CSV and save the result as 'Final.csv'.

    Args:
        csv_name: Path of the CSV to read.
        stats: Names of the columns to remove.

    Raises:
        KeyError: If a name in ``stats`` is not a column of the file.
    """
    trimmed = pd.read_csv(csv_name).drop(columns=stats)

    # 'Player' is written as the index so it stays the first column of the file.
    trimmed.set_index('Player').to_csv('Final.csv')

In [7]:
# Columns considered repeated or irrelevant after inspecting the merged CSV.
# The '_<file>' suffixes are the ones merge_csvs adds to clashing column names;
# the '.N' suffixes are presumably pandas' de-duplication of repeated headers.
stats_to_drop = [
    'Gls.1', 'Ast.1', 'G+A.1', 'G-PK.1', 'G+A-PK', 'xG.1', 'xAG.1', 'xG+xAG', 'npxG.1', 'npxG+xAG.1',
    'Gls_shooting', 'Sh/90', 'SoT/90', 'PK_shooting', 'PKatt_shooting', 'xG_shooting', 'npxG_shooting',
    'Ast_passing', 'xAG_passing', 'PrgP_passing',
    'Att_passtypes', 'Cmp_passtypes',
    'SCA90', 'GCA90',
    'PrgC_pos', 'PrgR_pos',
    'CrdY_mis', 'CrdR_mis', 'Crs_mis', 'Int_mis', 'TklW_mis',
    'Cmp%', 'Cmp%.1', 'Cmp%.2', 'Cmp%.3', 'Tkl%', 'Succ%', 'Tkld%', 'Won%', 'SoT%',
]

# Use the file name returned by the merge loop instead of re-typing it, so the
# two cannot drift apart when the list of merged CSVs changes.
columns_drop(merged_file, stats_to_drop)

### Finally, we normalize our data: each stat is divided by the minutes played and then standardized with a z-score.

In [8]:
def normalize(csv_name: str, output_name: str = 'Final.csv', first_stat_col: int = 9) -> None:
    """Convert the stat columns of a CSV to per-minute z-scores and save them.

    Every column from position ``first_stat_col`` onwards is divided by the
    'Min' column and then standardized column-wise as (x - mean) / std, using
    pandas' default sample standard deviation. Earlier columns are written
    back unchanged.

    Args:
        csv_name: Path of the CSV to read; it must contain 'Player' and 'Min'.
        output_name: Path of the CSV to write. Defaults to 'Final.csv'.
        first_stat_col: Position of the first column to normalize. Defaults to 9.

    Note:
        With the defaults, calling this on 'Final.csv' overwrites the input, so
        calling it twice normalizes already-normalized values.
    """
    data = pd.read_csv(csv_name)

    stat_cols = data.columns[first_stat_col:]

    # A row with 0 minutes would produce inf and poison the mean/std of the
    # whole column; masking it makes only that row NaN.
    minutes = data['Min'].where(data['Min'] != 0)

    # Divide by number of minutes
    per_minute = data[stat_cols].div(minutes, axis=0)

    # Z-score
    z_scores = (per_minute - per_minute.mean()) / per_minute.std()

    # Replace whole columns by name rather than writing floats into integer
    # columns through iloc, which relies on implicit upcasting.
    data[stat_cols] = z_scores

    data.set_index('Player').to_csv(output_name)

In [9]:
# normalize() reads 'Final.csv' and writes its result back to 'Final.csv', so
# running this cell a second time re-normalizes already-normalized values.
normalize('Final.csv')

### We perform a couple of basic checks besides manually looking at the CSV: a z-scored column should have a mean of about 0 and a standard deviation of about 1.

In [10]:
data = pd.read_csv('Final.csv')

tkld_mean = data['Tkld'].mean()
sh_std = data['Sh'].std()

print(tkld_mean)
print(sh_std)

# A z-scored column has mean 0 and sample standard deviation 1 up to float
# rounding; asserting it makes the check fail loudly instead of relying on
# someone reading the printed numbers.
assert abs(tkld_mean) < 1e-9, f"'Tkld' mean should be ~0, got {tkld_mean}"
assert abs(sh_std - 1) < 1e-9, f"'Sh' std should be ~1, got {sh_std}"

2.4208177921845034e-17
1.0000000000000002
