In [1]:
%run "../Functions/functions_v1.ipynb"

# Ingest and clean data
In this notebook we ingest in the scraped data and create two dataframes
*  cleaned_matches.csv: The 2016 - 2024 matches cleaned and joined together in one dataframe
*  cleaned_matches_stacked.csv: The cleaned_matches.csv dataframe with two rows per match with the format team and opponent. This table is created to simplify the training process. 

In [3]:
#ingest raw data
years = [x for x in range(2016, 2025)]

new_table = True
df_joined_fin = None
df_clean_fin = None

#Look through each of the years
for year in years:
    print(year)

    #Get data from year
    df_raw = pd.read_csv(f"../Data/{year}_matches.csv")

    #clean the scraped data using the clean_raw_scraped_df function
    df_clean = clean_raw_scraped_df(df_raw)

    #transform data into each team having a single row. This will make it easier to keep track of an individuals team performance
    #by being able to filter on df.team == "Afghanistan" for example, which will return all of Afghanistan's (home and away) matches
    df_merged = create_one_col_df(df_clean)
    df_stacked = get_opp_stats(df_merged)

    #Concat the different years of each table
    if new_table:
        
        df_clean_fin = df_clean
        df_stacked_fin = df_stacked
        new_table = False
    else:
        
        df_clean_fin = pd.concat([df_clean_fin, df_clean])
        df_stacked_fin = pd.concat([df_stacked_fin, df_stacked])

#Save down the two tables
df_clean_fin.to_csv(f'../Data/cleaned_matches.csv', index=False)
df_stacked_fin.to_csv(f'../Data/cleaned_matches_stacked.csv', index=False)

2016
2017
2018
2019
2020
2021
2022
2023
2024


In [4]:
df_clean_fin

Unnamed: 0,date,team_1,team_2,score_1,score_2,location,elo_change_1,elo_change_2,new_elo_1,new_elo_2,...,new_rank_2,old_elo_1,old_elo_2,old_rank_1,old_rank_2,elo_diff_1,elo_diff_2,result,result_class,game_id
0,2016-01-03,India,Afghanistan,2,1,South Asian Championship in India,+19,−19,1082,1124,...,174,1063,1143,181,174,-80,80,"[1, 0, 0]",T1,2016-01-03IndiaAfghanistan
1,2016-01-06,Estonia,Sweden,1,1,Friendly in the United Arab Emirates,+7,−7,1501,1760,...,29,1494,1767,87,31,-273,273,"[0, 1, 0]",D,2016-01-06EstoniaSweden
2,2016-01-08,Bangladesh,Sri Lanka,4,2,Bangabandhu Gold Cup in Bangladesh,+10,−10,900,765,...,211,890,775,202,211,115,-115,"[1, 0, 0]",T1,2016-01-08BangladeshSri Lanka
3,2016-01-08,Haiti,Trinidad and Tobago,1,0,Copa América qualifier in Panama,+28,−28,1504,1587,...,61,1476,1615,82,67,-139,139,"[1, 0, 0]",T1,2016-01-08HaitiTrinidad and Tobago
4,2016-01-08,Panama,Cuba,4,0,Copa América qualifier in Panama,+5,−5,1688,1335,...,125,1683,1340,42,128,343,-343,"[1, 0, 0]",T1,2016-01-08PanamaCuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654,2024-07-10,England,Netherlands,2,1,European Championship in Germany,+26,−26,1997,1963,...,8,1971,1989,6,9,-18,18,"[1, 0, 0]",T1,2024-07-10EnglandNetherlands
655,2024-07-10,Colombia,Uruguay,1,0,Copa América in the United States,+24,−24,2070,2003,...,6,2046,2027,2,7,19,-19,"[1, 0, 0]",T1,2024-07-10ColombiaUruguay
656,2024-07-13,Canada,Uruguay,2,2,Copa América in the United States,+16,−16,1756,1987,...,7,1740,2003,30,8,-263,263,"[0, 1, 0]",D,2024-07-13CanadaUruguay
657,2024-07-14,Spain,England,2,1,European Championship in Germany,+16,−16,2150,1981,...,7,2134,1997,2,8,137,-137,"[1, 0, 0]",T1,2024-07-14SpainEngland


In [5]:
df_stacked_fin

Unnamed: 0,game_id,date,location,team,opp,score,opp_score,elo,opp_elo,elo_diff,result_class
0,2016-01-03IndiaAfghanistan,2016-01-03,South Asian Championship in India,India,Afghanistan,2,1,1063,1143,-80,T1
1,2016-01-03IndiaAfghanistan,2016-01-03,South Asian Championship in India,Afghanistan,India,1,2,1143,1063,80,T2
2,2016-01-06EstoniaSweden,2016-01-06,Friendly in the United Arab Emirates,Estonia,Sweden,1,1,1494,1767,-273,D
3,2016-01-06EstoniaSweden,2016-01-06,Friendly in the United Arab Emirates,Sweden,Estonia,1,1,1767,1494,273,D
4,2016-01-08BangladeshSri Lanka,2016-01-08,Bangabandhu Gold Cup in Bangladesh,Bangladesh,Sri Lanka,4,2,890,775,115,T1
...,...,...,...,...,...,...,...,...,...,...,...
1313,2024-07-13CanadaUruguay,2024-07-13,Copa América in the United States,Canada,Uruguay,2,2,1740,2003,-263,D
1314,2024-07-14ArgentinaColombia,2024-07-14,Copa América in the United States,Argentina,Colombia,1,0,2149,2070,79,T1
1315,2024-07-14ArgentinaColombia,2024-07-14,Copa América in the United States,Colombia,Argentina,0,1,2070,2149,-79,T2
1316,2024-07-14SpainEngland,2024-07-14,European Championship in Germany,Spain,England,2,1,2134,1997,137,T1
