In [1]:
import pandas as pd
import os
from tqdm import tqdm
from data_wrapper import FootballPredictorDataWrapper, add_h2h_goals, add_h2h_results, add_h2h_stats

ModuleNotFoundError: No module named 'data_wrapper'

In [2]:
#this method takes every csv from every season and every league that will be used for modelling and runs data_wrapper.py on it
def prepare_data():
    for league_folder in tqdm(os.listdir('./modelling_data/raw_data')):
        for csv_file in tqdm(os.listdir(f'./modelling_data/raw_data/{league_folder}')):
            try:
                df = pd.read_csv(f'./modelling_data/raw_data/{league_folder}/{csv_file}')
                wrapper = FootballPredictorDataWrapper(df)
                df_prepared = wrapper.run()
                df_prepared.to_csv(f'./modelling_data/prepared_data/{league_folder}/{csv_file.split(".")[0]}-prepared.csv')
            except Exception as e:
                print(f'There was a problem with preparing data from {csv_file} -> {e}')

In [3]:
prepare_data()

  0%|          | 0/20 [00:00<?, ?it/s]

  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
100%|██████████| 6/6 [00:03<00:00,  1.74it/s]
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
100%|██████████| 6/6 [00:03<00:00,  1.64it/s]
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
  df['date_GMT'] = pd.to_datetime(df['date_GMT'])
100%|███

In [6]:
#this method takes every prepared csv from every season and every league that will be used for modelling, joins data from different seasons and adds h2h features
def join_data_and_add_h2h_features():
    for league_folder in tqdm(os.listdir('./modelling_data/prepared_data')):
        league_data_all_seasons = pd.DataFrame()
        for csv_file in tqdm(os.listdir(f'./modelling_data/prepared_data/{league_folder}')):
            data_from_specific_season = pd.read_csv(f'./modelling_data/prepared_data/{league_folder}/{csv_file}')
            league_data_all_seasons = pd.concat([league_data_all_seasons, data_from_specific_season], axis=0, ignore_index=True)
            
        league_data_all_seasons = add_h2h_results(league_data_all_seasons)
        league_data_all_seasons = add_h2h_goals(league_data_all_seasons)
        league_data_all_seasons = add_h2h_stats(league_data_all_seasons)
        league_data_all_seasons.drop(['team_home_x', 'team_away_x', 'team_home_y', 'team_away_y'], axis=1, inplace=True)
        
        league_data_all_seasons.to_csv(f'./modelling_data/concatenated_data/{league_folder}_data_prepared_all_seasons.csv', index=False)    

In [7]:
join_data_and_add_h2h_features()

100%|██████████| 6/6 [00:00<00:00, 86.56it/s]
100%|██████████| 6/6 [00:00<00:00, 87.01it/s]]
100%|██████████| 6/6 [00:00<00:00, 89.09it/s]]
100%|██████████| 6/6 [00:00<00:00, 68.08it/s]]
100%|██████████| 6/6 [00:00<00:00, 84.79it/s]]
100%|██████████| 6/6 [00:00<00:00, 101.92it/s]
100%|██████████| 6/6 [00:00<00:00, 80.13it/s]]
100%|██████████| 6/6 [00:00<00:00, 87.21it/s]]
100%|██████████| 6/6 [00:00<00:00, 93.16it/s]]
100%|██████████| 6/6 [00:00<00:00, 76.11it/s]]
100%|██████████| 6/6 [00:00<00:00, 76.56it/s]s]
100%|██████████| 6/6 [00:00<00:00, 65.99it/s]s]
100%|██████████| 7/7 [00:00<00:00, 67.63it/s]s]
100%|██████████| 7/7 [00:00<00:00, 63.35it/s]s]
100%|██████████| 6/6 [00:00<00:00, 75.52it/s]s]
100%|██████████| 6/6 [00:00<00:00, 96.77it/s]s]
100%|██████████| 6/6 [00:00<00:00, 74.36it/s]s]
100%|██████████| 6/6 [00:00<00:00, 96.95it/s]s]
100%|██████████| 6/6 [00:00<00:00, 73.69it/s]s]
100%|██████████| 6/6 [00:00<00:00, 91.09it/s]s]
100%|██████████| 20/20 [00:06<00:00,  2.94it/s]


In [1]:
def final_data_cleaning(data):
    try:
        data.drop('Unnamed: 0', axis=1, inplace=True)
    except KeyError:
        print('There is no old index column in dataframe.')
    
    """
    filling empties in stadium attendance
    """
    
    #step 1 - filling empties with mean value for home team and season
    data['attendance'] = data['attendance'].fillna(data.groupby(['home_team_name', 'season'])['attendance'].transform('mean'))
    
    #step 2 (optional) - filling empties with mean value for whole league and stadium
    data['attendance'] = data['attendance'].fillna(data.groupby(['league', 'season'])['attendance'].transform('mean'))

    #step 3 (optional) - filling empties with mean value for whole league across all seasons
    data['attendance'] = data['attendance'].fillna(data.groupby(['league'])['attendance'].transform('mean'))
    
    """
    renaming columns
    """

    data.rename(columns={
        'Game Week': 'game_week',
        'team_a_xg': 'home_team_xg',
        'team_b_xg': 'away_team_xg'
    }, inplace=True)
    
    data.columns = [col.lower() for col in data.columns]    

    """
    removing unnecessary features
    """
    
    features_to_drop = [
        'referee',
        'home_ppg',
        'away_ppg',
        'stadium_name'
    ]
    
    data.drop(features_to_drop, axis=1, inplace=True)
    
    return data

In [2]:
import pandas as pd

In [4]:
data = pd.read_csv(r'..\modelling_data\concatenated_data\austria_data_prepared_all_seasons.csv')

In [5]:
data

Unnamed: 0.1,Unnamed: 0,attendance,home_team_name,away_team_name,referee,Game Week,home_ppg,away_ppg,home_team_goal_count,away_team_goal_count,...,average_total_fouls_in_away_team_games_in_last_5_games,average_fouls_by_home_team_in_last_5_games,average_fouls_by_away_team_in_last_5_games,h2h_home_team_wins_pre_game,h2h_away_team_wins_pre_game,h2h_draws_pre_game,average_goals_h2h,average_yellow_cards_h2h,average_red_cards_h2h,average_corners_h2h
0,0,13155.0,Austria Wien,Wacker Innsbruck,Christopher Jäger,1.0,1.50,0.75,2,1,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
1,1,10785.0,Sturm Graz,Hartberg,Rene Eisner,1.0,0.94,0.88,3,2,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
2,2,4523.0,Rheindorf Altach,Mattersburg,Dieter Muckenhammer,1.0,0.94,1.00,2,3,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
3,3,11532.0,Salzburg,LASK Linz,Harald Lechner,1.0,2.75,2.06,3,1,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
4,4,5200.0,Admira,Rapid Wien,Alexander Harkam,1.0,1.00,1.41,0,3,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165,190,,Sturm Graz,Austria Klagenfurt,Harald Lechner,10.0,2.13,1.25,2,0,...,18.4,13.8,8.2,9,1,1,2.909091,3.181818,0.181818,9.636364
1166,191,,Rapid Wien,Hartberg,Markus Hameter,10.0,1.19,1.35,0,3,...,18.6,10.6,7.4,7,6,4,3.411765,4.705882,0.176471,11.411765
1167,192,,Wolfsberger AC,Austria Wien,Stefan Ebner,,1.39,1.57,1,2,...,24.4,11.4,12.2,9,7,4,2.550000,4.050000,0.250000,10.200000
1168,193,,Austria Wien,Hartberg,Sebastian Gishamer,,1.57,1.32,2,1,...,18.8,13.2,8.0,10,5,2,3.235294,3.588235,0.176471,10.000000


In [6]:
data_clean = final_data_cleaning(data)

In [7]:
data_clean

Unnamed: 0,attendance,home_team_name,away_team_name,game_week,home_team_goal_count,away_team_goal_count,total_goal_count,total_goals_at_half_time,home_team_goal_count_half_time,away_team_goal_count_half_time,...,average_total_fouls_in_away_team_games_in_last_5_games,average_fouls_by_home_team_in_last_5_games,average_fouls_by_away_team_in_last_5_games,h2h_home_team_wins_pre_game,h2h_away_team_wins_pre_game,h2h_draws_pre_game,average_goals_h2h,average_yellow_cards_h2h,average_red_cards_h2h,average_corners_h2h
0,13155.000000,Austria Wien,Wacker Innsbruck,1.0,2,1,3,2,1,1,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
1,10785.000000,Sturm Graz,Hartberg,1.0,3,2,5,3,2,1,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
2,4523.000000,Rheindorf Altach,Mattersburg,1.0,2,3,5,2,1,1,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
3,11532.000000,Salzburg,LASK Linz,1.0,3,1,4,3,3,0,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
4,5200.000000,Admira,Rapid Wien,1.0,0,3,3,3,0,3,...,0.0,0.0,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165,4578.833219,Sturm Graz,Austria Klagenfurt,10.0,2,0,2,0,0,0,...,18.4,13.8,8.2,9,1,1,2.909091,3.181818,0.181818,9.636364
1166,4578.833219,Rapid Wien,Hartberg,10.0,0,3,3,1,0,1,...,18.6,10.6,7.4,7,6,4,3.411765,4.705882,0.176471,11.411765
1167,4578.833219,Wolfsberger AC,Austria Wien,,1,2,3,1,0,1,...,24.4,11.4,12.2,9,7,4,2.550000,4.050000,0.250000,10.200000
1168,4578.833219,Austria Wien,Hartberg,,2,1,3,0,0,0,...,18.8,13.2,8.0,10,5,2,3.235294,3.588235,0.176471,10.000000


In [9]:
data_clean.to_csv('../modelling_data/concatenated_data/cleaned_data/austria_data_prepared_all_seasons_cleaned.csv', index=False)

In [10]:
data_clean.isna().sum()

attendance                   0
home_team_name               0
away_team_name               0
game_week                   12
home_team_goal_count         0
                            ..
h2h_draws_pre_game           0
average_goals_h2h            0
average_yellow_cards_h2h     0
average_red_cards_h2h        0
average_corners_h2h          0
Length: 162, dtype: int64

In [None]:
for csv_file in os.listdir('./modelling_data/concatenated_data/')

In [15]:
root

'./modelling_data/concatenated_data/cleaned_data'

In [18]:
os.path.join(root, name)

'./modelling_data/concatenated_data/cleaned_data\\ukraine_data_prepared_all_seasons.csv'

In [19]:
os.path.join('./modelling_data/concatenated_data', 'cleaned_data', 'data.csv')

'./modelling_data/concatenated_data\\cleaned_data\\data.csv'

In [25]:
for root, _, files in os.walk('./modelling_data/concatenated_data/'):
    for name in tqdm(files):
        csv_file_path = os.path.join(root, name)
        data = pd.read_csv(csv_file_path)
        data_clean = final_data_cleaning(data)
        data_clean.to_csv(os.path.join(root, 'cleaned_data', f'{name.split(".")[0]}_cleaned.csv'), index=False)

100%|██████████| 20/20 [00:04<00:00,  4.20it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

There is no old index column in dataframe.





KeyError: "['referee', 'home_ppg', 'away_ppg', 'stadium_name'] not found in axis"

In [37]:
norway_data = pd.read_csv(r'modelling_data\concatenated_data\norway_data_prepared_all_seasons.csv')

In [39]:
norway_data_clean = final_data_cleaning(norway_data)

In [40]:
norway_data_clean.tail()

Unnamed: 0,attendance,home_team_name,away_team_name,game_week,home_team_goal_count,away_team_goal_count,total_goal_count,total_goals_at_half_time,home_team_goal_count_half_time,away_team_goal_count_half_time,...,average_total_fouls_in_away_team_games_in_last_5_games,average_fouls_by_home_team_in_last_5_games,average_fouls_by_away_team_in_last_5_games,h2h_home_team_wins_pre_game,h2h_away_team_wins_pre_game,h2h_draws_pre_game,average_goals_h2h,average_yellow_cards_h2h,average_red_cards_h2h,average_corners_h2h
1675,7422.941566,Kristiansund,Rosenborg,30,0,0,0,0,0,0,...,-2.0,-1.0,-1.0,1,5,5,2.545455,2.909091,0.090909,14.818182
1676,7422.941566,Strømsgodset,Molde,30,0,0,0,0,0,0,...,-2.0,-1.0,-1.0,1,11,1,3.615385,2.538462,0.076923,9.0
1677,7422.941566,Sandefjord,Fredrikstad,30,0,0,0,0,0,0,...,-2.0,-1.0,-1.0,0,1,0,1.0,2.0,0.0,11.0
1678,7422.941566,Haugesund,Odd,30,0,0,0,0,0,0,...,-2.0,-1.0,-1.0,6,3,4,3.692308,3.461538,0.0,10.076923
1679,7422.941566,FK Bodo - Glimt,Lillestrøm,30,0,0,0,0,0,0,...,-2.0,-1.0,-1.0,8,0,3,2.909091,3.0,0.0,11.181818


In [12]:
import os

In [13]:
all_data = pd.DataFrame()

for csv_file in os.listdir('../modelling_data/concatenated_data/cleaned_data'):
    data_from_specific_league = pd.read_csv(f'../modelling_data/concatenated_data/cleaned_data/{csv_file}', delimiter=';')
    all_data = pd.concat([all_data, data_from_specific_league], axis=0)
    
all_data.to_csv('../modelling_data/concatenated_data/cleaned_data/all_games_1819_2324_cleaned.csv', index=False)