In [35]:
import pandas as pd

transform from all divisions and year into 1 dataframe for each country

In [36]:
import os
import pandas as pd

root_dir = "data"
folder_dfs = {}  # Nested dict: country -> subfolder -> CSV -> DataFrame

# Walk through all directories
for dirpath, dirnames, filenames in os.walk(root_dir):
    if dirpath == root_dir:
        continue  # skip root

    # Split the path components relative to data/
    rel_path = os.path.relpath(dirpath, root_dir).split(os.sep)

    # Expecting: ["england", "0"] or ["england", "1"], etc.
    if len(rel_path) != 2:
        continue  # skip if not exactly two levels below root

    country, subfolder = rel_path

    # Initialize nested dicts
    folder_dfs.setdefault(country, {})
    folder_dfs[country].setdefault(subfolder, {})

    # Loop through CSV files
    for filename in filenames:
        if filename.endswith(".csv"):
            file_path = os.path.join(dirpath, filename)
            df = pd.read_csv(file_path)

            csv_name = os.path.splitext(filename)[0]  # e.g. "1920"
            folder_dfs[country][subfolder][csv_name] = df

# Check the structure
for country, subfolders in folder_dfs.items():
    print(f"\nCountry: {country}")
    for sub, csvs in subfolders.items():
        print(f"Subfolder: {sub}")
        for name, df in csvs.items():
            print(f"    - {name}.csv → shape={df.shape}")


def concat_country_data(country_name):
    """Concatenate all CSVs for a given country (across all subfolders)."""
    # Flatten all subfolder DataFrames into one list
    dfs = []
    for subfolder in folder_dfs[country_name].values():
        dfs.extend(subfolder.values())
    # Combine them into one big DataFrame
    return pd.concat(dfs, ignore_index=True)

# Build DataFrames per country
belgium_data     = concat_country_data("belgium")
england_data     = concat_country_data("england")
france_data      = concat_country_data("france")
germany_data     = concat_country_data("germany")
greece_data      = concat_country_data("greece")
italy_data       = concat_country_data("italy")
netherlands_data = concat_country_data("netherlands")
portugal_data    = concat_country_data("portugal")
scotland_data    = concat_country_data("scotland")
spain_data       = concat_country_data("spain")
turkey_data      = concat_country_data("turkey")



FILEPATH=""
country_dfs = {
    "belgium": belgium_data,
    "england": england_data,
    "france": france_data,
    "germany": germany_data,
    "greece": greece_data,
    "italy": italy_data,
    "netherlands": netherlands_data,
    "portugal": portugal_data,
    "scotland": scotland_data,
    "spain": spain_data,
    "turkey": turkey_data
}


output_dir = "combined_csvs"  # folder to save all CSVs
os.makedirs(output_dir, exist_ok=True)  # create folder if it doesn't exist

for country, df in country_dfs.items():
    # Safe file path
    file_path = os.path.join(output_dir, f"{country}.csv")
    
    df.to_csv(file_path, index=False)
    print(f" Saved {file_path} (shape={df.shape})")


Country: belgium
Subfolder: 1
    - 1920.csv → shape=(232, 105)
    - 2021.csv → shape=(306, 105)
    - 2122.csv → shape=(306, 106)
    - 2223.csv → shape=(306, 105)
    - 2324.csv → shape=(312, 105)
    - 2425.csv → shape=(312, 121)

Country: england
Subfolder: 0
    - 1920.csv → shape=(380, 106)
    - 2021.csv → shape=(380, 106)
    - 2122.csv → shape=(380, 106)
    - 2223.csv → shape=(380, 106)
    - 2324.csv → shape=(380, 106)
    - 2425.csv → shape=(380, 120)
Subfolder: 1
    - 1920.csv → shape=(552, 106)
    - 2021.csv → shape=(552, 106)
    - 2122.csv → shape=(552, 106)
    - 2223.csv → shape=(552, 106)
    - 2324.csv → shape=(552, 106)
    - 2425.csv → shape=(552, 120)
Subfolder: 2
    - 1920.csv → shape=(400, 106)
    - 2021.csv → shape=(552, 106)
    - 2122.csv → shape=(552, 106)
    - 2223.csv → shape=(552, 106)
    - 2324.csv → shape=(552, 106)
    - 2425.csv → shape=(552, 120)
Subfolder: 3
    - 1920.csv → shape=(440, 106)
    - 2021.csv → shape=(552, 106)
    - 2122.csv 

all the single countries import for all years and all divisions

In [37]:
#this import a bit useless since data is already here, but in case of separation its good 
#import of data
belgium_data=pd.read_csv(r'combined_csvs\belgium.csv')
eng_data = pd.read_csv(r'combined_csvs\england.csv')
fr_data=pd.read_csv(r'combined_csvs\france.csv')
d_data=pd.read_csv(r'combined_csvs\germany.csv')
gr_data=pd.read_csv(r'combined_csvs\greece.csv')
it_data=pd.read_csv(r'combined_csvs\italy.csv')
ne_data=pd.read_csv(r'combined_csvs\netherlands.csv')
por_data=pd.read_csv(r'combined_csvs\portugal.csv')
sc_data=pd.read_csv(r'combined_csvs\scotland.csv')
sp_data=pd.read_csv(r'combined_csvs\spain.csv')
tur_data=pd.read_csv(r'combined_csvs\turkey.csv')

  sc_data=pd.read_csv(r'combined_csvs\scotland.csv')


# function creates the small dataset(s) for A0 model

In [None]:
col_list_original=['Div','Date','Time','HomeTeam','AwayTeam','FTHG','FTAG','FTR']
def original_data(df,feature_list):
    df=df[feature_list]
    df[['Country', 'Division']] =df['Div'].str.extract(r'([A-Za-z]+)(\d+)')
    df.drop(columns=['Div'], inplace=True)
    df['Total_goals']=df['FTHG']+df['FTAG']
    df['Target']= (df['Total_goals']>2.5).astype(int) #1 if more than 2 goals were scored, else 0
    return df


belgium_data= original_data(belgium_data,col_list_original)
eng_data = original_data(eng_data,col_list_original)
fr_data=original_data(fr_data,col_list_original)
d_data=original_data(d_data,col_list_original)
gr_data=original_data(gr_data,col_list_original)
it_data=original_data(it_data,col_list_original)
ne_data=original_data(ne_data,col_list_original)
por_data=original_data(por_data,col_list_original)
sc_data=original_data(sc_data,col_list_original)
sp_data=original_data(sp_data,col_list_original)
tur_data=original_data(tur_data,col_list_original)

data_og=pd.DataFrame()

for df in [belgium_data, eng_data, fr_data, d_data, gr_data, it_data, ne_data, por_data, sc_data, sp_data, tur_data]:
    data_og=pd.concat([data_og,df], ignore_index=True)
data_og.info()
data_og.head()

data_og.to_csv(r'combined_csvs\full_data.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Country', 'Division']] =df['Div'].str.extract(r'([A-Za-z]+)(\d+)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Country', 'Division']] =df['Div'].str.extract(r'([A-Za-z]+)(\d+)')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Div'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42593 entries, 0 to 42592
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         42593 non-null  object
 1   Time         42593 non-null  object
 2   HomeTeam     42593 non-null  object
 3   AwayTeam     42593 non-null  object
 4   FTHG         42593 non-null  int64 
 5   FTAG         42593 non-null  int64 
 6   FTR          42593 non-null  object
 7   Country      42593 non-null  object
 8   Division     42593 non-null  object
 9   Total_goals  42593 non-null  int64 
 10  Target       42593 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 3.6+ MB


# creation of bigger dataset

In [None]:
'''Planned features for a1 dataset creation:
original dataset: 
Div (later split into)
    Country
    Division
Date (later split into)
    Year
    Month
    Dayofweek
    isweekend
Time (only keep hour, then standartize ?   range(0-24) - midnight is 0, noon is 12, etc.
HomeTeam encoded 
AwayTeam encoded same as HomeTeam
FTHG- get averages, then drop    expected   range(0-6)
FTAG- get averages, then drop    expected   range(0-6)
FTR- drop (full time result) not needed IDK BOUT THI ONE GUYS)

-------------------------------------------
bigger dataset:
will be used for shot conversion:
    HST = Home Team Shots on Target
    AST = Away Team Shots on Target
    HHW = Home Team Hit Woodwork
    AHW = Away Team Hit Woodwork
    formula: shot conversion=goals scored/(shot on target+hit woodwork)... in last 5 games
    Note: result can be used with average number of ((Shots on target+woodwork)/shots)* shot conversion
referee:
    boolean if referee mentioned in data
    then encoded referee
BetBrain Over/Under variables:
    BbOu- can be used as weight for the averages and maximums
    BbMx>2.5 = Betbrain maximum over 2.5 goals
    BbAv>2.5 = Betbrain average over 2.5 goals
    BbMx<2.5 = Betbrain maximum under 2.5 goals
    BbAv<2.5 = Betbrain average under 2.5 goals
BetBrain AsianHandicap variables:
    BbAH = -can be used as weight for the averages and maximums
    BbAHh = Betbrain size of handicap (home team)
    AHh = Market size of handicap (home team) (since 2019/2020) IDK what to do with this one
    BbMxAHH = Betbrain maximum Asian handicap home team odds
    BbAvAHH = Betbrain average Asian handicap home team odds
    BbMxAHA = Betbrain maximum Asian handicap away team odds
    BbAvAHA = Betbrain average Asian handicap away team odds 
    Note: the home and away asian handicap are inverse of each other, so only one is necessary

    MaxAHH = Market maximum Asian handicap home team odds
    MaxAHA = Market maximum Asian handicap away team odds	
    AvgAHH = Market average Asian handicap home team odds
    AvgAHA = Market average Asian handicap away team odds
    Note: idk if this it a whole market or just market for a specific bookmaker

-------------------------------------------
target variable necessarry:
Total_goals - sum of FTHG and FTAG  range(0-12)
Target - (Total_goals>2.5)  0 or 1
-------------------------------------------
'''

col_list_a1=['Div','Date','Time','HomeTeam','AwayTeam','FTHG','FTAG','FTR']
def a1_data(df,feature_list):
    
    df=df[feature_list]
    df[['Country', 'Division']] =df['Div'].str.extract(r'([A-Za-z]+)(\d+)')
    df.drop(columns=['Div'], inplace=True)
    df['Total_goals']=df['FTHG']+df['FTAG']
    df['Target']= (df['Total_goals']>2.5).astype(int) #1 if more than 2 goals were scored, else 0
    return df


b_a1= a1_data(belgium_data,col_list_a1)
eng_a1 = a1_data(eng_data,col_list_a1)
fr_a1=a1_data(fr_data,col_list_a1)
d_a1=a1_data(d_data,col_list_a1)
gr_a1=a1_data(gr_data,col_list_a1)
it_a1=a1_data(it_data,col_list_a1)
ne_a1=a1_data(ne_data,col_list_a1)
por_a1=a1_data(por_data,col_list_a1)
sc_a1=a1_data(sc_data,col_list_a1)
sp_a1=a1_data(sp_data,col_list_a1)
tur_a1=a1_data(tur_data,col_list_a1)

KeyError: "['Div'] not in index"

# Data encoding and transformation

going to create function  to prepare data by the same key for all the files, full or country by country if chosen to train 11 different models

NOTE: A0 means the model with original data, A1 will be the model with newer data in this file

In [None]:
'''
Date - idk wtf to do with dat
Time - only keep hour, then standartize ?   range(0-24) - midnight is 0, noon is 12, etc.
HomeTeam - encode same as away              range(idk)
AwayTeam- Encode same as home               range(idk)
FTHG- get averages, then drop    expected   range(0-6)
FTAG-get averages, then drop     expected   range(0-6)
FTR- get winrate( in 0-1 range) then drop   range(0-1)
Country- dummies, only for full dataset     {0,1}
Division - dummies                          {0,1}
Total_goals- drop in the end i guess
Target - KEEP TARGET                        values:{0,1} Note: 0 is less than 2.5, 1 is more than 2.5
'''
SEED=66
from sklearn.preprocessing import LabelEncoder

team_encoder = LabelEncoder()
df['HomeTeam_enc'] = team_encoder.fit_transform(df['HomeTeam'])
df['AwayTeam_enc'] = team_encoder.fit_transform(df['AwayTeam'])

def transf_encode_a0(df):
    #encodery:
    '''
    jestli chcete i transformovat s custom seedem
    if seed is None:
    seed=42
    #misto pro encodery:
    '''
    team_encoder = LabelEncoder()
    #all this should work generally
    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
    df['Time'] = pd.to_datetime(df['Time'], format='%H:%M').dt.hour
    #mean goals calc:
    df=df.sort_values('Date')# sort to make sure averages are for the 5 previous matches
    mean_goals=df.groupby  
    # 'HomeTeam','AwayTeam' same encoding to keep the team numbers same for easiness
    #home
    df['HomeTeam_enc'] = team_encoder.fit_transform(df['HomeTeam'])

    df = df.sort_values(["HomeTeam_enc", "Date"])
    df['avg_goals_in_last5_home']=(
    df.groupby("HomeTeam_enc")["FTHG"]
      .transform(lambda x: x.shift().rolling(5, min_periods=1).mean()))
    
    df['avg_goals_conceded_last5_home']=(
    df.groupby("HomeTeam_enc")["FTAG"]
      .transform(lambda x: x.shift().rolling(5, min_periods=1).mean()))
    

    #away
    df['AwayTeam_enc'] = team_encoder.transform(df['AwayTeam'])

    df = df.sort_values(["AwayTeam_enc", "Date"])
    df['avg_goals_in_last5_away']= (
    df.groupby("AwayTeam_enc")["FTAG"]
      .transform(lambda x: x.shift().rolling(5, min_periods=1).mean()))

    df['avg_goals_conceded_last5_away']=(
    df.groupby("AwayTeam_enc")["FTHG"]
      .transform(lambda x: x.shift().rolling(5, min_periods=1).mean()))
    

    df=df.sort_values('Date')# sort back to original order
        
    # Basic calendar-based features
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Dayofweek'] = df['Date'].dt.dayofweek  # Monday=0, Sunday=6
    df['Is_weekend'] = df['Dayofweek'].isin([5, 6]).astype(int)
    #this should make it work for the full file - dummies for country, division should be just fine i guess :eshrug:
    if 'Country' in df.columns:
        df=pd.get_dummies(df, columns=['Country','Division'],drop_first=True)
        #encode country/dummies
    else:
        df=pd.get_dummies(df, columns=['Division'],drop_first=True)
        df=df.drop(columns=['Country'])


    
    df=df.drop(columns=['Total_goals','FTHG','FTAG','FTR','HomeTeam','AwayTeam','Date'])
    return df
    


In [None]:
data_test1=transf_encode_a0(eng_data)


ValueError: time data "20" doesn't match format "%H:%M", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [41]:
data_test2=transf_encode_a0(data_og)

In [44]:

data_test2.tail(20)
data_test2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42593 entries, 19697 to 40416
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Time                           42593 non-null  int32  
 1   Target                         42593 non-null  int64  
 2   HomeTeam_enc                   42593 non-null  int64  
 3   avg_goals_in_last5_home        42115 non-null  float64
 4   avg_goals_conceded_last5_home  42115 non-null  float64
 5   AwayTeam_enc                   42593 non-null  int64  
 6   avg_goals_in_last5_away        42115 non-null  float64
 7   avg_goals_conceded_last5_away  42115 non-null  float64
 8   Year                           42593 non-null  int32  
 9   Month                          42593 non-null  int32  
 10  Dayofweek                      42593 non-null  int32  
 11  Is_weekend                     42593 non-null  int64  
 12  Country_D                      42593 non-null  

transformation for A1(model with more data)

In [None]:
'''
Date - idk wtf to do with dat
Time - only keep hour, then standartize ?   range(0-24) - midnight is 0, noon is 12, etc.
HomeTeam - encode same as away              range(idk)
AwayTeam- Encode same as home               range(idk)
FTHG- get averages, then drop    expected   range(0-6)
FTAG-get averages, then drop     expected   range(0-6)
FTR- get winrate( in 0-1 range) then drop   range(0-1)
Country- dummies, only for full dataset     {0,1}
Division - dummies                          {0,1}
Total_goals- drop in the end i guess
Target - KEEP TARGET                        values:{0,1} Note: 0 is less than 2.5, 1 is more than 2.5

'''

def transf_encode_a1(df):
    #encodery:
    '''
    jestli chcete i transformovat s custom seedem
    if seed is None:
    seed=42
    #misto pro encodery:
    '''
    team_encoder = LabelEncoder()
    #all this should work generally
    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
    df['Time'] = pd.to_datetime(df['Time'], format='%H:%M').dt.hour
    #mean goals calc:
    df=df.sort_values('Date')# sort to make sure averages are for the 5 previous matches
    mean_goals=df.groupby  
    # 'HomeTeam','AwayTeam' same encoding to keep the team numbers same for easiness
    #home
    df['HomeTeam_enc'] = team_encoder.fit_transform(df['HomeTeam'])

    df = df.sort_values(["HomeTeam_enc", "Date"])
    df['avg_goals_in_last5_home']=(
    df.groupby("HomeTeam_enc")["FTHG"]
      .transform(lambda x: x.shift().rolling(5, min_periods=1).mean()))
    
    df['avg_goals_conceded_last5_home']=(
    df.groupby("HomeTeam_enc")["FTAG"]
      .transform(lambda x: x.shift().rolling(5, min_periods=1).mean()))
    

    #away
    df['AwayTeam_enc'] = team_encoder.transform(df['AwayTeam'])

    df = df.sort_values(["AwayTeam_enc", "Date"])
    df['avg_goals_in_last5_away']= (
    df.groupby("AwayTeam_enc")["FTAG"]
      .transform(lambda x: x.shift().rolling(5, min_periods=1).mean()))

    df['avg_goals_conceded_last5_away']=(
    df.groupby("AwayTeam_enc")["FTHG"]
      .transform(lambda x: x.shift().rolling(5, min_periods=1).mean()))
    

    df=df.sort_values('Date')# sort back to original order
        
    # Basic calendar-based features
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Dayofweek'] = df['Date'].dt.dayofweek  # Monday=0, Sunday=6
    df['Is_weekend'] = df['Dayofweek'].isin([5, 6]).astype(int)
    #this should make it work for the full file - dummies for country, division should be just fine i guess :eshrug:
    if 'Country' in df.columns:
        df=pd.get_dummies(df, columns=['Country','Division'],drop_first=True)
        #encode country/dummies
    else:
        df=pd.get_dummies(df, columns=['Division'],drop_first=True)
        df=df.drop(columns=['Country'])


    
    df=df.drop(columns=['Total_goals','FTHG','FTAG','FTR','HomeTeam','AwayTeam','Date'])
    return df