In [1]:
# packages
import pandas as pd
import sqlite3
from datetime import timedelta
from dtype_dictionaries import create_dtype_dict
import math
import numpy as np

## Met Office (weather) data imputation
All the data is weather related and has a time dimension. 

#### Numeric values
(a) Sparse missing values

Impute by interpolating the time series of the same station (mean of the previous day's and the next day's value)

(b) Dense missing values

Impute by averaging the values of nearby stations on the same date


#### Wind direction 

Convert each compass direction to degrees, average the degrees reported on the same date by the three closest weather stations, and convert the result back to a compass direction

In [3]:
# Load the Met Office observations (df) and the weather-station coordinates
# (loc_df) from the formatted-zone SQLite database.
conn = sqlite3.connect('../data/formatted_zone/formatted_zone.db')
try:
    table_name = 'Metoffice_01_22_2023-10-13'
    # create_dtype_dict is a project helper; only date_columns is used here.
    dtype_dict, date_columns = create_dtype_dict(table_name)
    df = pd.read_sql_query(f"SELECT * FROM \"{table_name}\";", conn, parse_dates=date_columns)

    locations_table_name = 'weather_station_locations_2023-10-20'
    dtype_dict, date_columns = create_dtype_dict(locations_table_name)
    loc_df = pd.read_sql_query(f"SELECT * FROM \"{locations_table_name}\";", conn, parse_dates=date_columns)
finally:
    # Release the database handle even when a lookup or query above fails.
    conn.close()

Metoffice_01_22_2023-10-13
weather_station_locations_2023-10-20


In [4]:
def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the straight-line distance between two (lat, lon) points.

    The coordinates are treated as plain Cartesian values (Britain is
    approximated as a flat plane), so the result is expressed in the same
    units as the inputs.
    """
    d_lat, d_lon = lat2 - lat1, lon2 - lon1
    return math.sqrt(d_lat ** 2 + d_lon ** 2)

In [5]:
def wdir_to_deg(wdir):
    """Convert a compass abbreviation such as "N", "SW" or "NNE" to degrees.

    The abbreviation is read from its last letter to its first. The last
    letter gives the starting bearing; every further letter moves the bearing
    halfway towards that cardinal point along the shorter arc of the circle.
    Going along the shorter arc is what makes north-westerly directions come
    out right ("NW" -> 315 rather than the plain average of 0 and 270).

    Parameters
    ----------
    wdir : str
        Compass abbreviation built from the letters N, E, S and W.

    Returns
    -------
    int or float
        Bearing in degrees within [0, 360). NaN is returned when `wdir` is
        not a string or is empty, i.e. for missing observations.

    Raises
    ------
    KeyError
        If `wdir` contains a character other than N, E, S or W.
    """
    cardinal_degrees = {
        "N": 0,
        "E": 90,
        "S": 180,
        "W": 270
    }

    # Missing values (NaN/None) and empty strings carry no direction.
    if not isinstance(wdir, str) or not wdir:
        return float("nan")

    letters = reversed(wdir)
    deg = cardinal_degrees[next(letters)]
    for letter in letters:
        # Signed shortest arc from the current bearing to the cardinal point, in [-180, 180).
        arc = (cardinal_degrees[letter] - deg + 180) % 360 - 180
        deg = (deg + arc / 2) % 360

    return deg

def deg_to_wdir(deg):
    """Convert a bearing in degrees to the nearest 16-point compass abbreviation.

    The circle is split into sixteen equal 22.5-degree sectors centred on the
    compass points (N = 0, NNE = 22.5, ..., NNW = 337.5), so every bearing maps
    to the closest point. Bearings outside [0, 360) are wrapped around first.

    Parameters
    ----------
    deg : int or float
        Bearing in degrees.

    Returns
    -------
    str or float
        Compass abbreviation. NaN is returned for a NaN bearing so that
        "no data" is not reported as north.
    """
    compass_points = [
        'N', 'NNE', 'NE', 'ENE',
        'E', 'ESE', 'SE', 'SSE',
        'S', 'SSW', 'SW', 'WSW',
        'W', 'WNW', 'NW', 'NNW'
    ]

    # NaN is the only value that differs from itself.
    if deg != deg:
        return float("nan")

    sector_width = 360 / len(compass_points)
    # Adding half a sector before truncating rounds to the nearest compass point;
    # the final modulo folds the upper half of the last sector back onto 'N'.
    sector = int((deg % 360) / sector_width + 0.5) % len(compass_points)
    return compass_points[sector]

In [6]:
# Impute every missing value of the Met Office table into a copy (imputed_df).
# For each missing cell:
#   1. if the same station has a numeric value on both the previous and the
#      next day, use the mean of those two values;
#   2. otherwise use the same-date mean over the nearest stations (for WDIR the
#      circular mean of the directions, converted back to an abbreviation).
# All look-ups read from the untouched df, so imputed values never feed into
# later imputations. Row labels are assumed to be unique (as produced by
# read_sql_query), because rows are addressed with .loc.
interpolation_threshold = 0.8  # only relevant to the percentage-based criterion in the TODO below
station_counts = df.Station_name.value_counts() / 2
imputed_df = df.copy(deep=True)

# The station itself (distance 0) plus its three nearest neighbours.
n_closest_stations = 4
closest_stations_cache = {}


def _is_real_number(value):
    """Return True when `value` is a non-missing int/float scalar (Python or NumPy)."""
    # The type test has to come first: np.isnan raises TypeError on strings such as WDIR values.
    return isinstance(value, (int, float, np.integer, np.floating)) and not np.isnan(value)


def _closest_stations(station):
    """Return the names of the stations nearest to `station` (itself included)."""
    if station not in closest_stations_cache:
        site_rows = loc_df.loc[loc_df.SITE == station]
        if site_rows.empty:
            raise KeyError(f"Station {station!r} has no entry in loc_df")
        # Scalars, not one-element Series, so euclidean_distance gets plain numbers.
        lat = site_rows.LAT.iloc[0]
        lon = site_rows.LON.iloc[0]
        # Distances are kept in a local Series so loc_df is left unchanged.
        distances = loc_df.apply(lambda row: euclidean_distance(lat, lon, row['LAT'], row['LON']), axis=1)
        nearest_index = distances.nsmallest(n_closest_stations).index
        closest_stations_cache[station] = list(loc_df.loc[nearest_index, 'SITE'])
    return closest_stations_cache[station]


for column in df.columns:
    for i in df.index[df[column].isna()]:
        station = df.loc[i, "Station_name"]
        date = df.loc[i, "Date"]

        # TODO check if we can interpolate: gate this on the share of missing values
        # per station and column (interpolation_threshold, station_counts).

        same_station = df.Station_name == station
        previous_day = df.loc[same_station & (df.Date == date - timedelta(days=1)), column]
        next_day = df.loc[same_station & (df.Date == date + timedelta(days=1)), column]
        prev_measure = previous_day.iloc[0] if len(previous_day) > 0 else np.nan
        post_measure = next_day.iloc[0] if len(next_day) > 0 else np.nan

        if _is_real_number(prev_measure) and _is_real_number(post_measure):
            interpolated_value = np.mean([prev_measure, post_measure])
        else:
            # Same-date observations of the nearest stations.
            closest_stations = _closest_stations(station)
            closest_df = df.loc[(df.Date == date) & df.Station_name.isin(closest_stations)]

            if column == 'WDIR':
                # Drop missing directions (at least the row being imputed) before converting.
                degrees = closest_df[column].dropna().apply(wdir_to_deg).astype(float).dropna()
                if degrees.empty:
                    interpolated_value = np.nan
                else:
                    # Circular mean: average the unit vectors so that e.g. 350 and 10
                    # degrees average to north instead of south.
                    radians = np.deg2rad(degrees)
                    mean_deg = np.rad2deg(np.arctan2(np.sin(radians).mean(), np.cos(radians).mean())) % 360
                    interpolated_value = deg_to_wdir(mean_deg)
            else:
                interpolated_value = closest_df[column].mean()

        imputed_df.loc[i, column] = interpolated_value

In [7]:
# Sanity check: count the values that are still missing in each column of imputed_df.
imputed_df.isna().sum()

Date            0
Station_no      0
Station_name    0
PRESS           0
WDIR            0
WSPD            0
CLOUD           0
TEMP            0
TDEW            0
dtype: int64

## Imputation of football-data

Missing values in the game-statistic columns are very hard to impute, so this section only deals with missing values in the betting-odds columns.

In [8]:
# Load the football-data table from the formatted-zone SQLite database.
# NOTE: this rebinds df, which held the Met Office table in the cells above.
conn = sqlite3.connect('../data/formatted_zone/formatted_zone.db')
try:
    table_name = 'football-data_2223_2023-10-13'
    # create_dtype_dict is a project helper; only date_columns is used here.
    dtype_dict, date_columns = create_dtype_dict(table_name)
    df = pd.read_sql_query(f"SELECT * FROM \"{table_name}\";", conn, parse_dates=date_columns)
finally:
    # Release the database handle even when the lookup or query above fails.
    conn.close()

football-data_2223_2023-10-13


In [51]:
def _match_result(home_goals, away_goals):
    """Derive the result code per match: "D" for equal goals, "H" when the home side scored more, else "A"."""
    return np.where(home_goals == away_goals, "D", np.where(home_goals > away_goals, "H", "A"))


# The recorded full-time and half-time results must agree with the goal counts.
full_time_mismatch = df.FTR != _match_result(df.FTHG, df.FTAG)
assert not full_time_mismatch.any()
half_time_mismatch = df.HTR != _match_result(df.HTHG, df.HTAG)
assert not half_time_mismatch.any()

# HS/AS must never be smaller than HST/AST.
assert not (df.HS < df.HST).any()
assert not (df.AS < df.AST).any()

In [29]:
# Impute missing betting odds into a copy of the table (football_imputed_df).
#
# Every market has a set of bookmaker columns plus an "Avg<market>" and a
# "Max<market>" column:
#   * a missing bookmaker quote takes the market average ("Avg<market>") of the
#     same match, or the mean of the bookmaker quotes when that is missing too;
#   * a missing "Avg<market>" / "Max<market>" takes the mean / max of the
#     bookmaker quotes of the same match.
#
# AHh and AHCh are deliberately left out of the Asian-handicap groups: in the
# football-data.co.uk column key they hold the size of the handicap, not odds,
# so they must not be averaged with (or filled from) odds columns.
bookmaker_columns_by_market = {
    "H": ["B365H", "BWH", "IWH", "PSH", "WHH", "VCH"],
    "D": ["B365D", "BWD", "IWD", "PSD", "WHD", "VCD"],
    "A": ["B365A", "BWA", "IWA", "PSA", "WHA", "VCA"],
    ">2.5": ["B365>2.5", "P>2.5"],
    "<2.5": ["B365<2.5", "P<2.5"],
    "AHH": ["B365AHH", "PAHH"],
    "AHA": ["B365AHA", "PAHA"],
    "CH": ["B365CH", "BWCH", "IWCH", "PSCH", "WHCH", "VCCH"],
    "CD": ["B365CD", "BWCD", "IWCD", "PSCD", "WHCD", "VCCD"],
    "CA": ["B365CA", "BWCA", "IWCA", "PSCA", "WHCA", "VCCA"],
    "C>2.5": ["B365C>2.5", "PC>2.5"],
    "C<2.5": ["B365C<2.5", "PC<2.5"],
    "CAHH": ["B365CAHH", "PCAHH"],
    "CAHA": ["B365CAHA", "PCAHA"],
}

# column -> (backup column or None, bookmaker columns of the same market)
imputation_rules = {}
for market, bookmaker_columns in bookmaker_columns_by_market.items():
    for bookmaker_column in bookmaker_columns:
        imputation_rules[bookmaker_column] = ("Avg" + market, bookmaker_columns)
    imputation_rules["Avg" + market] = (None, bookmaker_columns)
    imputation_rules["Max" + market] = (None, bookmaker_columns)

football_imputed_df = df.copy(deep=True)

# presumably the odds columns start at position 24 of this table; verify against the schema
for column in df.columns[24:]:
    if column not in imputation_rules:
        # No rule for this column: leave its missing values untouched.
        continue

    backup_column, original_columns = imputation_rules[column]
    available_columns = [name for name in original_columns if name in df.columns]
    use_max = column.startswith("Max")

    for i in df.index[df[column].isna()]:
        backup_value = df.loc[i, backup_column] if backup_column in df.columns else np.nan
        if not pd.isna(backup_value):
            imputed_value = backup_value
        else:
            bookmaker_quotes = df.loc[i, available_columns]
            imputed_value = bookmaker_quotes.max() if use_max else bookmaker_quotes.mean()

        football_imputed_df.loc[i, column] = imputed_value
        print(i, column, imputed_value)

79 P>2.5 1.25
79 P<2.5 3.9
79 PC>2.5 1.26
79 PC<2.5 3.87
