In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Season files used for training, listed oldest first. The position in this
# list determines the `season` tag written below (2013, 2014, ...).
train_files = [
    'Datasets/Processed/season-1314_with_weather.csv',
    'Datasets/Processed/season-1415_with_weather.csv',
    'Datasets/Processed/season-1516_with_weather.csv',
    'Datasets/Processed/season-1617_with_weather.csv'
]

# Read every season with an explicit day-first date format and tag it with
# the year the season started in.
train_dfs = []
for season_offset, csv_path in enumerate(train_files):
    season_df = pd.read_csv(csv_path, parse_dates=['Date'], date_format='%d/%m/%y')
    train_dfs.append(season_df.assign(season=2013 + season_offset))

# Stack the seasons and order matches by date with a fresh 0..n-1 index.
stacked_seasons = pd.concat(train_dfs)
train_raw = stacked_seasons.sort_values('Date').reset_index(drop=True)

# Verification
match_dates = train_raw['Date']
print("Date column sample:")
print(match_dates.head(3))
print("\nDate range:", match_dates.min(), "to", match_dates.max())

Date column sample:
0   2013-08-09
1   2013-08-10
2   2013-08-10
Name: Date, dtype: datetime64[ns]

Date range: 2013-08-09 00:00:00 to 2017-05-20 00:00:00


In [3]:
def _prior_rolling(values, team_keys, window, min_periods, stat='mean'):
    """Rolling statistic over each team's previous matches.

    Args:
        values: Series of per-match values (numbers or boolean flags).
        team_keys: Series, aligned with ``values``, naming the team each row
            belongs to; windows never cross from one team to another.
        window: maximum number of previous matches in the window.
        min_periods: minimum number of previous matches required, otherwise
            the result is NaN.
        stat: ``'mean'`` (default) or ``'sum'``.

    Returns:
        Series aligned with ``values``.
    """
    def _per_team(history):
        # shift(1) removes the current match, so only earlier rows of the same
        # team feed the window; the shifted-in gap stays NaN and therefore is
        # not counted towards min_periods.
        windowed = history.shift(1).rolling(window, min_periods=min_periods)
        return windowed.sum() if stat == 'sum' else windowed.mean()

    return values.astype(float).groupby(team_keys).transform(_per_team)


def create_features(df):
    """Add league, form, defence, attack, discipline, comeback and H2H features.

    The frame is modified in place (new columns are added) and also returned.
    Rolling features use row order as match order, so ``df`` is assumed to be
    sorted chronologically - TODO confirm for every caller.

    Expected columns: season, Date, HomeTeam, AwayTeam, FTHG, FTAG, FTR, HTR,
    HS, AS, HY, AY.

    Every rolling feature is computed per team from that team's earlier rows
    only (home features from earlier home matches, away features from earlier
    away matches); rows without enough history are NaN.

    Args:
        df: match-level DataFrame.

    Returns:
        The same DataFrame object with the feature columns added.
    """
    home_keys = df['HomeTeam']
    away_keys = df['AwayTeam']

    # -------------------------------------------------------------------------
    # 1. League averages for fallback
    # -------------------------------------------------------------------------
    # NOTE(review): these are whole-season means, i.e. they include the match
    # itself and later matches of that season; they are not strictly pre-match.
    df['league_home_avg'] = df.groupby('season')['FTHG'].transform('mean')
    df['league_away_avg'] = df.groupby('season')['FTAG'].transform('mean')

    # -------------------------------------------------------------------------
    # 2. Team Form Features (Basic)
    # -------------------------------------------------------------------------
    form_spec = (('HomeTeam', 'FTHG', 'H'), ('AwayTeam', 'FTAG', 'A'))
    for team_type, goals_col, win_code in form_spec:
        # Goals scored over the last 5 matches
        df[f'{team_type}_goals_5avg'] = _prior_rolling(
            df[goals_col], df[team_type], window=5, min_periods=1)

        # Win percentage over the last 10 matches; the window is evaluated per
        # team so results of other clubs never leak into it.
        df[f'{team_type}_win_pct'] = _prior_rolling(
            df['FTR'] == win_code, df[team_type], window=10, min_periods=3)

    # -------------------------------------------------------------------------
    # 3. Defensive Stability Features
    # -------------------------------------------------------------------------
    # Home Team (concedes the away side's goals)
    df['Home_clean_sheets_5'] = _prior_rolling(
        df['FTAG'] == 0, home_keys, window=5, min_periods=3, stat='sum')
    df['Home_goals_conceded_5avg'] = _prior_rolling(
        df['FTAG'], home_keys, window=5, min_periods=3)

    # Away Team (concedes the home side's goals)
    df['Away_clean_sheets_5'] = _prior_rolling(
        df['FTHG'] == 0, away_keys, window=5, min_periods=3, stat='sum')
    df['Away_goals_conceded_5avg'] = _prior_rolling(
        df['FTHG'], away_keys, window=5, min_periods=3)

    # -------------------------------------------------------------------------
    # 4. Attack Efficiency Features
    # -------------------------------------------------------------------------
    # Ratio of average goals to average shots over the last 3 matches; the
    # 0.001 keeps the denominator non-zero when a team had no shots.
    df['Home_shot_conv_3'] = (
        _prior_rolling(df['FTHG'], home_keys, window=3, min_periods=1)
        / (_prior_rolling(df['HS'], home_keys, window=3, min_periods=1) + 0.001)
    )
    df['Away_shot_conv_3'] = (
        _prior_rolling(df['FTAG'], away_keys, window=3, min_periods=1)
        / (_prior_rolling(df['AS'], away_keys, window=3, min_periods=1) + 0.001)
    )

    # -------------------------------------------------------------------------
    # 5. Discipline Features
    # -------------------------------------------------------------------------
    df['Home_avg_cards_5'] = _prior_rolling(df['HY'], home_keys, window=5, min_periods=3)
    df['Away_avg_cards_5'] = _prior_rolling(df['AY'], away_keys, window=5, min_periods=3)

    # -------------------------------------------------------------------------
    # 6. Comeback Ability Features
    # -------------------------------------------------------------------------
    # A comeback is a match won at full time after trailing at half time.
    home_comeback = (df['FTR'] == 'H') & (df['HTR'] == 'A')
    away_comeback = (df['FTR'] == 'A') & (df['HTR'] == 'H')
    df['Home_comeback_pts_10'] = _prior_rolling(
        home_comeback, home_keys, window=10, min_periods=5, stat='sum')
    df['Away_comeback_pts_10'] = _prior_rolling(
        away_comeback, away_keys, window=10, min_periods=5, stat='sum')

    # -------------------------------------------------------------------------
    # 7. Head-to-Head Features
    # -------------------------------------------------------------------------
    df['h2h_goal_diff'] = df.apply(
        lambda row: get_h2h_diff(row['HomeTeam'], row['AwayTeam'], row['Date'], df),
        axis=1)

    return df


def get_h2h_diff(home, away, date, full_df):
    """Average goal difference of earlier meetings with the same home/away roles.

    Args:
        home: home team name.
        away: away team name.
        date: date of the match being described; only rows with a strictly
            earlier ``Date`` are used.
        full_df: DataFrame with Date, HomeTeam, AwayTeam, FTHG and FTAG columns.

    Returns:
        mean(FTHG) - mean(FTAG) over the earlier ``home`` vs ``away`` fixtures,
        or 0 when there are none.
    """
    past_matches = full_df[
        (full_df['Date'] < date) &
        (full_df['HomeTeam'] == home) &
        (full_df['AwayTeam'] == away)
    ]
    return (past_matches['FTHG'].mean() - past_matches['FTAG'].mean()) if len(past_matches) > 0 else 0

In [9]:
# Build every engineered column on the training frame.
train_engineered = create_features(train_raw)

# Columns left out of the candidate list: identifiers of the match date and
# season, full-time results and raw in-match statistics. The weather columns
# ('temperature', 'wind_speed', 'precipitation') are intentionally kept.
excluded_columns = {
    'Date', 'FTR', 'FTHG', 'FTAG', 'season',
    'HS', 'AS', 'HST', 'AST', 'HC', 'AC',
    'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
}
all_features = [
    column
    for column in train_engineered.columns
    if column not in excluded_columns
]

print(f"Total engineered features: {len(all_features)}")
print(all_features)

# Check temporal integrity by inspecting one match from the middle of the data
sample = train_engineered.iloc[1000]
print(f"\nFeature example for {sample['HomeTeam']} vs {sample['AwayTeam']}:")
print(f"Home 5-game goal avg: {sample['HomeTeam_goals_5avg']:.2f}")
print(f"Home shot conversion: {sample['Home_shot_conv_3']:.2%}")
print(f"Home comeback pts: {sample['Home_comeback_pts_10']}")



Total engineered features: 26
['HomeTeam', 'AwayTeam', 'HTHG', 'HTAG', 'HTR', 'Referee', 'temperature', 'wind_speed', 'precipitation', 'league_home_avg', 'league_away_avg', 'HomeTeam_goals_5avg', 'HomeTeam_win_pct', 'AwayTeam_goals_5avg', 'AwayTeam_win_pct', 'Home_clean_sheets_5', 'Home_goals_conceded_5avg', 'Away_clean_sheets_5', 'Away_goals_conceded_5avg', 'Home_shot_conv_3', 'Away_shot_conv_3', 'Home_avg_cards_5', 'Away_avg_cards_5', 'Home_comeback_pts_10', 'Away_comeback_pts_10', 'h2h_goal_diff']

Feature example for Toulouse vs Nantes:
Home 5-game goal avg: 1.20
Home shot conversion: 6.67%
Home comeback pts: 0.0


In [15]:
# %% [markdown]
"""
## Feature NA Handling
Filling missing values with appropriate defaults
"""
# Model input columns, grouped by theme. Rows still missing any of these after
# the fallback fill below are dropped.
features = [
    # Basic form
    'HomeTeam_goals_5avg', 'AwayTeam_goals_5avg',
    'HomeTeam_win_pct', 'AwayTeam_win_pct',
    'h2h_goal_diff',

    # Defensive stability
    'Home_goals_conceded_5avg', 'Away_goals_conceded_5avg',
    'Home_clean_sheets_5', 'Away_clean_sheets_5',

    # Attack efficiency
    'Home_shot_conv_3', 'Away_shot_conv_3',

    # Discipline
    'Home_avg_cards_5', 'Away_avg_cards_5',

    # Comeback ability
    'Home_comeback_pts_10', 'Away_comeback_pts_10',

    # Weather
    'temperature', 'wind_speed', 'precipitation'
]

# Fallback value per feature. The league-average entries are Series aligned
# with train_engineered's index, so each training row falls back to the average
# of its own season; every other entry is a scalar.
fill_values = {
    # Goals scored - league average for the same side
    'HomeTeam_goals_5avg': train_engineered['league_home_avg'],
    'AwayTeam_goals_5avg': train_engineered['league_away_avg'],

    # Goals conceded - league average of the OPPOSING side: the home team
    # concedes away goals (FTAG) and the away team concedes home goals (FTHG).
    'Home_goals_conceded_5avg': train_engineered['league_away_avg'],
    'Away_goals_conceded_5avg': train_engineered['league_home_avg'],

    # Clean sheets
    'Home_clean_sheets_5': 0,
    'Away_clean_sheets_5': 0,

    # Attack efficiency - overall goals-per-shot ratio of the training data
    'Home_shot_conv_3': train_engineered['FTHG'].mean() / (train_engineered['HS'].mean() + 0.001),
    'Away_shot_conv_3': train_engineered['FTAG'].mean() / (train_engineered['AS'].mean() + 0.001),

    # Win percentages
    'HomeTeam_win_pct': 0.5,
    'AwayTeam_win_pct': 0.5,

    # Discipline
    'Home_avg_cards_5': train_engineered['HY'].mean(),
    'Away_avg_cards_5': train_engineered['AY'].mean(),

    # Comeback ability
    'Home_comeback_pts_10': 0,
    'Away_comeback_pts_10': 0,

    # Head-to-head
    'h2h_goal_diff': 0
}

# Apply filling
for feature, fill_value in fill_values.items():
    if feature in train_engineered.columns:  # Safe check
        train_engineered[feature] = train_engineered[feature].fillna(fill_value)
    else:
        print(f"Warning: {feature} not found in dataframe")

# Final cleanup: drop rows that still miss a model input (only columns without
# a fallback, i.e. the weather columns, can still be missing here).
train_final = train_engineered.dropna(subset=features)
print(f"\nFinal dataset: {len(train_final)} matches")
print(f"First match: {train_final['Date'].min().date()}")
print(f"Last match: {train_final['Date'].max().date()}")


Final dataset: 1520 matches
First match: 2013-08-09
Last match: 2017-05-20


In [None]:
import joblib

# 1. Columns that are safe to persist: identifiers, model inputs and the target.
identifier_columns = ['Date', 'HomeTeam', 'AwayTeam', 'season']
target_column = 'FTR'
safe_columns = [*identifier_columns, *features, target_column]

# 2. Independent copy of the training rows restricted to those columns.
clean_data = train_final.loc[:, safe_columns].copy()

# 3. Feature scaling is currently disabled; kept for reference.
# scaler = StandardScaler()
# scaled_features = scaler.fit_transform(clean_data[features])
# clean_data[features] = scaled_features

# 4. Persist the full safe table and a features-only table.
clean_data.to_csv('Features/TrainingSet/matches_engineered_weather.csv', index=False)  # All safe data
feature_table = clean_data[features]
feature_table.to_csv('Features/TrainingSet/features_weather.csv', index=False)
# joblib.dump(scaler, 'Features/TrainingSet/feature_scaler.pkl')

# 5. Verification
print(f"\nSaved files with {len(clean_data)} matches")
print("Columns in final CSV:")
print(clean_data.columns.tolist())
print("\nSample row:")
preview_columns = ['Date', 'HomeTeam', 'AwayTeam', 'FTR'] + features[:3]
first_row = clean_data.iloc[0]
print(first_row[preview_columns].to_dict())


Saved files with 1520 matches
Columns in final CSV:
['Date', 'HomeTeam', 'AwayTeam', 'season', 'HomeTeam_goals_5avg', 'AwayTeam_goals_5avg', 'HomeTeam_win_pct', 'AwayTeam_win_pct', 'h2h_goal_diff', 'Home_goals_conceded_5avg', 'Away_goals_conceded_5avg', 'Home_clean_sheets_5', 'Away_clean_sheets_5', 'Home_shot_conv_3', 'Away_shot_conv_3', 'Home_avg_cards_5', 'Away_avg_cards_5', 'Home_comeback_pts_10', 'Away_comeback_pts_10', 'temperature', 'wind_speed', 'precipitation', 'FTR']

Sample row:
{'Date': Timestamp('2013-08-09 00:00:00'), 'HomeTeam': 'Montpellier', 'AwayTeam': 'Paris SG', 'FTR': 'D', 'HomeTeam_goals_5avg': 1.4157894736842105, 'AwayTeam_goals_5avg': 1.0394736842105263, 'HomeTeam_win_pct': 0.5}


In [None]:
# Load raw validation data
val_raw = pd.read_csv('Datasets/Processed/season-1718_with_weather.csv', parse_dates=['Date'])

# Add season marker
val_raw['season'] = 2017

# Apply the SAME feature engineering
val_engineered = create_features(val_raw)  # Uses same function as training

# Fill NAs using training league averages (not val averages!)
for feature in features:
    if feature in fill_values:  # Use the same fill values as training
        val_engineered[feature] = val_engineered[feature].fillna(fill_values[feature])

# Scale features (using TRAINING scaler)
# X_val = scaler.transform(val_engineered[features])
y_val = val_engineered['FTR']

# Save validation files
val_engineered.to_csv('Features/ValidationSet/matches_engineered_weather.csv', index=False)
pd.DataFrame(val_engineered[features], columns=features).to_csv('Features/ValidationSet/features_weather.csv', index=False)

print(f"Validation set: {len(val_engineered)} matches")

  val_raw = pd.read_csv('Datasets/Processed/season-1718_with_weather.csv', parse_dates=['Date'])


Validation set: 380 matches


In [None]:
# Load raw test data
test_raw = pd.read_csv('Datasets/Processed/season-1819_with_weather.csv', parse_dates=['Date'])

# Add season marker
test_raw['season'] = 2018

# Apply feature engineering
test_engineered = create_features(test_raw)

# Fill NAs (using training defaults)
for feature in features:
    if feature in fill_values:
        test_engineered[feature] = test_engineered[feature].fillna(fill_values[feature])

# Scale features
# X_test = scaler.transform(test_engineered[features])
y_test = test_engineered['FTR']

# Save test files
test_engineered.to_csv('Features/TestSet/matches_engineered_weather.csv', index=False)
pd.DataFrame(test_engineered[features], columns=features).to_csv('Features/TestSet/features_weather.csv', index=False)

print(f"Test set: {len(test_engineered)} matches")

  test_raw = pd.read_csv('Datasets/Processed/season-1819_with_weather.csv', parse_dates=['Date'])


Test set: 380 matches
