In [106]:
import pandas as pd
# Loading the CSV file
csv_file_path = 'final_all_euro_data_with_rankings_features.csv'
df = pd.read_csv(csv_file_path)

# Identifying numeric columns
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()


# Including the specified non-numeric columns
required_columns = numeric_columns + ['Home Team', 'Away Team', 'Full Time Result']

# Creating a new dataframe with only the required columns
filtered_df = df[required_columns]

# Drop 'full time home goals' and 'full time away goals' columns
columns_to_drop = ['AvgGoalsScored (Home)_Last5','AvgGoalsConceded (Away)_Last5','HomeWin','WinRate (Home)_Last5','Unnamed: 0','Full Time Home Goals','Full Time Away Goals','Half Time Home Goals','Half Time Away Goals']
filtered_df.drop(columns=columns_to_drop, errors='ignore', inplace=True) 


final_renamed_file_path = "filterd_all_euro_data_with_rankings_features.csv"
filtered_df.to_csv(final_renamed_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=columns_to_drop, errors='ignore', inplace=True)


In [107]:
# Display the first few rows of the new dataframe
filtered_df.head()

Unnamed: 0,Home Shots,Away Shots,Home Shots on Target,Away Shots on Target,Home Fouls,Away Fouls,Home Corners,Away Corners,Home Yellow Cards,Away Yellow Cards,...,Home Off-Def Interaction,Away Off-Def Interaction,Expected Dominance,Performance Differential,Home Dominance Score,Home Recent Form Index,Away Recent Form Index,Home Team,Away Team,Full Time Result
0,6,17,1,8,11,8,6,5,0.0,0,...,105.21875,130.55875,-0.6,-19.6,-0.6,-100.0,50.0,Burnley,Manchester City,A
1,15,6,7,2,12,12,8,3,2.0,2,...,93.775,104.04625,0.4,14.325,0.4,50.0,0.0,Arsenal,Nottingham Forest,H
2,14,16,5,3,9,14,10,4,1.0,4,...,144.7725,168.72,0.2,-1.2,0.2,-20.0,-30.0,AFC Bournemouth,West Brom,D
3,27,9,12,3,11,12,6,7,2.0,2,...,191.4,133.039375,0.0,11.9,0.0,20.0,-40.0,Brighton & Hove Albion,Athlone Town,H
4,19,9,9,2,12,6,10,4,0.0,2,...,123.4325,108.275,0.4,11.175,0.4,50.0,-30.0,Everton FC,Fulham,A


In [108]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Replace infinity values with NaN in the dataset
filtered_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Re-separating the target variable and features after replacing infinities
X = filtered_df.drop('Full Time Result', axis=1)
y = filtered_df['Full Time Result']

# Re-identifying categorical and numerical columns
categorical_cols = ['Home Team', 'Away Team']
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

# Update transformers for the preprocessing pipeline
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Update the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Update the model pipeline with RandomForestClassifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=0))
])

# Split the data into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the model with the updated pipeline
model.fit(X_train, y_train)

# Extracting feature importances from the RandomForest model
feature_importances = model.named_steps['model'].feature_importances_

# Getting feature names from the preprocessor
feature_names = numerical_cols + \
    list(model.named_steps['preprocessor'].named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols))

# Creating a DataFrame for better visualization of feature importances
features_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
features_df.sort_values(by='Importance', ascending=False, inplace=True)

# Displaying the top 20 features by importance
top_features_df = features_df.head(23)
top_features_df.head(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.replace([np.inf, -np.inf], np.nan, inplace=True)


Unnamed: 0,Feature,Importance
206,Defense_Attack_Interaction,0.021005
95,Diff_ShotsOnTarget,0.020709
219,Performance Differential,0.014146
192,Attack Strength Difference,0.013889
61,IWCA,0.009509
2,Home Shots on Target,0.009461
62,PSCH,0.009457
3,Away Shots on Target,0.009397
64,PSCA,0.009392
105,Pts/MP (Home),0.009282


In [109]:
# Selecting only the top 20 features for the model
top_features = top_features_df['Feature'].tolist()
X_top_features = filtered_df[top_features]

# Splitting the data with only top features into training and testing sets
X_train_top, X_test_top, y_train, y_test = train_test_split(X_top_features, y, test_size=0.2, random_state=0)

# Since some of the top features are categorical, we need to adjust the preprocessing steps
categorical_cols_top = [col for col in categorical_cols if col in top_features]
numerical_cols_top = [col for col in numerical_cols if col in top_features]

# Update the preprocessing pipeline for top features
preprocessor_top = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols_top),
        ('cat', categorical_transformer, categorical_cols_top)
    ])

# Update the model pipeline with only top features
model_top = Pipeline(steps=[
    ('preprocessor', preprocessor_top),
    ('model', RandomForestClassifier(random_state=0))
])

# Fit the model with top features
model_top.fit(X_train_top, y_train)

# Existing code for prediction
y_pred_top = model_top.predict(X_test_top)



# Getting prediction probabilities
y_pred_proba = model_top.predict_proba(X_test_top)

# Extracting the probabilities corresponding to the predicted class
confidence = np.max(y_pred_proba, axis=1)

# Optional: Combine predictions and their confidences in a DataFrame for better visualization
import pandas as pd
predictions_df = pd.DataFrame({
    'Predicted Label': y_pred_top,
    'Confidence': confidence
})

# Existing code for evaluation
accuracy_top = accuracy_score(y_test, y_pred_top)
report_top = classification_report(y_test, y_pred_top)

# You can return or print the predictions along with their confidence levels
predictions_df, accuracy_top, report_top


(    Predicted Label  Confidence
 0                 H        0.74
 1                 A        0.63
 2                 H        0.57
 3                 D        0.40
 4                 H        0.38
 ..              ...         ...
 188               H        0.71
 189               H        0.56
 190               H        0.63
 191               A        0.49
 192               H        0.84
 
 [193 rows x 2 columns],
 0.6787564766839378,
 '              precision    recall  f1-score   support\n\n           A       0.69      0.79      0.74        57\n           D       0.49      0.35      0.40        49\n           H       0.74      0.79      0.77        87\n\n    accuracy                           0.68       193\n   macro avg       0.64      0.64      0.64       193\nweighted avg       0.66      0.68      0.67       193\n')

In [110]:
test2=["Prob Away Win","Prob Home Win","Prob Draw"]
test1 = [col for col in filtered_df.columns if col in top_features or col in categorical_cols or col in test2]

In [111]:
test1

['Home Shots on Target',
 'Away Shots on Target',
 'Bet365 Away Win Odds',
 'Pinnacle Home Win Odds',
 'William Hill Home Win Odds',
 'Market Maximum Away Win Odds',
 'IWCA',
 'PSCH',
 'PSCA',
 'VCCH',
 'VCCA',
 'MaxCA',
 'AvgCA',
 'Diff_ShotsOnTarget',
 'D (Home)',
 'Pts/MP (Home)',
 'W (Away)',
 'D (Away)',
 'Pts/MP (Away)',
 'Attack Strength Difference',
 'Defense_Attack_Interaction',
 'Prob Home Win',
 'Prob Away Win',
 'Prob Draw',
 'Expected Dominance',
 'Performance Differential',
 'Home Team',
 'Away Team']

In [112]:
import joblib
X_top_features = filtered_df[test1]

# Splitting the data with only top features into training and testing sets
X_train_top, X_test_top, y_train, y_test = train_test_split(X_top_features, y, test_size=0.2, random_state=0)

# Since some of the top features are categorical, we need to adjust the preprocessing steps
categorical_cols_top = [col for col in categorical_cols if col in test1]
numerical_cols_top = [col for col in numerical_cols if col in test1]

# Update the preprocessing pipeline for top features
preprocessor_top = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols_top),
        ('cat', categorical_transformer, categorical_cols_top)
    ])

# Update the model pipeline with only top features
model_top = Pipeline(steps=[
    ('preprocessor', preprocessor_top),
    ('model', RandomForestClassifier(random_state=0))
])

# Fit the model with top features
model_top.fit(X_train_top, y_train)

# Existing code for prediction
y_pred_top = model_top.predict(X_test_top)



# Getting prediction probabilities
y_pred_proba = model_top.predict_proba(X_test_top)

# Extracting the probabilities for each outcome
prob_home_win = y_pred_proba[:, 2]*100
prob_away_win = y_pred_proba[:, 0]*100
prob_draw = y_pred_proba[:, 1]*100

# Creating DataFrame with predictions and their confidences
import pandas as pd
predictions_df = pd.DataFrame({
    'Home Team': X_test_top['Home Team'],
    'Away Team': X_test_top['Away Team'],
    'Predicted Label': y_pred_top,
    'Probablity of Home Win %': prob_home_win,
    'Probablity of Away Win %': prob_away_win,
    'Probablity of Draw %': prob_draw,
    'Points per match of home team': X_test_top['Pts/MP (Home)'],
    'Points per match of away team': X_test_top['Pts/MP (Away)'],
    'AvgCA':X_test_top['AvgCA']


    
})

# Existing code for evaluation
accuracy_top = accuracy_score(y_test, y_pred_top)
report_top = classification_report(y_test, y_pred_top)

# Display the DataFrame
predictions_df,accuracy_top, report_top

# Save the model
joblib.dump(model_top, 'football_prediction_model.pkl')

['football_prediction_model.pkl']

In [113]:
import pandas as pd

scoreline_buckets = {
    2: [  # 2 represents Home Win
        ((30, 60), ['2-0', '3-1', '1-0']),
        ((60, 80), ['3-0', '4-1', '2-1']),
        ((80, 100), ['4-0', '5-1', '3-1'])
    ],
    0: [  # 0 represents Away Win
        ((30, 60), ['0-2', '1-3', '0-1']),
        ((60, 80), ['0-3', '1-4', '1-2']),
        ((80, 100), ['0-4', '1-5', '1-3'])
    ],
    1: [  # 1 represents Draw
        ((30, 60), ['0-0', '1-1', '2-2']),
        ((60, 80), ['1-1', '2-2', '3-3']),
        ((80, 100), ['2-2', '3-3', '4-4'])
    ]
}


# Adjusted function to determine multiple scorelines
def get_estimated_scorelines(label, home_prob, away_prob, draw_prob):
    if label == 2:  # Home Win
        prob = home_prob
    elif label == 0:  # Away Win
        prob = away_prob
    else:  # Draw
        prob = draw_prob

    for (start, end), scorelines in scoreline_buckets[label]:
        if start <= prob <= end:
            return ', '.join(scorelines)
    return 'No estimate'  # Return this if no range matches
# Assuming 'H' = 2, 'A' = 0, 'D' = 1
predictions_df['Predicted Label'] = predictions_df['Predicted Label'].map({'H': 2, 'A': 0, 'D': 1})
# Apply the function to the DataFrame
predictions_df['Estimated Scorelines'] = predictions_df.apply(
    lambda row: get_estimated_scorelines(row['Predicted Label'], row['Probablity of Home Win %'], row['Probablity of Away Win %'], row['Probablity of Draw %']), 
    axis=1
)

# Display the DataFrame with estimated scorelines
print(predictions_df)


               Home Team            Away Team  Predicted Label  \
753          KV Mechelen   Sporting Charleroi                2   
278           Hoffenheim     Bayer Leverkusen                0   
14     Tottenham Hotspur    Manchester United                2   
679                Eupen  Oud-Heverlee Leuven                2   
338           Villarreal              Almería                2   
..                   ...                  ...              ...   
597               Monaco            Marseille                2   
712  Oud-Heverlee Leuven       Standard Liège                2   
720             Westerlo             Kortrijk                2   
18        Crystal Palace              Arsenal                0   
390            Barcelona      Athletic Bilbao                2   

     Probablity of Home Win %  Probablity of Away Win %  Probablity of Draw %  \
753                      69.0                       6.0                  25.0   
278                      20.0                

In [56]:
predictions_df.columns 

Index(['Home Team', 'Away Team', 'Predicted Label', 'Probablity of Home Win %',
       'Probablity of Away Win %', 'Probablity of Draw %',
       'Points per match of home team', 'Points per match of away team',
       'Estimated Scorelines'],
      dtype='object')

In [114]:
predictions_df.to_csv("Results.csv")

In [115]:
csv_file_path = 'Results.csv'
df_results = pd.read_csv(csv_file_path)

In [116]:
df_results.head()

Unnamed: 0.1,Unnamed: 0,Home Team,Away Team,Predicted Label,Probablity of Home Win %,Probablity of Away Win %,Probablity of Draw %,Points per match of home team,Points per match of away team,AvgCA,Estimated Scorelines
0,753,KV Mechelen,Sporting Charleroi,2,69.0,6.0,25.0,1.07,0.93,2.7,"3-0, 4-1, 2-1"
1,278,Hoffenheim,Bayer Leverkusen,0,20.0,64.0,16.0,1.73,2.82,1.53,"0-3, 1-4, 1-2"
2,14,Tottenham Hotspur,Manchester United,2,46.0,36.0,18.0,2.17,1.75,2.25,"2-0, 3-1, 1-0"
3,679,Eupen,Oud-Heverlee Leuven,2,43.0,24.0,33.0,1.0,0.86,2.42,"2-0, 3-1, 1-0"
4,338,Villarreal,Almería,2,44.0,30.0,26.0,0.92,0.23,5.78,"2-0, 3-1, 1-0"


In [117]:
label_mapping = {2: 'H', 0: 'A', 1: 'D'}
df_results['Predicted Resultes'] = df_results['Predicted Label'].map(label_mapping)

In [118]:
df_results = df_results.drop(columns=['Predicted Label'])

In [119]:
print(df_results)

     Unnamed: 0            Home Team            Away Team  \
0           753          KV Mechelen   Sporting Charleroi   
1           278           Hoffenheim     Bayer Leverkusen   
2            14    Tottenham Hotspur    Manchester United   
3           679                Eupen  Oud-Heverlee Leuven   
4           338           Villarreal              Almería   
..          ...                  ...                  ...   
188         597               Monaco            Marseille   
189         712  Oud-Heverlee Leuven       Standard Liège   
190         720             Westerlo             Kortrijk   
191          18       Crystal Palace              Arsenal   
192         390            Barcelona      Athletic Bilbao   

     Probablity of Home Win %  Probablity of Away Win %  Probablity of Draw %  \
0                        69.0                       6.0                  25.0   
1                        20.0                      64.0                  16.0   
2                       

In [120]:
df_results.to_csv("Results_final.csv")

In [17]:
X_test_top.columns

Index(['Home Shots on Target', 'Away Shots on Target', 'Bet365 Away Win Odds',
       'Pinnacle Home Win Odds', 'William Hill Home Win Odds',
       'Market Maximum Away Win Odds', 'IWCA', 'PSCH', 'PSCA', 'VCCH', 'VCCA',
       'MaxCA', 'AvgCA', 'Diff_ShotsOnTarget', 'D (Home)', 'Pts/MP (Home)',
       'W (Away)', 'D (Away)', 'Pts/MP (Away)', 'Attack Strength Difference',
       'Defense_Attack_Interaction', 'Prob Home Win', 'Prob Away Win',
       'Prob Draw', 'Expected Dominance', 'Performance Differential',
       'Home Team', 'Away Team'],
      dtype='object')

In [11]:
model_top

In [14]:
X_top_features.to_csv("X_top_features.csv")

In [125]:
import pandas as pd
def get_match_data(home_team, away_team,X_top_features ):
  
    # Filter the data for the given home and away team
    match_data = X_top_features[(X_top_features['Home Team'] == home_team) & (X_top_features['Away Team'] == away_team)]
    

    # Existing code for prediction
    y_pred_top = model_top.predict(match_data)


    # Getting prediction probabilities
    y_pred_proba = model_top.predict_proba(match_data)
    # Extracting the probabilities for each outcome
    prob_home_win = y_pred_proba[:, 2]*100
    prob_away_win = y_pred_proba[:, 0]*100
    prob_draw = y_pred_proba[:, 1]*100
    
    # Creating DataFrame with predictions and their confidences

    predictions_df = pd.DataFrame({
        'Home Team': match_data['Home Team'],
        'Away Team': match_data['Away Team'],
        'Predicted Label': y_pred_top,
        'Probablity of Home Win %': prob_home_win,
        'Probablity of Away Win %': prob_away_win,
        'Probablity of Draw %': prob_draw,
        'Points per match of home team': match_data['Pts/MP (Home)'],
        'Points per match of away team': match_data['Pts/MP (Away)'],
        

        
    })

    
    
    scoreline_buckets = {
    2: [  # 2 represents Home Win
        ((40, 60), ['2-0', '2-1', '1-0']),
        ((60, 80), ['3-0', '4-1', '3-1']),
        ((80, 100), ['4-0', '2-0', '4-2'])
    ],
    0: [  # 0 represents Away Win
        ((40, 60), ['0-2', '1-2', '0-1']),
        ((60, 80), ['0-3', '1-4', '1-2']),
        ((80, 100), ['0-4', '0-3', '2-4'])
    ],
    1: [  # 1 represents Draw
        ((40, 60), ['0-0', '1-1', '2-2']),
        ((60, 80), ['1-1', '2-2', '3-3']),
        ((80, 100), ['2-2', '3-3', '1-1'])
    ]
    }


    # Adjusted function to determine multiple scorelines
    def get_estimated_scorelines(label, home_prob, away_prob, draw_prob):
        if label == 2:  # Home Win
            prob = home_prob
        elif label == 0:  # Away Win
            prob = away_prob
        else:  # Draw
            prob = draw_prob

        for (start, end), scorelines in scoreline_buckets[label]:
            if start <= prob <= end:
                return ', '.join(scorelines)
        return 'No estimate'  # Return this if no range matches
    # Assuming 'H' = 2, 'A' = 0, 'D' = 1
    predictions_df['Predicted Label'] = predictions_df['Predicted Label'].map({'H': 2, 'A': 0, 'D': 1})
    # Apply the function to the DataFrame
    predictions_df['Estimated Scorelines'] = predictions_df.apply(
        lambda row: get_estimated_scorelines(row['Predicted Label'], row['Probablity of Home Win %'], row['Probablity of Away Win %'], row['Probablity of Draw %']), 
        axis=1
    )
    predictions_df['Predicted Label'] = predictions_df['Predicted Label'].map({2: 'H', 0:'A', 1:'D'})
    print(predictions_df)

# Example usage of the function
example_home_team = "Hoffenheim"
example_away_team = "Bayer Leverkusen"
get_match_data(example_home_team, example_away_team, X_top_features)


      Home Team         Away Team Predicted Label  Probablity of Home Win %  \
278  Hoffenheim  Bayer Leverkusen               A                      20.0   

     Probablity of Away Win %  Probablity of Draw %  \
278                      64.0                  16.0   

     Points per match of home team  Points per match of away team  \
278                           1.73                           2.82   

    Estimated Scorelines  
278        0-3, 1-4, 1-2  
