 ## Step 3: Prediction Model (WITH PCA features)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler, PCA, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType, ArrayType, IntegerType
import pyspark.sql.functions as F

# For scikit-learn models (we'll use these with Spark)
from sklearn.preprocessing import StandardScaler as SklearnStandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

# Initialize Spark Session
# getOrCreate() returns the already-active session when one exists (as in a
# hosted notebook) and only builds a new "FootballPrediction" session otherwise.
spark = SparkSession.builder.appName("FootballPrediction").getOrCreate()


In [None]:
# Read the data
# spark.read.csv already returns a Spark DataFrame, so it is used directly.
# Feeding that result to spark.createDataFrame() fails ("data is already a
# DataFrame"): that helper only accepts RDDs, local collections or pandas
# DataFrames.
df = spark.read.csv("/FileStore/df.csv", header=True, inferSchema=True, sep=';')


 ## Model

 ### Preprocessing

In [None]:
# The model is trained on the first six principal components only (PC1..PC6)
selected_features = [f"PC{k}" for k in range(1, 7)]

# Pack the selected columns into one vector column named "features"
assembler = VectorAssembler(outputCol="features", inputCols=selected_features)

# Standardize that vector: subtract the mean and divide by the standard deviation
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# Quick overview of the dataset: its size and the balance of the target column
n_rows = df.count()
n_cols = len(df.columns)
print(f"Dataset shape: {n_rows} rows, {n_cols} columns")
df.groupBy("HomeWin").count().show()


In [None]:
# 80/20 train/test split with a fixed seed
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Preprocessing = assemble the feature vector, then scale it.
# The pipeline is fitted on the training split only, so the scaler statistics
# never see the test rows.
preprocessing_pipeline = Pipeline(stages=[assembler, scaler])
preprocessing_model = preprocessing_pipeline.fit(train_df)

# Apply the fitted preprocessing to both splits
train_processed = preprocessing_model.transform(train_df)
test_processed = preprocessing_model.transform(test_df)

# Both processed splits are reused by every model below, so keep them cached
train_processed.cache()
test_processed.cache()

# Local pandas copies for the scikit-learn models and the betting simulation
pandas_columns = ["scaledFeatures", "HomeWin", "B365H", "B365H_prob"]
train_pd = train_processed.select(*pandas_columns).toPandas()
test_pd = test_processed.select(*pandas_columns).toPandas()


def _as_matrix(vectors):
    """Stack a column of Spark ML vectors into a 2-D NumPy array."""
    return np.array([vector.toArray() for vector in vectors])


# Feature matrices and label arrays
X_train = _as_matrix(train_pd["scaledFeatures"])
y_train = train_pd["HomeWin"].values
X_test = _as_matrix(test_pd["scaledFeatures"])
y_test = test_pd["HomeWin"].values


 ### Machine Learning Models

 Train and evaluate multiple machine learning models

 We will test 5 different models:

 - Logistic Regression: A linear baseline classifier

 - Decision Tree: A simple tree-based model

 - Random Forest: An ensemble of decision trees

 - Support Vector Machine (SVM): A powerful classifier that works well with scaled data

 - K-Nearest Neighbors (KNN): A distance-based classifier





In [None]:
# Column wiring shared by every Spark estimator
spark_columns = {"featuresCol": "scaledFeatures", "labelCol": "HomeWin"}

# Spark ML estimators
spark_lr = LogisticRegression(maxIter=100, **spark_columns)
spark_dt = DecisionTreeClassifier(**spark_columns)
spark_rf = RandomForestClassifier(numTrees=100, subsamplingRate=0.8, **spark_columns)

# scikit-learn estimators (trained on the NumPy copies of the data)
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=10)

# Spark evaluators: one for ROC-based scoring, one for label-based metrics
binary_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction", labelCol="HomeWin")
multi_evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="HomeWin")

# Parallel lists collecting one entry per evaluated model, plus the fitted models
model_names, accuracies, precisions = [], [], []
recalls, f1_scores, auc_scores = [], [], []
trained_models = {}

# Function to evaluate Spark models
def evaluate_spark_model(model, name):
    """Fit a Spark ML classifier, score it on the test split and record the metrics.

    Reads the module-level ``train_processed`` / ``test_processed`` DataFrames and
    the shared ``multi_evaluator`` / ``binary_evaluator``; appends one entry to
    each metric list and stores the fitted model in ``trained_models[name]``.

    Precision, recall and F1 are computed for the positive class (label 1.0,
    i.e. a home win) so they are directly comparable with the scikit-learn
    metrics produced by ``evaluate_sklearn_model``, which also score the
    positive class. The previous class-weighted averages were not comparable
    with those values in the shared results table.

    Args:
        model: Unfitted Spark ML estimator.
        name: Display name used as the key in the results.

    Returns:
        Tuple ``(fitted_model, predictions)`` where ``predictions`` is the
        transformed test DataFrame.
    """
    model_fitted = model.fit(train_processed)
    predictions = model_fitted.transform(test_processed)

    def label_metric(metric_name):
        # Per-call param overrides are evaluated on a copy, so the shared
        # evaluator is left untouched (no setMetricName side effects).
        overrides = {
            multi_evaluator.metricName: metric_name,
            multi_evaluator.metricLabel: 1.0,  # only used by the *ByLabel metrics
        }
        return multi_evaluator.evaluate(predictions, overrides)

    # Calculate metrics
    accuracy = label_metric("accuracy")
    precision = label_metric("precisionByLabel")
    recall = label_metric("recallByLabel")
    f1 = label_metric("fMeasureByLabel")
    auc_score = binary_evaluator.evaluate(predictions)

    # Store metrics
    model_names.append(name)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    auc_scores.append(auc_score)
    trained_models[name] = model_fitted

    return model_fitted, predictions

# Function to evaluate scikit-learn models
def evaluate_sklearn_model(model, name):
    """Fit a scikit-learn classifier on the training arrays and record its test metrics.

    Uses the module-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``
    arrays, appends one entry to every metric list and stores the fitted model
    in ``trained_models[name]``.

    Args:
        model: scikit-learn estimator (fitted in place).
        name: Display name used as the key in the results.

    Returns:
        Tuple ``(model, y_probs)`` where ``y_probs`` holds the positive-class
        scores used for the ROC curve.
    """
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)

    # Positive-class scores: fall back to the decision function for
    # estimators that do not expose probabilities.
    if not hasattr(model, "predict_proba"):
        y_probs = model.decision_function(X_test)
    else:
        y_probs = model.predict_proba(X_test)[:, 1]

    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_probs)

    # Pair every result list with the value that belongs in it
    recorded = (
        (accuracies, accuracy_score(y_test, predicted_labels)),
        (precisions, precision_score(y_test, predicted_labels)),
        (recalls, recall_score(y_test, predicted_labels)),
        (f1_scores, f1_score(y_test, predicted_labels)),
        (auc_scores, auc(false_pos_rate, true_pos_rate)),
    )

    model_names.append(name)
    for bucket, value in recorded:
        bucket.append(value)
    trained_models[name] = model

    return model, y_probs

# Fit and score the three Spark estimators
lr_model, lr_preds = evaluate_spark_model(spark_lr, "Logistic Regression")
dt_model, dt_preds = evaluate_spark_model(spark_dt, "Decision Tree")
rf_model, rf_preds = evaluate_spark_model(spark_rf, "Random Forest")

# Fit and score the two scikit-learn estimators
svm_fitted, svm_probs = evaluate_sklearn_model(svm_model, "SVM")
knn_fitted, knn_probs = evaluate_sklearn_model(knn_model, "KNN")

# Collect everything gathered so far into one table (one row per model)
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']
metric_values = [accuracies, precisions, recalls, f1_scores, auc_scores]
results_df = pd.DataFrame(
    {'Model': model_names, **dict(zip(metric_columns, metric_values))}
)

# Colour-graded table with every metric shown to four decimals
four_decimals = {column: '{:.4f}' for column in metric_columns}
styled_df = (
    results_df.style
    .background_gradient(cmap='RdYlGn')
    .format(four_decimals)
    .set_caption('Model Performance Comparison')
)

display(styled_df)


In [None]:
# Define functions to get probabilities from Spark models
def get_spark_model_probs(model_name):
    """Return the positive-class probability of every test row for a fitted Spark model.

    The original implementation branched on the model name but both branches
    were identical (and one was mis-commented as using the prediction column);
    this version reads the ``probability`` vector column directly, whatever
    the model.

    Args:
        model_name: Key of the fitted model in ``trained_models``.

    Returns:
        1-D NumPy array with the second entry (class 1) of each row's
        ``probability`` vector, in the row order returned by ``toPandas()``.

    Raises:
        KeyError: If ``model_name`` has not been stored in ``trained_models``.
    """
    model = trained_models[model_name]
    predictions = model.transform(test_processed)
    # Only the probability column is needed; index 1 is the positive class.
    probs_df = predictions.select("probability").toPandas()
    return np.array([p[1] for p in probs_df["probability"]])

# Positive-class probabilities from the two Spark models plotted below
rf_probs = get_spark_model_probs("Random Forest")
lr_probs = get_spark_model_probs("Logistic Regression")

# Bookmaker baseline: Bet365's implied home-win probability
bet365_probs = test_pd['B365H_prob'].values

# ROC curve points for every scorer
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_probs)
bet365_fpr, bet365_tpr, _ = roc_curve(y_test, bet365_probs)

# Area under each curve
rf_auc = auc(rf_fpr, rf_tpr)
lr_auc = auc(lr_fpr, lr_tpr)
svm_auc = auc(svm_fpr, svm_tpr)
bet365_auc = auc(bet365_fpr, bet365_tpr)

# Model curves drawn with the default colour cycle, in this order
model_curves = [
    ('Random Forest', rf_fpr, rf_tpr, rf_auc),
    ('Logistic Regression', lr_fpr, lr_tpr, lr_auc),
    ('SVM', svm_fpr, svm_tpr, svm_auc),
]

plt.figure(figsize=(10, 6))
for curve_name, curve_fpr, curve_tpr, curve_auc in model_curves:
    plt.plot(curve_fpr, curve_tpr, label=f'{curve_name} (AUC = {curve_auc:.3f})')
# The bookmaker baseline stands out as a thick black line
plt.plot(bet365_fpr, bet365_tpr, color='black', linestyle='-', linewidth=2,
         label=f'Bet365 Implied (AUC = {bet365_auc:.3f})')
# Diagonal = a classifier with no skill
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(True)
plt.show()

# Same AUC values as text
print("\nAUC Scores:")
for curve_name, curve_fpr, curve_tpr, curve_auc in model_curves:
    print(f"{curve_name}: {curve_auc:.4f}")
print(f"Bet365 Implied: {bet365_auc:.4f}")


 ### Deep Learning Models

In [None]:
# Hyper-parameters of the multi-layer perceptron
mlp_settings = dict(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 then 50 neurons
    activation='relu',             # ReLU activations
    solver='adam',                 # Adam optimizer
    alpha=0.0001,                  # L2 regularization strength
    batch_size='auto',             # let scikit-learn pick the batch size
    learning_rate='adaptive',      # adaptive learning-rate schedule
    max_iter=1000,                 # upper bound on training iterations
    early_stopping=True,           # stop when the validation score stalls
    validation_fraction=0.1,       # 10% of the training data held out for that
    n_iter_no_change=10,           # patience of the early-stopping rule
    random_state=42,               # reproducibility
)
mlp = MLPClassifier(**mlp_settings)

# Fit on the scaled training features
mlp.fit(X_train, y_train)

# Hard labels and positive-class probabilities on the test set
mlp_pred = mlp.predict(X_test)
mlp_probs = mlp.predict_proba(X_test)[:, 1]

# Label-based metrics
mlp_accuracy = accuracy_score(y_test, mlp_pred)
mlp_precision = precision_score(y_test, mlp_pred)
mlp_recall = recall_score(y_test, mlp_pred)
mlp_f1 = f1_score(y_test, mlp_pred)

# ROC curve and the area under it
mlp_fpr, mlp_tpr, _ = roc_curve(y_test, mlp_probs)
mlp_auc = auc(mlp_fpr, mlp_tpr)

# Keep the fitted network alongside the other trained models
trained_models['Neural Network'] = mlp

# Turn Bet365's implied probabilities into hard predictions.
# The cut-off is the candidate threshold with the highest F1 score; ties go to
# the smallest threshold, and 0.5 is kept if no candidate scores above zero.
# NOTE(review): the threshold is tuned on the test labels themselves, so the
# Bet365 row is an optimistic baseline - confirm this is intended.
thresholds = np.arange(0.1, 0.9, 0.01)
f1_per_threshold = [
    f1_score(y_test, (bet365_probs >= candidate).astype(int))
    for candidate in thresholds
]
top_position = int(np.argmax(f1_per_threshold))  # first maximum
if f1_per_threshold[top_position] > 0:
    best_f1 = f1_per_threshold[top_position]
    best_threshold = thresholds[top_position]
else:
    best_f1 = 0
    best_threshold = 0.5  # default threshold

# Metrics of the bookmaker baseline at the selected threshold
bet365_pred = (bet365_probs >= best_threshold).astype(int)
bet365_accuracy = accuracy_score(y_test, bet365_pred)
bet365_precision = precision_score(y_test, bet365_pred)
bet365_recall = recall_score(y_test, bet365_pred)
bet365_f1 = f1_score(y_test, bet365_pred)

# Extend the earlier metric lists with the neural network and the Bet365 baseline
extra_rows = {
    'Model': ['Neural Network', 'Bet365 Implied'],
    'Accuracy': [mlp_accuracy, bet365_accuracy],
    'Precision': [mlp_precision, bet365_precision],
    'Recall': [mlp_recall, bet365_recall],
    'F1 Score': [mlp_f1, bet365_f1],
    'AUC': [mlp_auc, bet365_auc],
}
base_rows = {
    'Model': model_names,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1 Score': f1_scores,
    'AUC': auc_scores,
}
# List concatenation builds new lists, so the original metric lists stay unchanged
results_df_with_all = pd.DataFrame(
    {column: base_rows[column] + extra_rows[column] for column in base_rows}
)

# Colour-graded table with every metric shown to four decimals
numeric_format = {column: '{:.4f}' for column in base_rows if column != 'Model'}
styled_df_with_all = (
    results_df_with_all.style
    .background_gradient(cmap='RdYlGn')
    .format(numeric_format)
    .set_caption('Model Performance Comparison (Including Neural Network and Bet365)')
)

display(styled_df_with_all)

# Add MLP to the ROC curve comparison
# Classic ML models are drawn in distinct shades of blue, the neural network in
# red and the bookmaker baseline in black. Random Forest and SVM previously
# shared '#6495ED', which made their curves and legend entries
# indistinguishable; SVM now uses its own darker blue.
plt.figure(figsize=(10, 6))
plt.plot(rf_fpr, rf_tpr, color='#6495ED', label=f'Random Forest (AUC = {rf_auc:.3f})')
plt.plot(lr_fpr, lr_tpr, color='#4169E1', label=f'Logistic Regression (AUC = {lr_auc:.3f})')
plt.plot(svm_fpr, svm_tpr, color='#00008B', label=f'SVM (AUC = {svm_auc:.3f})')
plt.plot(mlp_fpr, mlp_tpr, color='red', label=f'Neural Network (AUC = {mlp_auc:.3f})')
plt.plot(bet365_fpr, bet365_tpr, color='black', linestyle='-', linewidth=2, 
         label=f'Bet365 Implied (AUC = {bet365_auc:.3f})')
# Diagonal = a classifier with no skill
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(True)
plt.show()

# Print AUC scores
print("\nAUC Scores:")
print(f"Random Forest: {rf_auc:.4f}")
print(f"Logistic Regression: {lr_auc:.4f}")
print(f"SVM: {svm_auc:.4f}")
print(f"Neural Network: {mlp_auc:.4f}")
print(f"Bet365 Implied: {bet365_auc:.4f}")


 ### Extra: Using the model predictions to build a betting strategy

 We place a bet on the Home Team Win whenever the model probability is greater than 0.5 and also greater than the Bet365 implied probability multiplied by (1 + a minimum margin).

In [None]:
# Betting Strategy Simulation
# Runs sequentially: every bet updates the bankroll the next one starts from.

# Relative edge over the bookmaker's implied probability required to bet
min_bet_margin = 0.05

# Bet365 home-win odds for the test matches
bet365_odds = test_pd['B365H'].values

# One seed per random-selection baseline
random_seeds = [42, 123, 456]

# Every strategy starts from the same zeroed counters
strategy_names = [
    'Random Forest',
    'Logistic Regression',
    'SVM',
    'Neural Network',
    'Random Selection 1',
    'Random Selection 2',
    'Random Selection 3',
]
betting_results = {
    strategy: {'bets': 0, 'wins': 0, 'losses': 0, 'profit': 0, 'roi': 0}
    for strategy in strategy_names
}

# Money management
initial_bankroll = 1000  # starting bankroll in $
fixed_stake = 10         # flat stake in $ placed on every bet

# Helper function to process a bet regardless of strategy
def process_bet(model_name, index, bankroll):
    """Settle one fixed-stake home-win bet and update the strategy counters.

    Args:
        model_name: Key of the strategy in ``betting_results``.
        index: Position of the match in ``y_test`` / ``bet365_odds``.
        bankroll: Bankroll before the bet.

    Returns:
        Tuple ``(new_bankroll, record)`` where ``record`` is
        ``(index, stake, profit_or_loss, new_bankroll)``.
    """
    counters = betting_results[model_name]
    counters['bets'] += 1
    stake = fixed_stake

    if y_test[index] == 1:
        # Home win: net gain is the stake times (decimal odds - 1)
        outcome = stake * (bet365_odds[index] - 1)
        bankroll += outcome
        counters['wins'] += 1
    else:
        # Any other result: the stake is lost
        outcome = -stake
        bankroll -= stake
        counters['losses'] += 1

    return bankroll, (index, stake, outcome, bankroll)

# Function to simulate betting for ML models
def simulate_betting(model_probs, model_name):
    """Replay the test matches in order, betting where the model sees enough edge.

    A bet is placed when the model's home-win probability exceeds both 0.5 and
    the Bet365 implied probability scaled by ``1 + min_bet_margin``.

    Args:
        model_probs: Home-win probabilities aligned with ``y_test``.
        model_name: Key of the strategy in ``betting_results``.

    Returns:
        List of bet records as produced by ``process_bet``; profit and ROI are
        written to ``betting_results[model_name]``.
    """
    bet_history = []
    bankroll = initial_bankroll
    required_edge = 1 + min_bet_margin

    for i in range(len(y_test)):
        model_prob = model_probs[i]
        has_value = model_prob > bet365_probs[i] * required_edge and model_prob > 0.5
        if not has_value:
            continue
        bankroll, bet_record = process_bet(model_name, i, bankroll)
        bet_history.append(bet_record)

    # Profit in $ and return relative to the starting bankroll in %
    summary = betting_results[model_name]
    summary['profit'] = bankroll - initial_bankroll
    summary['roi'] = (bankroll / initial_bankroll - 1) * 100

    return bet_history

# Function to simulate random betting strategy
def simulate_random_betting(model_name, seed=42):
    """Bet on a random sixth of the test matches as a no-skill baseline.

    Args:
        model_name: Key of the strategy in ``betting_results``.
        seed: Seed applied to NumPy's global random generator before sampling.

    Returns:
        List of bet records as produced by ``process_bet``; profit and ROI are
        written to ``betting_results[model_name]``.
    """
    # Sample the matches to bet on, without repetition
    np.random.seed(seed)
    n_matches = len(y_test)
    bet_indices = np.random.choice(n_matches, size=n_matches // 6, replace=False)

    bankroll = initial_bankroll
    bet_history = []
    for match_index in bet_indices:
        bankroll, bet_record = process_bet(model_name, match_index, bankroll)
        bet_history.append(bet_record)

    # Profit in $ and return relative to the starting bankroll in %
    summary = betting_results[model_name]
    summary['profit'] = bankroll - initial_bankroll
    summary['roi'] = (bankroll / initial_bankroll - 1) * 100

    return bet_history

# Simulate every model-driven strategy (same order as before)
strategy_inputs = [
    ('Random Forest', rf_probs),
    ('Logistic Regression', lr_probs),
    ('SVM', svm_probs),
    ('Neural Network', mlp_probs),
]
model_histories = {
    strategy: simulate_betting(strategy_probs, strategy)
    for strategy, strategy_probs in strategy_inputs
}

# Simulate the random baselines, one per seed, named "Random Selection 1..3"
random_bet_histories = [
    simulate_random_betting(f'Random Selection {number}', seed=baseline_seed)
    for number, baseline_seed in enumerate(random_seeds, start=1)
]

# Text summary of every strategy
print("\nBetting Strategy Results:")
for model, results in betting_results.items():
    print(f"\n{model}:")
    if results['bets'] > 0:
        # Win rate is only defined when at least one bet was placed
        win_pct = results['wins'] / results['bets'] * 100
        win_rate_str = f"Wins: {results['wins']} ({win_pct:.2f}% win rate)"
    else:
        win_rate_str = "Wins: 0 (0.00% win rate)"
    print(win_rate_str)
    print(f"Profit: ${results['profit']:.2f}")
    print(f"ROI: {results['roi']:.2f}%")

# Bankroll evolution of every strategy, bet by bet
plt.figure(figsize=(12, 6))

# Colour scheme
orange_shades = ['#ff7f0e', '#FFA500', '#FF8C00']  # classic ML models
red_color = '#d62728'                              # deep learning model
purple_shades = ['#9467bd', '#8674A1', '#7B68EE']  # random baselines

# Model-driven strategies (strategies that never placed a bet are skipped)
for i, (model_name, history) in enumerate(model_histories.items()):
    if not history:
        continue
    is_network = model_name == 'Neural Network'
    color = red_color if is_network else orange_shades[i % len(orange_shades)]
    # Entry 3 of each bet record is the bankroll after that bet
    bankroll = [initial_bankroll, *(bet[3] for bet in history)]
    plt.plot(range(len(bankroll)), bankroll, label=model_name, color=color)

# Random baselines, drawn dashed
for i, history in enumerate(random_bet_histories):
    if not history:
        continue
    random_bankroll = [initial_bankroll, *(bet[3] for bet in history)]
    plt.plot(
        range(len(random_bankroll)),
        random_bankroll,
        label=f'Random Selection {i+1}',
        color=purple_shades[i],
        linestyle='--',
    )

# Reference line at the starting bankroll
plt.axhline(y=initial_bankroll, color='green', linestyle='--', label='Initial Bankroll')
plt.xlabel('Number of Bets')
plt.ylabel('Bankroll ($)')
plt.title('Bankroll Evolution')
plt.legend()
plt.grid(True)
plt.show()

# Tabular summary of the betting results (one row per strategy)
def _strategy_type(strategy):
    """Map a strategy name to its family label."""
    if 'Random Selection' in strategy:
        return 'Random Selection'
    if strategy == 'Neural Network':
        return 'Deep Learning'
    return 'Machine Learning'


def _win_rate(stats):
    """Win rate in percent, or 0 when no bet was placed."""
    return stats['wins'] / stats['bets'] * 100 if stats['bets'] > 0 else 0


results_table = {
    'Model Type': [_strategy_type(strategy) for strategy in betting_results],
    'Model': list(betting_results),
    'Total Bets': [stats['bets'] for stats in betting_results.values()],
    'Win Rate (%)': [_win_rate(stats) for stats in betting_results.values()],
    'Profit ($)': [stats['profit'] for stats in betting_results.values()],
    'ROI (%)': [stats['roi'] for stats in betting_results.values()],
}

# Build the DataFrame and style it: colour gradient on the numeric outcome
# columns, centred cells, two decimals, grey bold headers.
results_df = pd.DataFrame(results_table)
outcome_columns = ['Win Rate (%)', 'Profit ($)', 'ROI (%)']
header_style = {
    'selector': 'th',
    'props': [
        ('background-color', '#f2f2f2'),
        ('color', 'black'),
        ('font-weight', 'bold'),
        ('text-align', 'center'),
    ],
}
styled_df = (
    results_df.style
    .background_gradient(subset=outcome_columns, cmap='RdYlGn')
    .set_properties(**{'text-align': 'center'})
    .format({column: '{:.2f}' for column in outcome_columns})
    .set_table_styles([header_style])
    .set_caption('Betting Strategy Performance Comparison')
)

display(styled_df)

# Stop the Spark session when done
# NOTE(review): after this call the Spark DataFrames created above (df,
# train_processed, test_processed, ...) presumably can no longer be evaluated;
# re-create the session before re-running earlier cells - confirm for the
# target environment.
spark.stop()
