# In [None]:
from google.colab import files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

#  Upload dataset
# files.upload() returns a mapping keyed by uploaded file name; the first key
# is then handed to pandas as a path, so this relies on the upload also being
# available under that name in the working directory.
uploaded = files.upload()
if not uploaded:
    # Nothing was selected (e.g. the picker was cancelled): stop with a clear
    # message instead of an IndexError from indexing an empty key list.
    raise ValueError("No file was uploaded; select a CSV file and re-run.")
filename = next(iter(uploaded))  # first uploaded file; any others are ignored
df = pd.read_csv(filename)

#  Data Cleaning
# Treat +/-inf as missing so they are imputed together with real NaNs.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Impute numeric columns with their column mean. numeric_only=True is needed
# because the frame may still hold text columns at this point (they are only
# encoded below); without it pandas >= 2.0 raises TypeError on such columns.
df.fillna(df.mean(numeric_only=True), inplace=True)

#  Encode categorical variables
# Every object-dtype column is replaced by integer codes.
categorical_cols = df.select_dtypes(include=['object']).columns
# One encoder instance is refit for each column, so after the loop it only
# remembers the classes of the last column processed.
label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

#  Feature Engineering
# Signed score margin: positive when the home side scored more.
df["goal_diff"] = df["home_score"].sub(df["away_score"])

#  Define features (X) and target (y)
# The target column is removed from the feature matrix; every other column
# of df (including the new goal_diff) is used as a feature.
# NOTE(review): home_score, away_score and goal_diff stay in X. If
# home_team_victory is derived from the final score these features leak the
# target and the metrics will be inflated - confirm this is intended.
X = df.drop("home_team_victory", axis=1)
y = df["home_team_victory"]

#  Train-Test Split
# 80/20 split, stratified on y, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

#  Train Logistic Regression (FAST METHOD)
# fit() returns the estimator itself, so construction and training are chained.
logit_model = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    max_iter=300,
).fit(X_train, y_train)

#  Predict
# Hard class labels for the held-out rows.
y_pred_class = logit_model.predict(X_test)
# Column 1 of predict_proba: probability of the second class.
y_pred_prob = logit_model.predict_proba(X_test)[:, 1]

#  Evaluate Performance
# Accuracy and the confusion matrix use the hard labels; ROC AUC uses the
# predicted probabilities.
accuracy = accuracy_score(y_test, y_pred_class)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred_class)

# The two scalar metrics are emitted in one call, one per line.
print(
    f"\n✔ Accuracy: {accuracy:.4f}",
    f"✔ ROC AUC Score: {roc_auc:.4f}",
    sep="\n",
)
print("✔ Confusion Matrix:\n", conf_matrix)

#  Visualization
# Box plot of goal_diff grouped by the target value, drawn on an explicit
# Axes object rather than through the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=y, y=df["goal_diff"], palette="coolwarm", ax=ax)
ax.set_xlabel("Home Team Victory (0 = Loss, 1 = Win)")
ax.set_ylabel("Goal Difference")
ax.set_title("Impact of Goal Difference on Home Team Victory")
ax.grid(True)
plt.show()