In [None]:
# Google Colab: Upload Local Files
from google.colab import files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm  # Import for logistic regression
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# ✅ Upload dataset from local machine
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; select a CSV file and re-run this cell.")

# ✅ Load dataset
filename = next(iter(uploaded))  # Name of the first uploaded file
df = pd.read_csv(filename)  # Load the CSV file

# ✅ Display dataset info
print("\n📊 Dataset Overview:\n")
# info() writes the summary itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df.info()
print("\n🔍 First few rows of the dataset:\n")
print(df.head())  # Show first few rows

# --- 🛠 DATA CLEANING ---
# Convert 'date' column to datetime format if it exists
# (errors="coerce" turns unparseable values into NaT instead of raising)
if "date" in df.columns:
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Replace infinite values with NaN so they are imputed like any other missing value
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill missing numeric values with mean
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill missing categorical values with mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    if modes.empty:
        # A column with no non-missing values has no mode; indexing [0] would raise.
        print(f"⚠️ Column '{col}' has no non-missing values; leaving it unfilled.")
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Ensure 'home_team_victory' is binary (0 or 1)
df["home_team_victory"] = df["home_team_victory"].astype(int)

print("\n✅ Data cleaned and missing values handled.\n")

# --- 🏆 FEATURE ENGINEERING ---
# Score-derived features: home-minus-away margin and the product of both scores
home_goals = df["home_score"]
away_goals = df["away_score"]
df = df.assign(
    goal_diff=home_goals - away_goals,
    score_interaction=home_goals * away_goals,
)

# One-hot encode the tournament and team labels, dropping the first level of each
one_hot_cols = ["tournament_name", "home_team", "away_team"]
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

# Rename 'date' column to avoid conflicts
if "date" in df.columns:
    df = df.rename(columns={"date": "match_date"})

print("\n✅ Feature Engineering Completed!")

# --- 🔍 MODEL TRAINING (Logistic Regression) ---
# Define independent (X) and dependent (y) variables
# NOTE(review): X keeps the score-derived columns (home_score, away_score,
# goal_diff, score_interaction) and 'draw', which look like they determine the
# target directly — confirm that using them as predictors is intended.
X = df.drop(columns=["home_team_victory"])  # Features
y = df["home_team_victory"]  # Target variable

# Object columns: parse as numbers where possible, otherwise one-hot encode.
# The column list is computed once up front, so rebinding X inside the loop is safe.
for col in X.select_dtypes(include=['object']).columns:
    try:
        X[col] = pd.to_numeric(X[col])
    except (ValueError, TypeError):
        print(f"⚠️ Could not convert column '{col}' to numeric. Creating dummy variables.")
        X = pd.get_dummies(X, columns=[col], drop_first=True, dummy_na=False)  # Handle non-convertible columns

# Datetime columns are not 'object' dtype, so the loop above never sees them.
# Express them as whole days since 1970-01-01; NaT becomes NaN and is imputed below.
for col in X.select_dtypes(include=["datetime64"]).columns:
    X[col] = (X[col] - pd.Timestamp("1970-01-01")).dt.days

# Boolean columns (flags from the CSV and any boolean dummy columns) are not
# counted as numeric by select_dtypes(np.number); store them as 0/1 integers.
bool_cols = X.select_dtypes(include=["bool"]).columns
X = X.astype({col: int for col in bool_cols})

# ✅ Fill NaN values (introduced by conversion) with the mean of each column
X = X.fillna(X.mean(numeric_only=True))

# Final check: anything that is still not a numeric dtype (not just 'object')
leftover_cols = X.select_dtypes(exclude=[np.number]).columns
if len(leftover_cols) > 0:
    print("\n🚨 Warning: Non-numeric columns detected even after encoding!")
    print(leftover_cols)
    print("\n Manually converting to category codes...\n")

    # Convert remaining non-numeric columns into numerical category codes
    for col in leftover_cols:
        X[col] = X[col].astype('category').cat.codes

print("\n✅ All features are now numeric.")

# ✅ Train-Test Split (80% train, 20% test)
# stratify=y keeps the class ratio of the target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ✅ Check for NaN values before fitting (either split may contain them)
if X_train.isnull().values.any() or X_test.isnull().values.any():
    print("\n🚨 ERROR: NaN values detected in training data! Fixing...")
    # Impute both splits with statistics computed on the training split only,
    # so no information from the test rows feeds into the imputation values.
    train_means = X_train.mean(numeric_only=True)
    X_train = X_train.fillna(train_means)  # Fill remaining NaNs
    X_test = X_test.fillna(train_means)

# --- 🚀 Fit the Logistic Regression Model ---
# Work on copies so the column assignments below never write into a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()

# ✅ Debugging step: Print any remaining non-numeric columns
non_numeric_cols = X_train.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) > 0:
    print("\n🚨 WARNING: Non-numeric columns found! Fixing now...\n", non_numeric_cols)

    # Convert categorical columns to numeric codes. The code table is built from
    # the labels of BOTH splits and applied to both, so a given label maps to the
    # same integer in the training and the test matrix.
    for col in non_numeric_cols:
        train_labels = X_train[col].astype(str)
        test_labels = X_test[col].astype(str)
        categories = sorted(set(train_labels) | set(test_labels))
        X_train[col] = pd.Categorical(train_labels, categories=categories).codes
        X_test[col] = pd.Categorical(test_labels, categories=categories).codes

    print("\n✅ Non-numeric columns converted to numeric.")
else:
    print("\n✅ No non-numeric columns found. Proceeding with training.")

# ✅ Add a constant to the independent variables (required for statsmodels)
X_train_const = sm.add_constant(X_train).astype(float)
# Align the test design matrix to the training columns: add_constant decides
# per call whether to add 'const', so the two splits could otherwise differ.
X_test_const = (
    sm.add_constant(X_test, has_constant="add")
    .reindex(columns=X_train_const.columns)
    .astype(float)
)

# ✅ Train the logistic regression model
logit_model = sm.Logit(y_train, X_train_const)
logit_results = logit_model.fit()

# ✅ Display Model Summary
print("\n📊 Model Summary:")
print(logit_results.summary())

# ✅ Print AIC score (goal is to minimize this)
print("\n🎯 AIC Score:", logit_results.aic)

# --- 📈 MODEL EVALUATION ---
# Predict on Test Set
y_pred_prob = logit_results.predict(X_test_const)
y_pred_class = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary

# ✅ Evaluate Performance
accuracy = accuracy_score(y_test, y_pred_class)
conf_matrix = confusion_matrix(y_test, y_pred_class)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print("\n✅ Model Performance Metrics:")
print(f"✔ Test Set Accuracy: {accuracy:.4f}")
print(f"✔ ROC AUC Score: {roc_auc:.4f}")
print("✔ Confusion Matrix:\n", conf_matrix)

# --- 📊 VISUALIZATION ---
# Box plot of goal_diff for each value of home_team_victory, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x="home_team_victory", y="goal_diff", palette="coolwarm", ax=ax)
ax.set_xlabel("Home Team Victory (0 = Loss, 1 = Win)")
ax.set_ylabel("Goal Difference")
ax.set_title("Impact of Goal Difference on Home Team Victory")
ax.grid(True)
plt.show()

Saving training.csv to training.csv

📊 Dataset Overview:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23719 entries, 0 to 23718
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               23719 non-null  object 
 1   home_team          23719 non-null  object 
 2   away_team          23719 non-null  object 
 3   home_score         23719 non-null  int64  
 4   away_score         23719 non-null  int64  
 5   home_team_victory  23719 non-null  bool   
 6   draw               23719 non-null  bool   
 7   tournament_name    23719 non-null  object 
 8   city               23719 non-null  object 
 9   country            23719 non-null  object 
 10  neutral            23719 non-null  bool   
 11  scoring_team       22696 non-null  object 
 12  scoring_player     22696 non-null  object 
 13  minute             22592 non-null  float64
 14  own_goal           22696 non-null  object 
 15  penalty     

  df[col] = df[col].fillna(df[col].mode()[0])



✅ Feature Engineering Completed!
⚠️ Could not convert column 'city' to numeric. Creating dummy variables.
⚠️ Could not convert column 'country' to numeric. Creating dummy variables.
⚠️ Could not convert column 'scoring_team' to numeric. Creating dummy variables.
⚠️ Could not convert column 'scoring_player' to numeric. Creating dummy variables.
⚠️ Could not convert column 'first_shooter' to numeric. Creating dummy variables.
⚠️ Could not convert column 'winner' to numeric. Creating dummy variables.

✅ All features are now numeric.

 Index(['match_date', 'draw', 'neutral', 'own_goal', 'penalty',
       'shootout_required', 'home_team_Albania', 'home_team_Algeria',
       'home_team_American Samoa', 'home_team_Andorra',
       ...
       'winner_British Virgin Islands', 'winner_China PR', 'winner_Egypt',
       'winner_Morocco', 'winner_Nigeria', 'winner_Saint Lucia',
       'winner_Senegal', 'winner_Suriname', 'winner_Tanzania',
       'winner_Tunisia'],
      dtype='object', length=112