In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

print("🏈 EPL Match Predictor Starting...")

# Load and preprocess data
df = pd.read_csv("epl_final.csv")
print("✅ Data loaded successfully!")
print("Shape of dataset:", df.shape)

column_left = ['Season', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals',
               'FullTimeAwayGoals', 'FullTimeResult', 'HalfTimeHomeGoals',
               'HalfTimeAwayGoals', 'HalfTimeResult', 'HomeShots', 'AwayShots',
               'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners',
               'HomeFouls', 'AwayFouls', 'HomeYellowCards', 'AwayYellowCards',
               'HomeRedCards', 'AwayRedCards']
df = df[column_left]

# Encode FullTimeResult
result_encoder = LabelEncoder()
df['FullTimeResult'] = df['FullTimeResult'].astype(str)
result_encoder.fit(df['FullTimeResult'])

# Encode team names
team_encoder = LabelEncoder()
all_teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()
team_encoder.fit(all_teams)
df['HomeTeam'] = team_encoder.transform(df['HomeTeam'])
df['AwayTeam'] = team_encoder.transform(df['AwayTeam'])
TEAM_NAMES = team_encoder.classes_

# Feature engineering
df['GoalDiff'] = df['FullTimeHomeGoals'] - df['FullTimeAwayGoals']
df['HomeAdvantage'] = 1
df['ShotAccuracyHome'] = df['HomeShotsOnTarget'] / (df['HomeShots'] + 1)
df['ShotAccuracyAway'] = df['AwayShotsOnTarget'] / (df['AwayShots'] + 1)
df['TotalCards'] = df['HomeYellowCards'] + df['AwayYellowCards'] + df['HomeRedCards'] + df['AwayRedCards']

print("✅ Feature engineering completed!")

# Calculate team statistics
def calculate_team_stats(df):
    team_stats = {}
    
    for team_id in df['HomeTeam'].unique():
        home_games = df[df['HomeTeam'] == team_id]
        away_games = df[df['AwayTeam'] == team_id]
        
        # Home performance
        home_goals_scored = home_games['FullTimeHomeGoals'].mean()
        home_goals_conceded = home_games['FullTimeAwayGoals'].mean()
        home_wins = (home_games['FullTimeResult'] == result_encoder.transform(['H'])[0]).mean()
        
        # Away performance  
        away_goals_scored = away_games['FullTimeAwayGoals'].mean()
        away_goals_conceded = away_games['FullTimeHomeGoals'].mean()
        away_wins = (away_games['FullTimeResult'] == result_encoder.transform(['A'])[0]).mean()
        
        team_stats[team_id] = {
            'avg_goals_scored': (home_goals_scored + away_goals_scored) / 2,
            'avg_goals_conceded': (home_goals_conceded + away_goals_conceded) / 2,
            'win_rate': (home_wins + away_wins) / 2,
            'home_advantage': home_wins - away_wins
        }
    
    return team_stats

team_stats = calculate_team_stats(df)
print("✅ Team statistics calculated!")

# Add team strength features
df['HomeTeamStrength'] = df['HomeTeam'].map(lambda x: team_stats[x]['avg_goals_scored'])
df['AwayTeamStrength'] = df['AwayTeam'].map(lambda x: team_stats[x]['avg_goals_scored'])
df['HomeTeamDefense'] = df['HomeTeam'].map(lambda x: team_stats[x]['avg_goals_conceded'])
df['AwayTeamDefense'] = df['AwayTeam'].map(lambda x: team_stats[x]['avg_goals_conceded'])
df['HomeWinRate'] = df['HomeTeam'].map(lambda x: team_stats[x]['win_rate'])
df['AwayWinRate'] = df['AwayTeam'].map(lambda x: team_stats[x]['win_rate'])

# Add form features (simplified)
df['HomeForm'] = np.random.uniform(0.2, 0.8, len(df))
df['AwayForm'] = np.random.uniform(0.2, 0.8, len(df))

# Prepare features for modeling
feature_columns = [
    'HomeTeam', 'AwayTeam', 'HomeAdvantage',
    'HomeTeamStrength', 'AwayTeamStrength',
    'HomeTeamDefense', 'AwayTeamDefense',
    'HomeWinRate', 'AwayWinRate',
    'HomeForm', 'AwayForm'
]

X = df[feature_columns]
y = df['FullTimeResult']

print("✅ Features prepared!")
print("Feature matrix shape:", X.shape)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest model
print("\n🤖 Training Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print(f"✅ Random Forest Accuracy: {rf_accuracy:.3f}")

# Train Logistic Regression
print("🤖 Training Logistic Regression model...")
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)

print(f"✅ Logistic Regression Accuracy: {lr_accuracy:.3f}")

# Show model performance
print(f"\n📊 MODEL PERFORMANCE SUMMARY")
print(f"Random Forest Accuracy: {rf_accuracy:.1%}")
print(f"Logistic Regression Accuracy: {lr_accuracy:.1%}")
print(f"Training Data Size: {len(X_train)} matches")
print(f"Test Data Size: {len(X_test)} matches")

# Get team names for easy access
TEAM_NAMES = team_encoder.classes_
print(f"\n📋 Dataset contains {len(TEAM_NAMES)} teams")

# ========================================
# SIMPLE PREDICTION FUNCTION
# ========================================

def simple_predict(home_team_name, away_team_name):
    """
    🎯 SIMPLE PREDICTION FUNCTION
    Just write team names and get the winner!
    """
    
    # Find matching team names
    home_match = None
    away_match = None
    
    # Try to find home team
    for team in TEAM_NAMES:
        if home_team_name.lower() in team.lower() or team.lower() in home_team_name.lower():
            home_match = team
            break
    
    # Try to find away team
    for team in TEAM_NAMES:
        if away_team_name.lower() in team.lower() or team.lower() in away_team_name.lower():
            away_match = team
            break
    
    # Check if teams found
    if not home_match:
        print(f"❌ Home team '{home_team_name}' not found!")
        print("💡 Available teams:", ", ".join(TEAM_NAMES[:10]) + "...")
        return None
        
    if not away_match:
        print(f"❌ Away team '{away_team_name}' not found!")
        print("💡 Available teams:", ", ".join(TEAM_NAMES[:10]) + "...")
        return None
    
    try:
        # Encode teams
        home_encoded = team_encoder.transform([home_match])[0]
        away_encoded = team_encoder.transform([away_match])[0]
        
        # Create feature vector
        features = np.array([[
            home_encoded, away_encoded, 1,
            team_stats[home_encoded]['avg_goals_scored'],
            team_stats[away_encoded]['avg_goals_scored'],
            team_stats[home_encoded]['avg_goals_conceded'],
            team_stats[away_encoded]['avg_goals_conceded'],
            team_stats[home_encoded]['win_rate'],
            team_stats[away_encoded]['win_rate'],
            0.5, 0.5  # Form placeholders
        ]])
        
        # Make prediction
        prediction = rf_model.predict(features)[0]
        probabilities = rf_model.predict_proba(features)[0]
        
        # Map probabilities correctly using classes
        class_probs = dict(zip(rf_model.classes_, probabilities))
        
        # Determine winner
        if prediction == 'H':  # Home win
            winner = home_match
            result_text = f"🏠 {home_match} WINS!"
        elif prediction == 'A':  # Away win
            winner = away_match
            result_text = f"✈️ {away_match} WINS!"
        else:  # Draw
            winner = "Draw"
            result_text = "🤝 DRAW"
        
        confidence = max(probabilities)
        
        # Display result
        print(f"\n⚽ {home_match} vs {away_match}")
        print(f"🏆 PREDICTION: {result_text}")
        print(f"📊 Confidence: {confidence:.0%}")
        print(f"📈 Probabilities:")
        print(f"   🏠 {home_match}: {class_probs.get('H',0):.0%}")
        print(f"   🤝 Draw: {class_probs.get('D',0):.0%}")
        print(f"   ✈️ {away_match}: {class_probs.get('A',0):.0%}")
        
        # Confidence indicator
        if confidence > 0.6:
            print("🔥 HIGH CONFIDENCE")
        elif confidence > 0.4:
            print("⚡ MEDIUM CONFIDENCE")
        else:
            print("❓ LOW CONFIDENCE")
        
        return winner
        
    except Exception as e:
        print(f"❌ Error making prediction: {e}")
        return None

# Show available teams function
def show_teams():
    """Display all available teams"""
    print("\n📋 AVAILABLE TEAMS:")
    for i, team in enumerate(TEAM_NAMES, 1):
        print(f"{i:2d}. {team}")

print("\n" + "="*60)
print("🎉 EPL MATCH PREDICTOR IS READY!")
print("="*60)
print("\n💡 HOW TO USE:")
print("1. Predict single match:")
print("   simple_predict('Chelsea', 'Arsenal')")
print("\n2. See all teams:")
print("   show_teams()")

print(f"\n📊 Model trained on {len(df)} matches")
print(f"🎯 Accuracy: {rf_accuracy:.1%}")
print(f"📋 {len(TEAM_NAMES)} teams available")

print("\n🚀 EXAMPLE - Try this:")
print("simple_predict('Chelsea', 'Arsenal')")


🏈 EPL Match Predictor Starting...
✅ Data loaded successfully!
Shape of dataset: (9380, 22)
✅ Feature engineering completed!
✅ Team statistics calculated!
✅ Features prepared!
Feature matrix shape: (9380, 11)

🤖 Training Random Forest model...
✅ Random Forest Accuracy: 0.449
🤖 Training Logistic Regression model...
✅ Logistic Regression Accuracy: 0.530

📊 MODEL PERFORMANCE SUMMARY
Random Forest Accuracy: 44.9%
Logistic Regression Accuracy: 53.0%
Training Data Size: 7504 matches
Test Data Size: 1876 matches

📋 Dataset contains 46 teams

🎉 EPL MATCH PREDICTOR IS READY!

💡 HOW TO USE:
1. Predict single match:
   simple_predict('Chelsea', 'Arsenal')

2. See all teams:
   show_teams()

📊 Model trained on 9380 matches
🎯 Accuracy: 44.9%
📋 46 teams available

🚀 EXAMPLE - Try this:
simple_predict('Chelsea', 'Arsenal')


In [24]:
print(rf_model.classes_)


['A' 'D' 'H']


In [25]:
simple_predict('Liverpool','Everton')


⚽ Liverpool vs Everton
🏆 PREDICTION: 🏠 Liverpool WINS!
📊 Confidence: 76%
📈 Probabilities:
   🏠 Liverpool: 76%
   🤝 Draw: 23%
   ✈️ Everton: 1%
🔥 HIGH CONFIDENCE


'Liverpool'

In [3]:
simple_predict('Chelsea','MAN United')


⚽ Chelsea vs Man United
🏆 PREDICTION: 🤝 DRAW
📊 Confidence: 47%
📈 Probabilities:
   🏠 Chelsea: 37%
   🤝 Draw: 47%
   ✈️ Man United: 16%
⚡ MEDIUM CONFIDENCE


'Draw'

In [27]:
print(rf_model.classes_)

['A' 'D' 'H']


In [28]:
simple_predict('Brighton','Tottenham')


⚽ Brighton vs Tottenham
🏆 PREDICTION: ✈️ Tottenham WINS!
📊 Confidence: 48%
📈 Probabilities:
   🏠 Brighton: 40%
   🤝 Draw: 12%
   ✈️ Tottenham: 48%
⚡ MEDIUM CONFIDENCE


'Tottenham'

In [29]:
simple_predict('brighton','tottenham')


⚽ Brighton vs Tottenham
🏆 PREDICTION: ✈️ Tottenham WINS!
📊 Confidence: 48%
📈 Probabilities:
   🏠 Brighton: 40%
   🤝 Draw: 12%
   ✈️ Tottenham: 48%
⚡ MEDIUM CONFIDENCE


'Tottenham'