In [1]:
# PART 1: DATA COLLECTION AND CSV EXPORT
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class FBRefCSVDownloader:
    """
    Download FBRef data and save as CSV files for notebook use
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def download_team_stats_csv(self, team_name, team_id, season="2024-2025"):
        """
        Download team statistics and save as CSV
        """
        print(f"📥 Downloading {team_name} statistics...")

        url = f"https://fbref.com/en/squads/{team_id}/{season}/{team_name}-Stats"

        try:
            response = requests.get(url, headers=self.headers)
            time.sleep(3)

            soup = BeautifulSoup(response.content, 'html.parser')
            all_stats = {}

            # Download multiple stat tables
            table_ids = {
                'standard': 'stats_standard',
                'shooting': 'stats_shooting',
                'passing': 'stats_passing',
                'defense': 'stats_defense',
                'possession': 'stats_possession'
            }

            for stat_type, table_id in table_ids.items():
                try:
                    table = soup.find('table', {'id': table_id})
                    if table:
                        df = pd.read_html(str(table))[0]

                        # Clean multi-level columns if they exist
                        if isinstance(df.columns, pd.MultiIndex):
                            df.columns = ['_'.join(col).strip() for col in df.columns]

                        # Add metadata
                        df['Team'] = team_name
                        df['Season'] = season
                        df['StatType'] = stat_type

                        # Save individual CSV
                        filename = f"{team_name.lower()}_{stat_type}_{season.replace('-', '_')}.csv"
                        df.to_csv(filename, index=False)
                        print(f"✅ Saved {filename}")

                        all_stats[stat_type] = df

                except Exception as e:
                    print(f"❌ Failed to get {stat_type} for {team_name}: {e}")

            # Save combined stats file
            if all_stats:
                combined_filename = f"{team_name.lower()}_all_stats_{season.replace('-', '_')}.csv"

                # Combine all stats with common columns
                base_df = all_stats['standard'] if 'standard' in all_stats else pd.DataFrame()

                for stat_type, df in all_stats.items():
                    if stat_type != 'standard' and not df.empty:
                        # Merge on common columns (usually Player name)
                        common_cols = set(base_df.columns) & set(df.columns)
                        if 'Player' in common_cols or any('Player' in col for col in common_cols):
                            try:
                                base_df = base_df.merge(df, on=list(common_cols), how='outer', suffixes=('', f'_{stat_type}'))
                            except:
                                # If merge fails, just concatenate
                                pass

                base_df.to_csv(combined_filename, index=False)
                print(f"✅ Saved combined: {combined_filename}")

            return all_stats

        except Exception as e:
            print(f"❌ Error downloading {team_name} data: {e}")
            return None

    def download_match_logs_csv(self, team_name, team_id, seasons=['2023-2024', '2024-2025']):
        """
        Download match history and save as CSV
        """
        print(f"📥 Downloading {team_name} match logs...")

        all_matches = []

        for season in seasons:
            try:
                url = f"https://fbref.com/en/squads/{team_id}/{season}/matchlogs/all_comps/schedule/{team_name}-Scores-and-Fixtures-All-Competitions"

                response = requests.get(url, headers=self.headers)
                time.sleep(3)

                # Try to find the fixtures table
                tables = pd.read_html(response.content)

                if tables:
                    matches_df = tables[0]  # Usually the first table

                    # Clean columns
                    if isinstance(matches_df.columns, pd.MultiIndex):
                        matches_df.columns = ['_'.join(col).strip() for col in matches_df.columns]

                    # Add metadata
                    matches_df['Team'] = team_name
                    matches_df['Season'] = season

                    all_matches.append(matches_df)
                    print(f"✅ {season}: {len(matches_df)} matches")

            except Exception as e:
                print(f"❌ Failed to get {season} matches for {team_name}: {e}")

        if all_matches:
            combined_matches = pd.concat(all_matches, ignore_index=True)
            filename = f"{team_name.lower()}_match_logs.csv"
            combined_matches.to_csv(filename, index=False)
            print(f"✅ Saved {filename}")
            return combined_matches

        return None

    def create_training_dataset_csv(self):
        """
        Create a training dataset with similar matchups for ML
        """
        print("📥 Creating training dataset...")

        # This creates a synthetic training dataset based on typical football patterns
        # In practice, you'd collect this from multiple teams and leagues

        np.random.seed(42)
        n_matches = 1000

        # Generate realistic football match data
        training_data = []

        for i in range(n_matches):
            # Team strengths (0-100 scale)
            team1_strength = np.random.normal(75, 15)  # Stronger teams like Real Madrid
            team2_strength = np.random.normal(55, 12)  # Weaker teams like Kairat

            # Ensure realistic bounds
            team1_strength = np.clip(team1_strength, 40, 95)
            team2_strength = np.clip(team2_strength, 30, 85)

            # Generate match features
            strength_diff = team1_strength - team2_strength
            home_advantage = np.random.choice([0, 1], p=[0.5, 0.5])  # 0=away, 1=home for team1
            competition_level = np.random.choice([6, 7, 8, 9], p=[0.3, 0.3, 0.3, 0.1])  # UEFA competition levels

            # Team statistics based on strength
            team1_goals_per_game = 1.2 + (team1_strength / 100) * 1.8 + np.random.normal(0, 0.2)
            team2_goals_per_game = 0.8 + (team2_strength / 100) * 1.5 + np.random.normal(0, 0.2)

            team1_goals_against = 0.5 + ((100 - team1_strength) / 100) * 1.2 + np.random.normal(0, 0.15)
            team2_goals_against = 0.7 + ((100 - team2_strength) / 100) * 1.4 + np.random.normal(0, 0.15)

            # Shot statistics
            team1_shots_per_game = 10 + (team1_strength / 100) * 8 + np.random.normal(0, 1)
            team2_shots_per_game = 8 + (team2_strength / 100) * 6 + np.random.normal(0, 1)

            team1_shot_accuracy = 30 + (team1_strength / 100) * 25 + np.random.normal(0, 3)
            team2_shot_accuracy = 25 + (team2_strength / 100) * 20 + np.random.normal(0, 3)

            # Possession
            team1_possession = 45 + (team1_strength / 100) * 20 + np.random.normal(0, 3)
            team2_possession = 100 - team1_possession

            # Determine outcome based on probabilities influenced by strength difference
            if strength_diff > 20:
                outcome_probs = [0.75, 0.15, 0.10]  # Strong favorite
            elif strength_diff > 10:
                outcome_probs = [0.60, 0.25, 0.15]  # Moderate favorite
            elif strength_diff > 0:
                outcome_probs = [0.50, 0.30, 0.20]  # Slight favorite
            else:
                outcome_probs = [0.35, 0.30, 0.35]  # Even match

            # Add home advantage effect
            if home_advantage == 1:  # Team1 at home
                outcome_probs = [outcome_probs[0] + 0.1, outcome_probs[1], outcome_probs[2] - 0.1]

            outcome = np.random.choice([0, 1, 2], p=outcome_probs)  # 0=Team1 wins, 1=Draw, 2=Team2 wins

            # Create row
            row = {
                'team1_strength': team1_strength,
                'team2_strength': team2_strength,
                'strength_difference': strength_diff,
                'home_advantage': home_advantage,
                'competition_level': competition_level,
                'team1_goals_per_game': max(0, team1_goals_per_game),
                'team2_goals_per_game': max(0, team2_goals_per_game),
                'team1_goals_against_per_game': max(0, team1_goals_against),
                'team2_goals_against_per_game': max(0, team2_goals_against),
                'team1_shots_per_game': max(0, team1_shots_per_game),
                'team2_shots_per_game': max(0, team2_shots_per_game),
                'team1_shot_accuracy': np.clip(team1_shot_accuracy, 10, 60),
                'team2_shot_accuracy': np.clip(team2_shot_accuracy, 10, 60),
                'team1_possession': np.clip(team1_possession, 25, 75),
                'team2_possession': np.clip(team2_possession, 25, 75),
                'outcome': outcome
            }

            training_data.append(row)

        # Create DataFrame and save
        training_df = pd.DataFrame(training_data)
        training_df.to_csv('football_training_dataset.csv', index=False)
        print(f"✅ Saved football_training_dataset.csv ({len(training_df)} matches)")

        # Show distribution
        print("\nOutcome distribution:")
        print(training_df['outcome'].value_counts())
        print("0 = Team1 wins, 1 = Draw, 2 = Team2 wins")

        return training_df

def download_all_data():
    """
    Download all required data and save as CSV files
    """
    downloader = FBRefCSVDownloader()

    print("🚀 Starting data download...")
    print("=" * 50)

    # Team IDs from FBRef
    teams = {
        'Real-Madrid': '53a2f082',
        'Qairat-Almaty': '768fb565'  # Kairat's FBRef ID
    }

    # Download team statistics
    for team_name, team_id in teams.items():
        print(f"\n📊 Downloading {team_name} data...")
        downloader.download_team_stats_csv(team_name, team_id)
        downloader.download_match_logs_csv(team_name, team_id)

    # Create training dataset
    print(f"\n🎯 Creating training dataset...")
    training_df = downloader.create_training_dataset_csv()

    print(f"\n✅ All data downloaded! Files created:")
    print("- real-madrid_*_stats.csv")
    print("- qairat-almaty_*_stats.csv")
    print("- real-madrid_match_logs.csv")
    print("- qairat-almaty_match_logs.csv")
    print("- football_training_dataset.csv")

# ============================================================================
# PART 2: NOTEBOOK CODE - USE THIS IN YOUR JUPYTER NOTEBOOK
# ============================================================================

"""
JUPYTER NOTEBOOK CODE - Copy this section into your notebook
"""

def load_and_prepare_data():
    """
    Load CSV files and prepare for ML - USE THIS IN YOUR NOTEBOOK
    """
    print("📂 Loading CSV files...")

    try:
        # Load training dataset
        training_df = pd.read_csv('football_training_dataset.csv')
        print(f"✅ Loaded training data: {training_df.shape}")

        # Load team statistics
        real_madrid_stats = {}
        kairat_stats = {}

        # Try to load Real Madrid stats
        try:
            real_madrid_stats['standard'] = pd.read_csv('real-madrid_standard_2024_2025.csv')
            print("✅ Loaded Real Madrid standard stats")
        except:
            print("⚠️  Real Madrid standard stats not found")

        try:
            real_madrid_stats['shooting'] = pd.read_csv('real-madrid_shooting_2024_2025.csv')
            print("✅ Loaded Real Madrid shooting stats")
        except:
            print("⚠️  Real Madrid shooting stats not found")

        # Try to load Kairat stats
        try:
            kairat_stats['standard'] = pd.read_csv('qairat-almaty_standard_2024_2025.csv')
            print("✅ Loaded Kairat standard stats")
        except:
            print("⚠️  Kairat standard stats not found")

        try:
            kairat_stats['shooting'] = pd.read_csv('qairat-almaty_shooting_2024_2025.csv')
            print("✅ Loaded Kairat shooting stats")
        except:
            print("⚠️  Kairat shooting stats not found")

        # Load match logs
        try:
            real_matches = pd.read_csv('real-madrid_match_logs.csv')
            print(f"✅ Loaded Real Madrid matches: {len(real_matches)}")
        except:
            print("⚠️  Real Madrid match logs not found")
            real_matches = None

        try:
            kairat_matches = pd.read_csv('qairat-almaty_match_logs.csv')
            print(f"✅ Loaded Kairat matches: {len(kairat_matches)}")
        except:
            print("⚠️  Kairat match logs not found")
            kairat_matches = None

        return {
            'training_data': training_df,
            'real_madrid_stats': real_madrid_stats,
            'kairat_stats': kairat_stats,
            'real_matches': real_matches,
            'kairat_matches': kairat_matches
        }

    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return None

def train_prediction_model(training_df):
    """
    Train ML models on the loaded data - USE THIS IN YOUR NOTEBOOK
    """
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    print("🤖 Training ML models...")

    # Prepare features and target
    feature_columns = [col for col in training_df.columns if col != 'outcome']
    X = training_df[feature_columns]
    y = training_df['outcome']

    print(f"Features: {len(feature_columns)}")
    print(f"Samples: {len(X)}")
    print(f"Target distribution: {y.value_counts().to_dict()}")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train multiple models
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
    }

    results = {}

    for name, model in models.items():
        print(f"\n📊 Training {name}...")

        # Train
        model.fit(X_train_scaled, y_train)

        # Evaluate
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        # Cross-validation
        cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)

        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std()
        }

        print(f"Accuracy: {accuracy:.3f}")
        print(f"CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    # Select best model
    best_model_name = max(results.keys(), key=lambda k: results[k]['cv_mean'])
    best_model = results[best_model_name]['model']

    print(f"\n🏆 Best model: {best_model_name}")

    return {
        'models': results,
        'best_model': best_model,
        'scaler': scaler,
        'feature_columns': feature_columns,
        'best_model_name': best_model_name
    }

def predict_real_madrid_vs_kairat(model_package, real_stats, kairat_stats):
    """
    Make prediction for Real Madrid vs Kairat - USE THIS IN YOUR NOTEBOOK
    """
    print("🔮 Predicting Real Madrid vs Kairat...")

    # Create features for the matchup
    features = create_match_features(real_stats, kairat_stats)

    # Convert to array
    feature_array = np.array([list(features.values())]).reshape(1, -1)

    # Scale features
    feature_array_scaled = model_package['scaler'].transform(feature_array)

    # Make prediction
    prediction = model_package['best_model'].predict(feature_array_scaled)[0]
    probabilities = model_package['best_model'].predict_proba(feature_array_scaled)[0]

    outcomes = ['Real Madrid Win', 'Draw', 'Kairat Win']

    result = {
        'predicted_outcome': outcomes[prediction],
        'probabilities': {
            'Real Madrid Win': probabilities[0],
            'Draw': probabilities[1],
            'Kairat Win': probabilities[2]
        },
        'confidence': max(probabilities),
        'features_used': features
    }

    return result

def create_match_features(real_stats, kairat_stats):
    """
    Create features for Real Madrid vs Kairat prediction
    """
    # This creates mock features - replace with actual stats from your CSV files
    features = {
        'team1_strength': 85,  # Real Madrid strength
        'team2_strength': 55,  # Kairat strength
        'strength_difference': 30,
        'home_advantage': 0,  # 0 = neutral, 1 = Kairat home
        'competition_level': 8,  # Champions League level
        'team1_goals_per_game': 2.3,
        'team2_goals_per_game': 1.1,
        'team1_goals_against_per_game': 0.8,
        'team2_goals_against_per_game': 1.4,
        'team1_shots_per_game': 16.5,
        'team2_shots_per_game': 10.2,
        'team1_shot_accuracy': 45.2,
        'team2_shot_accuracy': 32.1,
        'team1_possession': 62.5,
        'team2_possession': 37.5
    }

    # TODO: Replace these mock values with actual calculations from your CSV data
    # Example:
    # if 'standard' in real_stats:
    #     rm_df = real_stats['standard']
    #     features['team1_goals_per_game'] = rm_df['Gls'].iloc[0] / rm_df['MP'].iloc[0]

    return features

# COMPLETE NOTEBOOK WORKFLOW
def complete_prediction_workflow():
    """
    Complete workflow for your Jupyter notebook - USE THIS AS MAIN FUNCTION
    """
    print("🚀 REAL MADRID vs KAIRAT PREDICTION WORKFLOW")
    print("=" * 60)

    # Step 1: Load data
    data = load_and_prepare_data()
    if not data:
        print("❌ Failed to load data. Please run the download script first.")
        return

    # Step 2: Train models
    model_package = train_prediction_model(data['training_data'])

    # Step 3: Make prediction
    prediction = predict_real_madrid_vs_kairat(
        model_package,
        data['real_madrid_stats'],
        data['kairat_stats']
    )

    # Step 4: Display results
    print(f"\n🎯 PREDICTION RESULTS")
    print("=" * 40)
    print(f"Predicted Outcome: {prediction['predicted_outcome']}")
    print(f"Model Used: {model_package['best_model_name']}")
    print(f"Confidence: {prediction['confidence']:.3f}")

    print(f"\n📊 PROBABILITIES:")
    for outcome, prob in prediction['probabilities'].items():
        print(f"  {outcome}: {prob:.3f} ({prob*100:.1f}%)")

    print(f"\n📈 FEATURES USED:")
    for feature, value in prediction['features_used'].items():
        print(f"  {feature}: {value}")

    return prediction

if __name__ == "__main__":
    # Run this to download all CSV files
    download_all_data()

🚀 Starting data download...

📊 Downloading Real-Madrid data...
📥 Downloading Real-Madrid statistics...
📥 Downloading Real-Madrid match logs...
❌ Failed to get 2023-2024 matches for Real-Madrid: No tables found
❌ Failed to get 2024-2025 matches for Real-Madrid: No tables found

📊 Downloading Qairat-Almaty data...
📥 Downloading Qairat-Almaty statistics...
📥 Downloading Qairat-Almaty match logs...
❌ Failed to get 2023-2024 matches for Qairat-Almaty: No tables found
❌ Failed to get 2024-2025 matches for Qairat-Almaty: No tables found

🎯 Creating training dataset...
📥 Creating training dataset...
✅ Saved football_training_dataset.csv (1000 matches)

Outcome distribution:
outcome
0    682
1    206
2    112
Name: count, dtype: int64
0 = Team1 wins, 1 = Draw, 2 = Team2 wins

✅ All data downloaded! Files created:
- real-madrid_*_stats.csv
- qairat-almaty_*_stats.csv
- real-madrid_match_logs.csv
- qairat-almaty_match_logs.csv
- football_training_dataset.csv
