In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier, XGBRegressor
import joblib
from datetime import datetime

# --- Data Preprocessing ---

def load_and_preprocess_data(file_path='/workspaces/codespaces-blank/IPL_2025_Predictions/ipl_2025_all.csv'):
    """Load and preprocess IPL match data."""
    try:
        df = pd.read_csv(file_path)
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.sort_values('Date').dropna(subset=['Date', 'Teams', 'Toss_Winner', 'Toss_Decision', 
                                                  'Venue', 'Match_Winner', 'First_Innings_Score', 
                                                  'Second_Innings_Score'])
        
        # Extract Team1 and Team2
        df['Team1'] = df['Teams'].apply(lambda x: x.split(' vs ')[0].strip())
        df['Team2'] = df['Teams'].apply(lambda x: x.split(' vs ')[1].strip())
        
        # Fix batting order
        df['batting_first_team'] = np.where(df['Toss_Decision'] == 'bat', df['Toss_Winner'],
                                            np.where(df['Toss_Winner'] == df['Team1'], df['Team2'], df['Team1']))
        df['batting_second_team'] = np.where(df['batting_first_team'] == df['Team1'], df['Team2'], df['Team1'])
        
        # Target for winner prediction
        df['batting_first_wins'] = (df['Match_Winner'] == df['batting_first_team']).astype(int)
        
        # Validate scores for T20
        df = df[(df['First_Innings_Score'] >= 50) & (df['First_Innings_Score'] <= 300) &
                (df['Second_Innings_Score'] >= 50) & (df['Second_Innings_Score'] <= 300)]
        if df.empty:
            raise ValueError("No valid data after filtering unrealistic scores (50-300 runs expected for T20)")
        
        print(f"Data loaded: {len(df)} matches after cleaning")
        print(f"First innings score range: {df['First_Innings_Score'].min()} to {df['First_Innings_Score'].max()}")
        print(f"Second innings score range: {df['Second_Innings_Score'].min()} to {df['Second_Innings_Score'].max()}")
        print(f"Mean first innings score: {df['First_Innings_Score'].mean():.2f}, Std: {df['First_Innings_Score'].std():.2f}")
        print(f"Mean second innings score: {df['Second_Innings_Score'].mean():.2f}, Std: {df['Second_Innings_Score'].std():.2f}")
        
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

# --- Feature Engineering ---

def feature_engineering(df):
    """Engineer features emphasizing team vs. team and venue performance."""
    teams = pd.concat([df['Team1'], df['Team2']]).unique()
    venues = df['Venue'].unique()
    team_stats = {team: {'matches': 0, 'wins': 0, 'runs_scored': 0, 'runs_conceded': 0, 
                        'recent_wins': [], 'recent_scores': [], 
                        'vs_opponent_wins': {t: 0 for t in teams}, 
                        'vs_opponent_matches': {t: 0 for t in teams},
                        'vs_opponent_scores': {t: [] for t in teams},
                        'venue_wins': {v: 0 for v in venues},
                        'venue_matches': {v: 0 for v in venues}} for team in teams}
    venue_stats = {venue: {'matches': 0, 'total_runs': 0, 'first_innings_avg': 0, 
                         'second_innings_avg': 0, 'toss_bat_win_rate': 0} for venue in venues}
    
    # Feature lists
    bf_win_rate, bs_win_rate = [], []
    bf_avg_score, bs_avg_score = [], []
    bf_runs_conceded, bs_runs_conceded = [], []
    bf_recent_form, bs_recent_form = [], []
    bf_vs_bs_win_rate, bf_vs_bs_avg_score = [], []
    bf_venue_win_rate, bs_venue_win_rate = [], []
    bf_recent_scoring, bs_recent_scoring = [], []
    venue_first_avg, venue_second_avg = [], []
    venue_toss_bat_win = []
    toss_impact = []
    
    for _, row in df.iterrows():
        bf, bs = row['batting_first_team'], row['batting_second_team']
        venue = row['Venue']
        match_date = row['Date']
        
        # Pre-match stats (based on matches before this date)
        bf_wr = team_stats[bf]['wins'] / team_stats[bf]['matches'] if team_stats[bf]['matches'] > 0 else 0.5
        bs_wr = team_stats[bs]['wins'] / team_stats[bs]['matches'] if team_stats[bs]['matches'] > 0 else 0.5
        bf_as = team_stats[bf]['runs_scored'] / team_stats[bf]['matches'] if team_stats[bf]['matches'] > 0 else 150
        bs_as = team_stats[bs]['runs_scored'] / team_stats[bs]['matches'] if team_stats[bs]['matches'] > 0 else 150
        bf_rc = team_stats[bf]['runs_conceded'] / team_stats[bf]['matches'] if team_stats[bf]['matches'] > 0 else 150
        bs_rc = team_stats[bs]['runs_conceded'] / team_stats[bs]['matches'] if team_stats[bs]['matches'] > 0 else 150
        bf_rf = sum(team_stats[bf]['recent_wins']) / 5 if len(team_stats[bf]['recent_wins']) >= 5 else 0.5
        bs_rf = sum(team_stats[bs]['recent_wins']) / 5 if len(team_stats[bs]['recent_wins']) >= 5 else 0.5
        bf_vs_bs_wr = team_stats[bf]['vs_opponent_wins'][bs] / team_stats[bf]['vs_opponent_matches'][bs] if team_stats[bf]['vs_opponent_matches'][bs] > 0 else 0.5
        bf_vs_bs_as = sum(team_stats[bf]['vs_opponent_scores'][bs]) / len(team_stats[bf]['vs_opponent_scores'][bs]) if team_stats[bf]['vs_opponent_scores'][bs] else 150
        bf_vwr = team_stats[bf]['venue_wins'][venue] / team_stats[bf]['venue_matches'][venue] if team_stats[bf]['venue_matches'][venue] > 0 else 0.5
        bs_vwr = team_stats[bs]['venue_wins'][venue] / team_stats[bs]['venue_matches'][venue] if team_stats[bs]['venue_matches'][venue] > 0 else 0.5
        bf_rs = sum(team_stats[bf]['recent_scores']) / 5 if len(team_stats[bf]['recent_scores']) >= 5 else 150
        bs_rs = sum(team_stats[bs]['recent_scores']) / 5 if len(team_stats[bs]['recent_scores']) >= 5 else 150
        v_fa = venue_stats[venue]['first_innings_avg'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 150
        v_sa = venue_stats[venue]['second_innings_avg'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 140
        v_tbw = venue_stats[venue]['toss_bat_win_rate'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 0.5
        ti = 1 if row['Toss_Decision'] == 'bat' else 0
        
        # Append features
        bf_win_rate.append(bf_wr)
        bs_win_rate.append(bs_wr)
        bf_avg_score.append(bf_as)
        bs_avg_score.append(bs_as)
        bf_runs_conceded.append(bf_rc)
        bs_runs_conceded.append(bs_rc)
        bf_recent_form.append(bf_rf)
        bs_recent_form.append(bs_rf)
        bf_vs_bs_win_rate.append(bf_vs_bs_wr)
        bf_vs_bs_avg_score.append(bf_vs_bs_as)
        bf_venue_win_rate.append(bf_vwr)
        bs_venue_win_rate.append(bs_vwr)
        bf_recent_scoring.append(bf_rs)
        bs_recent_scoring.append(bs_rs)
        venue_first_avg.append(v_fa)
        venue_second_avg.append(v_sa)
        venue_toss_bat_win.append(v_tbw)
        toss_impact.append(ti)
        
        # Update stats
        team_stats[bf]['matches'] += 1
        team_stats[bs]['matches'] += 1
        team_stats[bf]['runs_scored'] += row['First_Innings_Score']
        team_stats[bs]['runs_scored'] += row['Second_Innings_Score']
        team_stats[bf]['runs_conceded'] += row['Second_Innings_Score']
        team_stats[bs]['runs_conceded'] += row['First_Innings_Score']
        winner = 1 if row['Match_Winner'] == bf else 0
        team_stats[bf]['wins'] += winner
        team_stats[bs]['wins'] += 1 - winner
        team_stats[bf]['recent_wins'] = (team_stats[bf]['recent_wins'] + [winner])[-5:]
        team_stats[bs]['recent_wins'] = (team_stats[bs]['recent_wins'] + [1 - winner])[-5:]
        team_stats[bf]['recent_scores'] = (team_stats[bf]['recent_scores'] + [row['First_Innings_Score']])[-5:]
        team_stats[bs]['recent_scores'] = (team_stats[bs]['recent_scores'] + [row['Second_Innings_Score']])[-5:]
        team_stats[bf]['vs_opponent_matches'][bs] += 1
        team_stats[bs]['vs_opponent_matches'][bf] += 1
        team_stats[bf]['vs_opponent_wins'][bs] += winner
        team_stats[bs]['vs_opponent_wins'][bf] += 1 - winner
        team_stats[bf]['vs_opponent_scores'][bs].append(row['First_Innings_Score'])
        team_stats[bs]['vs_opponent_scores'][bf].append(row['Second_Innings_Score'])
        team_stats[bf]['venue_matches'][venue] += 1
        team_stats[bs]['venue_matches'][venue] += 1
        team_stats[bf]['venue_wins'][venue] += winner
        team_stats[bs]['venue_wins'][venue] += 1 - winner
        venue_stats[venue]['matches'] += 1
        venue_stats[venue]['total_runs'] += row['First_Innings_Score'] + row['Second_Innings_Score']
        venue_stats[venue]['first_innings_avg'] += row['First_Innings_Score']
        venue_stats[venue]['second_innings_avg'] += row['Second_Innings_Score']
        venue_stats[venue]['toss_bat_win_rate'] += winner if row['Toss_Decision'] == 'bat' else 0
    
    # Add features to DataFrame
    df['bf_win_rate'] = bf_win_rate
    df['bs_win_rate'] = bs_win_rate
    df['bf_avg_score'] = bf_avg_score
    df['bs_avg_score'] = bs_avg_score
    df['bf_runs_conceded'] = bf_runs_conceded
    df['bs_runs_conceded'] = bs_runs_conceded
    df['bf_recent_form'] = bf_recent_form
    df['bs_recent_form'] = bs_recent_form
    df['bf_vs_bs_win_rate'] = bf_vs_bs_win_rate
    df['bf_vs_bs_avg_score'] = bf_vs_bs_avg_score
    df['bf_venue_win_rate'] = bf_venue_win_rate
    df['bs_venue_win_rate'] = bs_venue_win_rate
    df['bf_recent_scoring'] = bf_recent_scoring
    df['bs_recent_scoring'] = bs_recent_scoring
    df['venue_first_avg'] = venue_first_avg
    df['venue_second_avg'] = venue_second_avg
    df['venue_toss_bat_win'] = venue_toss_bat_win
    df['toss_impact'] = toss_impact
    
    return df, team_stats, venue_stats

# --- Model Training and Saving ---

def train_and_save_models(df):
    """Train and save machine learning models with hyperparameter tuning."""
    # Encode categorical variables
    team_encoder = LabelEncoder()
    venue_encoder = LabelEncoder()
    df['batting_first_encoded'] = team_encoder.fit_transform(df['batting_first_team'])
    df['batting_second_encoded'] = team_encoder.transform(df['batting_second_team'])
    df['venue_encoded'] = venue_encoder.fit_transform(df['Venue'])
    
    # Features and targets
    features = ['batting_first_encoded', 'batting_second_encoded', 'venue_encoded', 
                'bf_win_rate', 'bs_win_rate', 'bf_avg_score', 'bs_avg_score', 
                'bf_runs_conceded', 'bs_runs_conceded', 'bf_recent_form', 'bs_recent_form', 
                'bf_vs_bs_win_rate', 'bf_vs_bs_avg_score', 'bf_venue_win_rate', 'bs_venue_win_rate', 
                'bf_recent_scoring', 'bs_recent_scoring', 'venue_first_avg', 'venue_second_avg', 
                'venue_toss_bat_win', 'toss_impact']
    X = df[features]
    y_winner = df['batting_first_wins']
    y_first_score = df['First_Innings_Score']
    y_second_score = df['Second_Innings_Score']
    
    # Time-based train-test split
    train_size = int(0.8 * len(df))
    X_train, X_test = X[:train_size], X[train_size:]
    y_winner_train, y_winner_test = y_winner[:train_size], y_winner[train_size:]
    y_first_score_train, y_first_score_test = y_first_score[:train_size], y_first_score[train_size:]
    y_second_score_train, y_second_score_test = y_second_score[:train_size], y_second_score[train_size:]
    
    # Hyperparameter tuning
    clf_params = {'n_estimators': [200, 300], 'max_depth': [3, 5], 'learning_rate': [0.05, 0.1]}
    reg_params = {'n_estimators': [200, 300], 'max_depth': [3, 5], 'learning_rate': [0.05, 0.1]}
    
    # Train classifier
    clf = GridSearchCV(XGBClassifier(random_state=42), clf_params, cv=5, scoring='accuracy')
    clf.fit(X_train, y_winner_train)
    print(f"Best classifier params: {clf.best_params_}")
    
    # Train regressors
    reg_first = GridSearchCV(XGBRegressor(random_state=42), reg_params, cv=5, scoring='neg_mean_squared_error')
    reg_first.fit(X_train, y_first_score_train)
    print(f"Best first innings regressor params: {reg_first.best_params_}")
    
    reg_second = GridSearchCV(XGBRegressor(random_state=42), reg_params, cv=5, scoring='neg_mean_squared_error')
    reg_second.fit(X_train, y_second_score_train)
    print(f"Best second innings regressor params: {reg_second.best_params_}")
    
    # Evaluate models
    y_pred_winner = clf.predict(X_test)
    print(f'Winner prediction accuracy: {accuracy_score(y_winner_test, y_pred_winner):.2f}')
    y_pred_first = reg_first.predict(X_test)
    print(f'First innings score MSE: {mean_squared_error(y_first_score_test, y_pred_first):.2f}')
    y_pred_second = reg_second.predict(X_test)
    print(f'Second innings score MSE: {mean_squared_error(y_second_score_test, y_pred_second):.2f}')
    
    # Print sample predictions
    print("\nSample predictions for last 5 test matches:")
    for i in range(-5, 0):
        actual_first = y_first_score_test.iloc[i]
        pred_first = max(50, min(300, y_pred_first[i]))
        actual_second = y_second_score_test.iloc[i]
        pred_second = max(50, min(300, y_pred_second[i]))
        first_run_rate = pred_first / 20
        second_run_rate = pred_second / 20
        print(f"Match {i}: First innings - Actual: {actual_first:.2f}, Predicted: {pred_first:.2f}, Run Rate: {first_run_rate:.2f}")
        print(f"Match {i}: Second innings - Actual: {actual_second:.2f}, Predicted: {pred_second:.2f}, Run Rate: {second_run_rate:.2f}")
    
    # Save models and encoders
    try:
        joblib.dump(clf.best_estimator_, 'winner_model.joblib')
        joblib.dump(reg_first.best_estimator_, 'first_score_model.joblib')
        joblib.dump(reg_second.best_estimator_, 'second_score_model.joblib')
        joblib.dump(team_encoder, 'team_encoder.joblib')
        joblib.dump(venue_encoder, 'venue_encoder.joblib')
        joblib.dump(team_stats, 'team_stats.joblib')
        joblib.dump(venue_stats, 'venue_stats.joblib')
        print("\nModels and encoders saved successfully as joblib files.")
    except Exception as e:
        print(f"Error saving models: {e}")
        raise
    
    return clf.best_estimator_, reg_first.best_estimator_, reg_second.best_estimator_, team_encoder, venue_encoder, team_stats, venue_stats

# --- Prediction Function ---

def predict_match(team1, team2, venue, match_date, clf, reg_first, reg_second, team_encoder, venue_encoder, team_stats, venue_stats):
    """Predict match outcome based on teams, venue, and date."""
    try:
        match_date = pd.to_datetime(match_date)
        # Assume toss decision is unknown; use team1 bats first scenario
        batting_first, batting_second = team1, team2
        
        # Validate inputs
        if team1 not in team_encoder.classes_ or team2 not in team_encoder.classes_:
            raise ValueError(f"Invalid team: {team1} or {team2}")
        if venue not in venue_encoder.classes_:
            raise ValueError(f"Invalid venue: {venue}")
        
        # Prepare features based on historical data before match_date
        bf_encoded = team_encoder.transform([batting_first])[0]
        bs_encoded = team_encoder.transform([batting_second])[0]
        v_encoded = venue_encoder.transform([venue])[0]
        bf_wr = team_stats[batting_first]['wins'] / team_stats[batting_first]['matches'] if team_stats[batting_first]['matches'] > 0 else 0.5
        bs_wr = team_stats[batting_second]['wins'] / team_stats[batting_second]['matches'] if team_stats[batting_second]['matches'] > 0 else 0.5
        bf_as = team_stats[batting_first]['runs_scored'] / team_stats[batting_first]['matches'] if team_stats[batting_first]['matches'] > 0 else 150
        bs_as = team_stats[batting_second]['runs_scored'] / team_stats[batting_second]['matches'] if team_stats[batting_second]['matches'] > 0 else 150
        bf_rc = team_stats[batting_first]['runs_conceded'] / team_stats[batting_first]['matches'] if team_stats[batting_first]['matches'] > 0 else 150
        bs_rc = team_stats[batting_second]['runs_conceded'] / team_stats[batting_second]['matches'] if team_stats[batting_second]['matches'] > 0 else 150
        bf_rf = sum(team_stats[batting_first]['recent_wins']) / 5 if len(team_stats[batting_first]['recent_wins']) >= 5 else 0.5
        bs_rf = sum(team_stats[batting_second]['recent_wins']) / 5 if len(team_stats[batting_second]['recent_wins']) >= 5 else 0.5
        bf_vs_bs_wr = team_stats[batting_first]['vs_opponent_wins'][batting_second] / team_stats[batting_first]['vs_opponent_matches'][batting_second] if team_stats[batting_first]['vs_opponent_matches'][batting_second] > 0 else 0.5
        bf_vs_bs_as = sum(team_stats[batting_first]['vs_opponent_scores'][batting_second]) / len(team_stats[batting_first]['vs_opponent_scores'][batting_second]) if team_stats[batting_first]['vs_opponent_scores'][batting_second] else 150
        bf_vwr = team_stats[batting_first]['venue_wins'][venue] / team_stats[batting_first]['venue_matches'][venue] if team_stats[batting_first]['venue_matches'][venue] > 0 else 0.5
        bs_vwr = team_stats[batting_second]['venue_wins'][venue] / team_stats[batting_second]['venue_matches'][venue] if team_stats[batting_second]['venue_matches'][venue] > 0 else 0.5
        bf_rs = sum(team_stats[batting_first]['recent_scores']) / 5 if len(team_stats[batting_first]['recent_scores']) >= 5 else 150
        bs_rs = sum(team_stats[batting_second]['recent_scores']) / 5 if len(team_stats[batting_second]['recent_scores']) >= 5 else 150
        v_fa = venue_stats[venue]['first_innings_avg'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 150
        v_sa = venue_stats[venue]['second_innings_avg'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 140
        v_tbw = venue_stats[venue]['toss_bat_win_rate'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 0.5
        ti = 0.5  # Neutral toss impact since toss unknown
        
        features = np.array([[bf_encoded, bs_encoded, v_encoded, bf_wr, bs_wr, bf_as, bs_as, bf_rc, bs_rc, 
                             bf_rf, bs_rf, bf_vs_bs_wr, bf_vs_bs_as, bf_vwr, bs_vwr, bf_rs, bs_rs, 
                             v_fa, v_sa, v_tbw, ti]])
        
        # Predict
        winner_prob = clf.predict_proba(features)[0][1]  # Probability batting_first wins
        first_score = max(50, min(300, reg_first.predict(features)[0]))
        second_score = max(50, min(300, reg_second.predict(features)[0]))
        winner = batting_first if winner_prob > 0.5 else batting_second
        
        # Run rates
        first_run_rate = first_score / 20
        second_run_rate = second_score / 20
        
        # Historical performance
        h2h_matches = team_stats[batting_first]['vs_opponent_matches'][batting_second]
        h2h_wins = team_stats[batting_first]['vs_opponent_wins'][batting_second]
        h2h_win_rate = h2h_wins / h2h_matches if h2h_matches > 0 else 0.5
        h2h_avg_score = sum(team_stats[batting_first]['vs_opponent_scores'][batting_second]) / len(team_stats[batting_first]['vs_opponent_scores'][batting_second]) if team_stats[batting_first]['vs_opponent_scores'][batting_second] else 150
        
        result = {
            'batting_first': batting_first,
            'batting_second': batting_second,
            'venue': venue,
            'date': match_date.strftime('%Y-%m-%d'),
            'first_innings_score': round(float(first_score), 2),
            'second_innings_score': round(float(second_score), 2),
            'first_innings_run_rate': round(first_run_rate, 2),
            'second_innings_run_rate': round(second_run_rate, 2),
            'winner': winner,
            'winner_probability': round(float(winner_prob), 2),
            'head_to_head': {
                'matches': h2h_matches,
                'team1_wins': h2h_wins,
                'team1_win_rate': round(h2h_win_rate, 2),
                'team1_avg_score_vs_team2': round(h2h_avg_score, 2)
            }
        }
        
        return result
    except Exception as e:
        print(f"Error predicting match: {e}")
        return None

# --- Main Execution ---

if __name__ == '__main__':
    print(f"Starting IPL prediction training at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    df = load_and_preprocess_data()
    df, team_stats, venue_stats = feature_engineering(df)
    clf, reg_first, reg_second, team_encoder, venue_encoder, team_stats, venue_stats = train_and_save_models(df)
    
    # Example prediction
    print("\nExample prediction:")
    result = predict_match(
        team1="Royal Challengers Bengaluru",
        team2="Kings XI Punjab",
        venue="Narendra Modi Stadium, Ahmedabad",
        match_date="2025-06-03",
        clf=clf,
        reg_first=reg_first,
        reg_second=reg_second,
        team_encoder=team_encoder,
        venue_encoder=venue_encoder,
        team_stats=team_stats,
        venue_stats=venue_stats
    )
    if result:
        print(f"Match: {result['batting_first']} vs {result['batting_second']} at {result['venue']} on {result['date']}")
        print(f"Predicted first innings score: {result['first_innings_score']} (Run Rate: {result['first_innings_run_rate']})")
        print(f"Predicted second innings score: {result['second_innings_score']} (Run Rate: {result['second_innings_run_rate']})")
        print(f"Predicted winner: {result['winner']} (Probability: {result['winner_probability']})")
        print(f"Head-to-head: {result['head_to_head']['matches']} matches, {result['batting_first']} won {result['head_to_head']['team1_wins']} (Win Rate: {result['head_to_head']['team1_win_rate']}), Avg Score: {result['head_to_head']['team1_avg_score_vs_team2']}")
    
    print("Training and prediction complete.")

Starting IPL prediction training at 2025-06-03 09:18:47
Data loaded: 1068 matches after cleaning
First innings score range: 67.0 to 287.0
Second innings score range: 55.0 to 262.0
Mean first innings score: 165.35, Std: 31.60
Mean second innings score: 152.41, Std: 30.78
Best classifier params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Best first innings regressor params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Best second innings regressor params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
Winner prediction accuracy: 0.51
First innings score MSE: 1601.44
Second innings score MSE: 1329.83

Sample predictions for last 5 test matches:
Match -5: First innings - Actual: 200.00, Predicted: 138.68, Run Rate: 6.93
Match -5: Second innings - Actual: 206.00, Predicted: 150.45, Run Rate: 7.52
Match -4: First innings - Actual: 212.00, Predicted: 158.29, Run Rate: 7.91
Match -4: Second innings - Actual: 134.00, Predicted: 147.73, Run Rate: 7

In [2]:
from flask import Flask, request, render_template, jsonify
import numpy as np
import pandas as pd
import joblib
from datetime import datetime

app = Flask(__name__)

# Load saved models and encoders
try:
    clf = joblib.load('winner_model.joblib')
    reg_first = joblib.load('first_score_model.joblib')
    reg_second = joblib.load('second_score_model.joblib')
    team_encoder = joblib.load('team_encoder.joblib')
    venue_encoder = joblib.load('venue_encoder.joblib')
    team_stats = joblib.load('team_stats.joblib')
    venue_stats = joblib.load('venue_stats.joblib')
    print("Models and encoders loaded successfully.")
except Exception as e:
    print(f"Error loading models: {e}")
    raise

# Prediction function
def predict_match(team1, team2, venue, match_date):
    """Predict match outcome based on teams, venue, and date."""
    try:
        match_date = pd.to_datetime(match_date)
        batting_first, batting_second = team1, team2
        
        # Validate inputs
        if team1 not in team_encoder.classes_ or team2 not in team_encoder.classes_:
            raise ValueError(f"Invalid team: {team1} or {team2}")
        if venue not in venue_encoder.classes_:
            raise ValueError(f"Invalid venue: {venue}")
        
        # Prepare features
        bf_encoded = team_encoder.transform([batting_first])[0]
        bs_encoded = team_encoder.transform([batting_second])[0]
        v_encoded = venue_encoder.transform([venue])[0]
        bf_wr = team_stats[batting_first]['wins'] / team_stats[batting_first]['matches'] if team_stats[batting_first]['matches'] > 0 else 0.5
        bs_wr = team_stats[batting_second]['wins'] / team_stats[batting_second]['matches'] if team_stats[batting_second]['matches'] > 0 else 0.5
        bf_as = team_stats[batting_first]['runs_scored'] / team_stats[batting_first]['matches'] if team_stats[batting_first]['matches'] > 0 else 150
        bs_as = team_stats[batting_second]['runs_scored'] / team_stats[batting_second]['matches'] if team_stats[batting_second]['matches'] > 0 else 150
        bf_rc = team_stats[batting_first]['runs_conceded'] / team_stats[batting_first]['matches'] if team_stats[batting_first]['matches'] > 0 else 150
        bs_rc = team_stats[batting_second]['runs_conceded'] / team_stats[batting_second]['matches'] if team_stats[batting_second]['matches'] > 0 else 150
        bf_rf = sum(team_stats[batting_first]['recent_wins']) / 5 if len(team_stats[batting_first]['recent_wins']) >= 5 else 0.5
        bs_rf = sum(team_stats[batting_second]['recent_wins']) / 5 if len(team_stats[batting_second]['recent_wins']) >= 5 else 0.5
        bf_vs_bs_wr = team_stats[batting_first]['vs_opponent_wins'][batting_second] / team_stats[batting_first]['vs_opponent_matches'][batting_second] if team_stats[batting_first]['vs_opponent_matches'][batting_second] > 0 else 0.5
        bf_vs_bs_as = sum(team_stats[batting_first]['vs_opponent_scores'][batting_second]) / len(team_stats[batting_first]['vs_opponent_scores'][batting_second]) if team_stats[batting_first]['vs_opponent_scores'][batting_second] else 150
        bf_vwr = team_stats[batting_first]['venue_wins'][venue] / team_stats[batting_first]['venue_matches'][venue] if team_stats[batting_first]['venue_matches'][venue] > 0 else 0.5
        bs_vwr = team_stats[batting_second]['venue_wins'][venue] / team_stats[batting_second]['venue_matches'][venue] if team_stats[batting_second]['venue_matches'][venue] > 0 else 0.5
        bf_rs = sum(team_stats[batting_first]['recent_scores']) / 5 if len(team_stats[batting_first]['recent_scores']) >= 5 else 150
        bs_rs = sum(team_stats[batting_second]['recent_scores']) / 5 if len(team_stats[batting_second]['recent_scores']) >= 5 else 150
        v_fa = venue_stats[venue]['first_innings_avg'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 150
        v_sa = venue_stats[venue]['second_innings_avg'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 140
        v_tbw = venue_stats[venue]['toss_bat_win_rate'] / venue_stats[venue]['matches'] if venue_stats[venue]['matches'] > 0 else 0.5
        ti = 0.5  # Neutral toss impact
        
        features = np.array([[bf_encoded, bs_encoded, v_encoded, bf_wr, bs_wr, bf_as, bs_as, bf_rc, bs_rc, 
                             bf_rf, bs_rf, bf_vs_bs_wr, bf_vs_bs_as, bf_vwr, bs_vwr, bf_rs, bs_rs, 
                             v_fa, v_sa, v_tbw, ti]])
        
        # Predict
        winner_prob = clf.predict_proba(features)[0][1]
        first_score = max(50, min(300, reg_first.predict(features)[0]))
        second_score = max(50, min(300, reg_second.predict(features)[0]))
        winner = batting_first if winner_prob > 0.5 else batting_second
        
        # Run rates
        first_run_rate = first_score / 20
        second_run_rate = second_score / 20
        
        # Head-to-head stats
        h2h_matches = team_stats[batting_first]['vs_opponent_matches'][batting_second]
        h2h_wins = team_stats[batting_first]['vs_opponent_wins'][batting_second]
        h2h_win_rate = h2h_wins / h2h_matches if h2h_matches > 0 else 0.5
        h2h_avg_score = sum(team_stats[batting_first]['vs_opponent_scores'][batting_second]) / len(team_stats[batting_first]['vs_opponent_scores'][batting_second]) if team_stats[batting_first]['vs_opponent_scores'][batting_second] else 150
        
        return {
            'batting_first': batting_first,
            'batting_second': batting_second,
            'venue': venue,
            'date': match_date.strftime('%Y-%m-%d'),
            'first_innings_score': round(float(first_score), 2),
            'second_innings_score': round(float(second_score), 2),
            'first_innings_run_rate': round(first_run_rate, 2),
            'second_innings_run_rate': round(second_run_rate, 2),
            'winner': winner,
            'winner_probability': round(float(winner_prob), 2),
            'head_to_head': {
                'matches': h2h_matches,
                'team1_wins': h2h_wins,
                'team1_win_rate': round(h2h_win_rate, 2),
                'team1_avg_score_vs_team2': round(h2h_avg_score, 2)
            },
            'status': 'success'
        }
    except Exception as e:
        return {'status': 'error', 'message': str(e)}

@app.route('/')
def index():
    """Render the main page."""
    teams = team_encoder.classes_.tolist()
    venues = venue_encoder.classes_.tolist()
    return render_template('index.html', teams=teams, venues=venues)

@app.route('/predict', methods=['POST'])
def predict():
    """Predict match outcome."""
    data = request.form
    team1 = data['team1']
    team2 = data['team2']
    venue = data['venue']
    match_date = data['date']
    
    result = predict_match(team1, team2, venue, match_date)
    return jsonify(result)

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)

Models and encoders loaded successfully.
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.0.2.220:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/workspaces/codespaces-blank/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/workspaces/codespaces-blank/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/workspaces/codespaces-blank/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/codespaces-blank/.venv/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 692, in initialize
    self.init_sockets()
  File "/workspaces/codespaces-blank/

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
