In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

from mlflow.models.signature import infer_signature

In [9]:
# Name of the MLflow experiment under which the runs in this notebook are grouped.
EXPERIMENT_NAME = "Fantasy Cricket Prediction"

# Make it the active experiment; as the last expression of the cell, the
# returned Experiment object is displayed as the cell output.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///c:/Users/Admin/Desktop/dream11ml/notebooks/mlruns/193820164966890080', creation_time=1758525537582, experiment_id='193820164966890080', last_update_time=1758525537582, lifecycle_stage='active', name='Fantasy Cricket Prediction', tags={}>

In [10]:
# Load the processed dataset into `df`.
# NOTE(review): this is a machine-specific absolute path; a path relative to the
# project root would let the notebook run on other machines — TODO confirm layout.
PROCESSED_DATA_PATH = 'C:/Users/Admin/Desktop/dream11ml/data/processed/processed_data.csv'

print("Loading processed data...")
try:
    df = pd.read_csv(PROCESSED_DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, whereas an exception stops this cell (and a "Run All")
    # with a traceback while keeping the kernel and its state alive.
    raise FileNotFoundError(
        f"Processed data file not found at {PROCESSED_DATA_PATH!r}. "
        "Please run the data pipeline first."
    ) from err
else:
    print("Data loaded successfully.")


Loading processed data...
Data loaded successfully.


In [11]:
# Build the feature matrix X and the target vector y from the loaded frame.
# The target and the columns in `features_to_drop` are removed from X;
# errors='ignore' lets the drop proceed when some of them are absent.
# NOTE(review): the printed feature list contains one-hot 'description_...'
# columns that look specific to individual matches — confirm they are intended
# as model inputs.
features_to_drop = ['Unnamed: 0', 'season', 'home_team_points', 'away_team_points']
X = df.drop(columns=['fantasy_points', *features_to_drop], errors='ignore')
y = df['fantasy_points']

# Show the resulting feature columns as a quick sanity check
print("\nFeatures selected for training:", X.columns, sep="\n")



Features selected for training:
Index(['name', 'decision', '1st_inning_score', '2nd_inning_score', 'venue_id',
       'home_overs', 'home_runs', 'home_wickets', 'home_boundaries',
       'away_overs',
       ...
       'description_9th match (D/N), Indian Premier League at Mohali, Apr 13 2011',
       'description_9th match (D/N), Indian Premier League at Mumbai, Apr 14 2018',
       'description_9th match (N), Indian Premier League at Delhi, Mar 17 2010',
       'description_9th match (N), Indian Premier League at Hyderabad (Deccan), Apr 24 2008',
       'description_9th match (N), Indian Premier League at Mumbai, Apr 16 2016',
       'description_9th match (N), Indian Premier League at Pune, Apr 11 2017',
       'description_9th match (N), Indian Premier League at Visakhapatnam, Apr 9 2012',
       'description_9th match (N), Pepsi Indian Premier League at Ahmedabad, Apr 14 2015',
       'description_9th match (N), Pepsi Indian Premier League at Sharjah, Apr 22 2014',
       'descri

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numerical (not one-hot encoded) features to standardise; columns outside
# this list are left untouched by the scaler.
candidate_numeric_cols = (
    'home_overs', 'home_runs', 'home_wickets', 'home_boundaries',
    'away_overs', 'away_runs', 'away_wickets', 'away_boundaries',
    'score_ratio', 'home_advantage',
)

# Keep only the candidates that are actually present in the feature matrix
numerical_cols = [name for name in candidate_numeric_cols if name in X_train.columns]

# IMPORTANT: Convert integer columns to floats to avoid schema enforcement warnings
for split in (X_train, X_test):
    for col in numerical_cols:
        if split[col].dtype == 'int64':
            split[col] = split[col].astype('float64')

# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [22]:
print("\nStarting MLflow experiment for Linear Regression...")
with mlflow.start_run(run_name="Linear_Regression_Model") as lr_run:
    # Record which estimator this run belongs to
    mlflow.log_param("model_name", "Linear Regression")

    # Fit on the training split (fit() returns the estimator itself)
    lr_model = LinearRegression().fit(X_train, y_train)

    # Score the fitted model on the held-out split
    y_pred = lr_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Log both evaluation metrics to the active run in one call
    mlflow.log_metrics({"rmse": rmse, "r2_score": r2})

    print(f"Linear Regression Metrics - RMSE: {rmse:.4f}, R-squared: {r2:.4f}")

    # Derive the input/output schema from the training data and the
    # model's predictions on it
    signature = infer_signature(X_train, lr_model.predict(X_train))

    # Store the fitted model as a run artifact, together with its signature
    # and the first two training rows as an input example
    mlflow.sklearn.log_model(
        lr_model,
        "linear-regression-model",
        signature=signature,
        input_example=X_train.head(2),
    )
    print("Linear Regression model logged to MLflow.")



Starting MLflow experiment for Linear Regression...
Linear Regression Metrics - RMSE: 6.1113, R-squared: 0.9958


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 239.37it/s] 


Linear Regression model logged to MLflow.


In [23]:
print("\nStarting MLflow experiment for Random Forest Regressor...")
with mlflow.start_run(run_name="Random_Forest_Regressor_Model") as rf_run:
    # Log parameters. "model_name" is logged as well so that this run carries
    # the same identifying param as the Linear Regression run and the two can
    # be filtered/compared on it.
    mlflow.log_param("model_name", "Random Forest Regressor")
    rf_params = {"n_estimators": 100, "max_depth": 10, "random_state": 42}
    mlflow.log_params(rf_params)

    # Train the model with exactly the hyperparameters that were logged
    rf_model = RandomForestRegressor(**rf_params)
    rf_model.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = rf_model.predict(X_test)

    # Evaluate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)

    print(f"Random Forest Regressor Metrics - RMSE: {rmse:.4f}, R-squared: {r2:.4f}")

    # Infer the model signature from the training data and the model's
    # predictions on it
    signature = infer_signature(X_train, rf_model.predict(X_train))

    # Log the model artifact with a signature and the first two training rows
    # as an input example
    mlflow.sklearn.log_model(rf_model, "random-forest-model", signature=signature, input_example=X_train.iloc[:2])
    print("Random Forest Regressor model logged to MLflow.")

print("\nModel training and experiment tracking complete.")



Starting MLflow experiment for Random Forest Regressor...
Random Forest Regressor Metrics - RMSE: 29.3089, R-squared: 0.9039


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 225.10it/s]


Random Forest Regressor model logged to MLflow.

Model training and experiment tracking complete.
