## this is where we will train and develop the models for forecasting ROI

In [18]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns  
import numpy as np

In [19]:
df = pd.read_csv('./data/df_encoded_for_training.csv') 
df.head() 

Unnamed: 0,Conversion_Rate,Acquisition_Cost,ROI,Clicks,Impressions,Engagement_Score,Campaign_Type_Display,Campaign_Type_Email,Campaign_Type_Influencer,Campaign_Type_Search,...,Customer_Segment_Fashionistas,Customer_Segment_Foodies,Customer_Segment_Health & Wellness,Customer_Segment_Outdoor Adventurers,Customer_Segment_Tech Enthusiasts,Season_Fall,Season_Spring,Season_Summer,Season_Winter,Duration_encoded
0,0.214286,0.744933,0.715,0.451111,0.102444,0.555556,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0.333333
1,0.785714,0.437733,0.601667,0.017778,0.724778,0.666667,0,1,0,0,...,1,0,0,0,0,0,0,0,1,1.0
2,0.428571,0.346667,0.863333,0.537778,0.744222,0.0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0.333333
3,0.714286,0.514933,0.591667,0.13,0.091111,0.666667,1,0,0,0,...,0,0,1,0,0,0,0,0,1,1.0
4,0.285714,0.763467,0.75,0.31,0.355667,0.222222,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0.0


In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time   
from time import perf_counter

In [21]:
X, y = df.drop('ROI', axis=1), df['ROI']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
) 

models = {
    "Linear Regression":  LinearRegression(),
    "Ridge":              Ridge(alpha=1.0),          # α can be tuned
    "Lasso":              Lasso(alpha=0.01),         # α can be tuned
    "ElasticNet":         ElasticNet(alpha=0.01, l1_ratio=0.5),
    "Random Forest":      RandomForestRegressor(random_state=42),
    "Decision Tree":      DecisionTreeRegressor(random_state=42),
    # "SVR":               SVR(),                    # optional—slow
    "KNN":                KNeighborsRegressor(n_neighbors=5)
}

# ------------------------------------------------------------
# Fit, time, predict, score
# ------------------------------------------------------------
results = {}

for name, model in models.items():
    start = time.perf_counter()
    model.fit(X_train, y_train)
    fit_time = time.perf_counter() - start
    
    preds     = model.predict(X_test)
    mae       = mean_absolute_error(y_test, preds)
    r2        = r2_score(y_test, preds)
    
    results[name] = {
        "fit_time_s": round(fit_time, 2),
        "mae":        round(mae, 4),
        "r2":         round(r2, 4)
    }

# ------------------------------------------------------------
# Display summary
# ------------------------------------------------------------
print("\n=== Model Performance Summary ===")
for name, res in results.items():
    print(f"{name:17s} | fit: {res['fit_time_s']:>6}s | MAE: {res['mae']:.4f} | R²: {res['r2']:.4f}")

KeyboardInterrupt: 

In [5]:
# Prepare the data
X, y = df.drop('ROI', axis=1), df['ROI']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
) 

# Initialize regression models
linear_reg  = LinearRegression()
rand_forest = RandomForestRegressor(random_state=42)
dec_tree    = DecisionTreeRegressor(random_state=42)
# svm         = SVR()
knn         = KNeighborsRegressor(n_neighbors=5)

# Train each model with timing
start = time.perf_counter()
linear_reg.fit(X_train, y_train)
print(f"LinearRegression fit time: {time.perf_counter() - start:.2f} s")

start = time.perf_counter()
rand_forest.fit(X_train, y_train)
print(f"RandomForestRegressor fit time: {time.perf_counter() - start:.2f} s")

start = time.perf_counter()
dec_tree.fit(X_train, y_train)
print(f"DecisionTreeRegressor fit time: {time.perf_counter() - start:.2f} s")

# start = time.perf_counter()
# svm.fit(X_train, y_train)
# print(f"SVR fit time: {time.perf_counter() - start:.2f} s")

start = time.perf_counter()
knn.fit(X_train, y_train)
print(f"KNeighborsRegressor fit time: {time.perf_counter() - start:.2f} s\n")

# Make predictions
linear_reg_preds  = linear_reg.predict(X_test)
rand_forest_preds = rand_forest.predict(X_test)
dec_tree_preds    = dec_tree.predict(X_test)
#svm_preds         = svm.predict(X_test)
knn_preds         = knn.predict(X_test)

# Calculate regression metrics
linear_reg_mae  = mean_absolute_error(y_test, linear_reg_preds)
rand_forest_mae = mean_absolute_error(y_test, rand_forest_preds)
dec_tree_mae    = mean_absolute_error(y_test, dec_tree_preds)
#svm_mae         = mean_absolute_error(y_test, svm_preds)
knn_mae         = mean_absolute_error(y_test, knn_preds)

linear_reg_r2  = r2_score(y_test, linear_reg_preds)
rand_forest_r2 = r2_score(y_test, rand_forest_preds)
dec_tree_r2    = r2_score(y_test, dec_tree_preds)
#svm_r2         = r2_score(y_test, svm_preds)
knn_r2         = r2_score(y_test, knn_preds)

# Print results
print(f"Linear Regression MAE: {linear_reg_mae:.4f}, R²: {linear_reg_r2:.4f}")
print(f"Random Forest MAE: {rand_forest_mae:.4f}, R²: {rand_forest_r2:.4f}")
print(f"Decision Tree MAE: {dec_tree_mae:.4f}, R²: {dec_tree_r2:.4f}")
#print(f"SVM MAE: {svm_mae:.4f}, R²: {svm_r2:.4f}")
print(f"KNN MAE: {knn_mae:.4f}, R²: {knn_r2:.4f}") 

LinearRegression fit time: 0.13 s
RandomForestRegressor fit time: 135.04 s
DecisionTreeRegressor fit time: 2.31 s
KNeighborsRegressor fit time: 0.04 s

Linear Regression MAE: 0.2505, R²: -0.0003
Random Forest MAE: 0.2520, R²: -0.0189
Decision Tree MAE: 0.3393, R²: -1.0622
KNN MAE: 0.2669, R²: -0.2002


## now lets do some tuning with gridsearchCV

In [13]:
param_grids = {
    'LinearRegression': {
        'fit_intercept': [True, False],
        # 'positive' is available in sklearn ≥0.24; uncomment if you like
        # 'positive': [True, False]
    },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'n_jobs': [-1]          # use all CPU cores
    },
    'DecisionTree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }
}

# Map names → estimator instances
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest'    : RandomForestRegressor(random_state=42),
    'DecisionTree'    : DecisionTreeRegressor(random_state=42),
    'KNN'             : KNeighborsRegressor()
}

# ------------------------------------------------------------
# 3. GridSearchCV loop
# ------------------------------------------------------------
search_results = {}

for name, model in models.items():
    print(f"🔍  Tuning {name} …")
    
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='neg_mean_absolute_error',   # MAE (lower is better)
        cv=5,
        n_jobs=-1,
        verbose=0
    )
    
    t0 = perf_counter()
    grid.fit(X_train, y_train)
    fit_time = perf_counter() - t0
    
    best_model = grid.best_estimator_
    cv_mae = -grid.best_score_              # convert back to positive MAE
    
    # Evaluate on hold-out set
    y_pred = best_model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_pred)
    test_r2  = r2_score(y_test, y_pred)
    
    search_results[name] = {
        'best_params': grid.best_params_,
        'fit_time_s' : round(fit_time, 2),
        'cv_mae'     : round(cv_mae , 4),
        'test_mae'   : round(test_mae, 4),
        'test_r2'    : round(test_r2 , 4)
    }

# ------------------------------------------------------------
# 4. Display summary (sorted by lowest test MAE)
# ------------------------------------------------------------
print("\n=== Grid-search summary (sorted by lowest test MAE) ===")
for name, res in sorted(search_results.items(),
                        key=lambda x: x[1]['test_mae']):
    print(f"\n{name}")
    print(f"  • best params : {res['best_params']}")
    print(f"  • fit time    : {res['fit_time_s']} s")
    print(f"  • CV MAE      : {res['cv_mae']}")
    print(f"  • Test MAE    : {res['test_mae']}")
    print(f"  • Test R²     : {res['test_r2']}")

🔍  Tuning LinearRegression …
🔍  Tuning RandomForest …




KeyboardInterrupt: 