In [10]:
import pandas as pd
import numpy as np
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.metrics import f1_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn import clone
from sklearn.utils.parallel import Parallel, delayed
from sklearn.model_selection import RepeatedStratifiedKFold
import warnings
import time
warnings.filterwarnings('ignore')

# Filter the specific warning about collinearity
warnings.filterwarnings('ignore', message='Variables are collinear')

# Optionally, if you want to filter other common warnings:
warnings.filterwarnings('ignore', category=DataConversionWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [11]:
def load_and_preprocess_data(file_path='dataset.csv', encoding_method='onehot', test_size=0.2, random_state=42):
    """Load and preprocess data with proper train/test splitting"""
    # Load data
    df = pd.read_csv(file_path)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    # Basic preprocessing
    df = df.drop('Id', axis=1)
    df['Vegetation_Type'] = df['Vegetation_Type'].str.replace('Type_', '').astype(int)
    
    # Split features and target
    y = df['Vegetation_Type']
    X = df.drop('Vegetation_Type', axis=1)
    
    # First split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    
    # Identify feature types
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns
    
    if encoding_method == 'onehot':
        preprocessor = ColumnTransformer([
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore', 
                                drop='first'), categorical_features)
        ])
        
        # Fit preprocessor on training data only
        X_train_processed = preprocessor.fit_transform(X_train)
        X_test_processed = preprocessor.transform(X_test)
        
        numeric_names = numeric_features.tolist()
        categorical_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
        feature_names = numeric_names + categorical_names.tolist()
        
    elif encoding_method == 'label':
        label_encoders = {}
        X_train_transformed = X_train.copy()
        X_test_transformed = X_test.copy()
        
        for cat_feature in categorical_features:
            label_encoders[cat_feature] = LabelEncoder()
            X_train_transformed[cat_feature] = label_encoders[cat_feature].fit_transform(X_train[cat_feature])
            X_test_transformed[cat_feature] = label_encoders[cat_feature].transform(X_test[cat_feature])
        
        preprocessor = ColumnTransformer([
            ('all', StandardScaler(), X_train_transformed.columns)
        ])
        
        X_train_processed = preprocessor.fit_transform(X_train_transformed)
        X_test_processed = preprocessor.transform(X_test_transformed)
        feature_names = X_train_transformed.columns
    
    else:
        raise ValueError("encoding_method must be either 'onehot' or 'label'")
    
    X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names)
    X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names)
    
    print(f"Data processed using {encoding_method} encoding")
    print(f"Training set shape: {X_train_processed.shape}")
    print(f"Test set shape: {X_test_processed.shape}")
    print(f"Number of classes: {len(np.unique(y))}")
    
    return X_train_processed, X_test_processed, y_train, y_test, preprocessor

In [12]:
def get_base_model(model_name, random_state=42):
    """Get base model without regularization"""
    models = {
        'logistic': LogisticRegression(max_iter=20000, random_state=random_state),
        'lda': LDA(),
        'qda': QDA()
    }
    
    if model_name not in models:
        raise ValueError(f"Invalid model name. Choose from: {list(models.keys())}")
    
    return models[model_name]

In [13]:
def get_model_param_grid(model_name, reg_type=None):
    """Get parameter grid for GridSearchCV"""
    if reg_type not in [None, 'ridge', 'lasso']:
        raise ValueError("reg_type must be None, 'ridge' or 'lasso'")
    
    if model_name == 'logistic':
        if reg_type == 'ridge':
            return {
                'C': np.logspace(-4, 4, 20),
                'penalty': ['l2'],
                'solver': ['lbfgs']
            }
        elif reg_type == 'lasso':
            return {
                'C': np.logspace(-4, 4, 20),
                'penalty': ['l1'],
                'solver': ['liblinear']
            }
        else:  # base model
            return {'C': [1.0]}
            
    elif model_name == 'lda':
        if reg_type in ['ridge', 'lasso']:
            return {
                'shrinkage': np.linspace(0, 1, 20),
                'solver': ['lsqr']
            }
        else:
            return {'solver': ['svd']}
            
    elif model_name == 'qda':
        if reg_type in ['ridge', 'lasso']:
            return {'reg_param': np.linspace(0, 1, 20)}
        else:
            return {'reg_param': [0.0]}
    
    return {}

In [14]:
class ModelEvaluator:
    def __init__(self, X_train, X_test, y_train, y_test, model, random_state=42):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = model
        self.random_state = random_state
    
    def calculate_f1(self, y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')
    
    def holdout(self):
        self.model.fit(self.X_train, self.y_train)
        y_pred = self.model.predict(self.X_test)
        
        return {
            'f1_score': self.calculate_f1(self.y_test, y_pred),
            'classification_report': classification_report(self.y_test, y_pred)
        }
    
    def cross_validation(self, k):
        scores = cross_val_score(self.model, self.X_train, self.y_train, 
                               cv=k, scoring='f1_macro')
        return {
            'mean_f1': np.mean(scores),
            'std_f1': np.std(scores),
            'all_scores': scores,
            'k_folds': k
        }
    
    def loocv(self):
        loo = LeaveOneOut()
        scores = cross_val_score(self.model, self.X_train, self.y_train, 
                               cv=loo, scoring='f1_macro', n_jobs=-1)
        return {
            'mean_f1': np.mean(scores),
            'std_f1': np.std(scores)
        }
    
    def bootstrap(self, n_iterations=100, sample_size=0.8):
        def single_bootstrap():
            X_boot, y_boot = resample(self.X_train, self.y_train, 
                                    n_samples=int(len(self.X_train) * sample_size))
            X_oob = self.X_train.loc[~self.X_train.index.isin(X_boot.index)]
            y_oob = self.y_train.loc[~self.y_train.index.isin(y_boot.index)]
            
            model = clone(self.model)
            model.fit(X_boot, y_boot)
            y_pred = model.predict(X_oob)
            return self.calculate_f1(y_oob, y_pred)
        
        scores = Parallel(n_jobs=-1)(
            delayed(single_bootstrap)() 
            for _ in range(n_iterations)
        )
        
        scores = np.array(scores)
        return {
            'mean_f1': np.mean(scores),
            'std_f1': np.std(scores),
            'confidence_interval': (
                np.percentile(scores, 2.5),
                np.percentile(scores, 97.5)
            ),
            'n_iterations': n_iterations
        }

In [15]:
def evaluate_all_methods(model, X_train, X_test, y_train, y_test):
    evaluator = ModelEvaluator(X_train, X_test, y_train, y_test, model)
    times = {}
    results = {}
    
    for method in ['holdout', 'cv_5', 'cv_10', 'loocv', 'bootstrap']:
        start_time = time.time()
        if method == 'holdout':
            results[method] = evaluator.holdout()
        elif method == 'cv_5':
            results[method] = evaluator.cross_validation(k=5)
        elif method == 'cv_10':
            results[method] = evaluator.cross_validation(k=10)
        elif method == 'loocv':
            results[method] = evaluator.loocv()
        else:  # bootstrap
            results[method] = evaluator.bootstrap()
        times[method] = time.time() - start_time
        print(f"{method} took {times[method]:.2f} seconds")
    
    return results

In [16]:
def format_results(results, model_name):
    data = {
        'Method': [
            'Holdout',
            'CV (k=5)',
            'CV (k=10)',
            'LOOCV',
            'Bootstrap'
        ],
        'F1-Score': [
            results['holdout']['f1_score'],
            results['cv_5']['mean_f1'],
            results['cv_10']['mean_f1'],
            results['loocv']['mean_f1'],
            results['bootstrap']['mean_f1']
        ],
        'Std Dev': [
            None,
            results['cv_5']['std_f1'],
            results['cv_10']['std_f1'],
            results['loocv']['std_f1'],
            results['bootstrap']['std_f1']
        ]
    }
    
    df = pd.DataFrame(data)
    df = df.round(4)
    print(f"\nResults for {model_name}:")
    return df

In [17]:
def run_model_evaluation(model_name, X_train, X_test, y_train, y_test):
    """Run complete evaluation for a given model with proper GridSearchCV"""
    results = {}
    
    # Step 1: Evaluate base model
    print(f"\nEvaluating Base {model_name.upper()}:")
    print("-" * 50)
    
    base_model = get_base_model(model_name)
    param_grid = get_model_param_grid(model_name)
    
    # Use GridSearchCV even for base model to maintain consistency
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42),
        scoring='f1_macro',
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    best_base_model = grid_search.best_estimator_
    
    base_results = evaluate_all_methods(best_base_model, X_train, X_test, y_train, y_test)
    results['base'] = base_results
    print(format_results(base_results, f"Base {model_name.upper()}"))
    print("\nClassification Report (Holdout Method):")
    print(base_results['holdout']['classification_report'])
    
    # Step 2: Evaluate with Ridge regularization
    print(f"\nEvaluating {model_name.upper()} with Ridge Regularization:")
    print("-" * 50)
    
    param_grid = get_model_param_grid(model_name, reg_type='ridge')
    base_model = get_base_model(model_name)
    
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42),
        scoring='f1_macro',
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    best_ridge_model = grid_search.best_estimator_
    print(f"Best Ridge parameters: {grid_search.best_params_}")
    
    ridge_results = evaluate_all_methods(best_ridge_model, X_train, X_test, y_train, y_test)
    results['ridge'] = ridge_results
    print(format_results(ridge_results, f"Ridge {model_name.upper()}"))
    print("\nClassification Report (Holdout Method):")
    print(ridge_results['holdout']['classification_report'])
    
    # Step 3: Evaluate with Lasso regularization
    print(f"\nEvaluating {model_name.upper()} with Lasso Regularization:")
    print("-" * 50)
    
    param_grid = get_model_param_grid(model_name, reg_type='lasso')
    base_model = get_base_model(model_name)
    
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42),
        scoring='f1_macro',
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    best_lasso_model = grid_search.best_estimator_
    print(f"Best Lasso parameters: {grid_search.best_params_}")
    
    lasso_results = evaluate_all_methods(best_lasso_model, X_train, X_test, y_train, y_test)
    results['lasso'] = lasso_results
    print(format_results(lasso_results, f"Lasso {model_name.upper()}"))
    print("\nClassification Report (Holdout Method):")
    print(lasso_results['holdout']['classification_report'])
    
    # Step 4: Compare Results
    print("\nComparison Summary:")
    print("-" * 50)
    
    comparison_data = [{
        'Model': model_name.upper(),
        'Base F1': results['base']['cv_5']['mean_f1'],
        'Ridge F1': results['ridge']['cv_5']['mean_f1'],
        'Lasso F1': results['lasso']['cv_5']['mean_f1'],
        'Ridge Improvement (%)': ((results['ridge']['cv_5']['mean_f1'] - 
                                 results['base']['cv_5']['mean_f1']) / 
                                results['base']['cv_5']['mean_f1'] * 100),
        'Lasso Improvement (%)': ((results['lasso']['cv_5']['mean_f1'] - 
                                 results['base']['cv_5']['mean_f1']) / 
                                results['base']['cv_5']['mean_f1'] * 100)
    }]
    
    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.round(4)
    print("\nOverall Comparison:")
    print(comparison_df)
    
    return results, comparison_df


In [18]:
def main():
    # Load and preprocess data
    X_train, X_test, y_train, y_test, preprocessor = load_and_preprocess_data(
        encoding_method='onehot'
    )
    
    # List of models to evaluate
    models = ['logistic', 'lda', 'qda']
    
    # Store all results
    all_results = {}
    all_comparisons = []
    
    # Evaluate each model
    for model_name in models:
        print(f"\n{'='*80}")
        print(f"Evaluating {model_name.upper()} classifier")
        print(f"{'='*80}")
        
        results, comparison = run_model_evaluation(
            model_name, X_train, X_test, y_train, y_test
        )
        
        all_results[model_name] = results
        all_comparisons.append(comparison)
    
    # Combine all comparisons
    final_comparison = pd.concat(all_comparisons, axis=0)
    print("\nFinal Comparison Across All Models:")
    print("="*80)
    print(final_comparison)
    
    return all_results, final_comparison

if __name__ == "__main__":
    all_results, final_comparison = main()

Data processed using onehot encoding
Training set shape: (3888, 47)
Test set shape: (972, 47)
Number of classes: 3

Evaluating LOGISTIC classifier

Evaluating Base LOGISTIC:
--------------------------------------------------
holdout took 0.03 seconds
cv_5 took 0.12 seconds
cv_10 took 0.26 seconds
loocv took 10.90 seconds
bootstrap took 0.38 seconds

Results for Base LOGISTIC:
      Method  F1-Score  Std Dev
0    Holdout    0.9029      NaN
1   CV (k=5)    0.9076   0.0049
2  CV (k=10)    0.9103   0.0116
3      LOOCV    0.9257   0.2623
4  Bootstrap    0.9074   0.0060

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       432
           3       0.89      0.87      0.88       324
           4       0.82      0.84      0.83       216

    accuracy                           0.92       972
   macro avg       0.90      0.90      0.90       972
weighted avg       0.92      0.92      0.92       972


Evalua



cv_5 took 0.04 seconds
cv_10 took 0.09 seconds




loocv took 2.40 seconds




bootstrap took 0.30 seconds

Results for Base QDA:
      Method  F1-Score  Std Dev
0    Holdout    0.7279      NaN
1   CV (k=5)    0.7336   0.0138
2  CV (k=10)    0.7317   0.0242
3      LOOCV    0.7505   0.4327
4  Bootstrap    0.7571   0.0371

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      0.89      0.94       432
           3       0.75      0.43      0.54       324
           4       0.54      1.00      0.70       216

    accuracy                           0.76       972
   macro avg       0.76      0.77      0.73       972
weighted avg       0.81      0.76      0.75       972


Evaluating QDA with Ridge Regularization:
--------------------------------------------------




Best Ridge parameters: {'reg_param': np.float64(0.05263157894736842)}
holdout took 0.01 seconds
cv_5 took 0.04 seconds
cv_10 took 0.07 seconds




loocv took 2.41 seconds




bootstrap took 0.30 seconds

Results for Ridge QDA:
      Method  F1-Score  Std Dev
0    Holdout    0.9035      NaN
1   CV (k=5)    0.9008   0.0024
2  CV (k=10)    0.9027   0.0073
3      LOOCV    0.9187   0.2733
4  Bootstrap    0.9013   0.0063

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      0.98      0.99       432
           3       0.92      0.83      0.87       324
           4       0.78      0.92      0.85       216

    accuracy                           0.92       972
   macro avg       0.90      0.91      0.90       972
weighted avg       0.92      0.92      0.92       972


Evaluating QDA with Lasso Regularization:
--------------------------------------------------




Best Lasso parameters: {'reg_param': np.float64(0.05263157894736842)}
holdout took 0.01 seconds
cv_5 took 0.04 seconds
cv_10 took 0.07 seconds




loocv took 2.43 seconds




bootstrap took 0.29 seconds

Results for Lasso QDA:
      Method  F1-Score  Std Dev
0    Holdout    0.9035      NaN
1   CV (k=5)    0.9008   0.0024
2  CV (k=10)    0.9027   0.0073
3      LOOCV    0.9187   0.2733
4  Bootstrap    0.9010   0.0076

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      0.98      0.99       432
           3       0.92      0.83      0.87       324
           4       0.78      0.92      0.85       216

    accuracy                           0.92       972
   macro avg       0.90      0.91      0.90       972
weighted avg       0.92      0.92      0.92       972


Comparison Summary:
--------------------------------------------------

Overall Comparison:
  Model  Base F1  Ridge F1  Lasso F1  Ridge Improvement (%)  \
0   QDA   0.7336    0.9008    0.9008                  22.79   

   Lasso Improvement (%)  
0                  22.79  

Final Comparison Across All Models:
      Model  Base F1  R