In [92]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.metrics import f1_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn import clone
import warnings
import time
warnings.filterwarnings('ignore')

In [70]:
def load_and_preprocess_data(file_path='dataset.csv', encoding_method='onehot'):
    # Load data
    df = pd.read_csv(file_path)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Basic preprocessing
    df = df.drop('Id', axis=1)
    df['Vegetation_Type'] = df['Vegetation_Type'].str.replace('Type_', '').astype(int)
    
    # Split features and target
    y = df['Vegetation_Type']
    X = df.drop('Vegetation_Type', axis=1)
    
    # Identify feature types
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns
    
    if encoding_method == 'onehot':
        preprocessor = ColumnTransformer([
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore', 
                                drop='first'), categorical_features)
        ])
        
        X_processed = preprocessor.fit_transform(X)
        numeric_names = numeric_features.tolist()
        categorical_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
        feature_names = numeric_names + categorical_names.tolist()
        
    elif encoding_method == 'label':
        label_encoders = {}
        X_transformed = X.copy()
        
        for cat_feature in categorical_features:
            label_encoders[cat_feature] = LabelEncoder()
            X_transformed[cat_feature] = label_encoders[cat_feature].fit_transform(X[cat_feature])
        
        preprocessor = ColumnTransformer([
            ('all', StandardScaler(), X_transformed.columns)
        ])
        
        X_processed = preprocessor.fit_transform(X_transformed)
        feature_names = X_transformed.columns
    
    else:
        raise ValueError("encoding_method must be either 'onehot' or 'label'")
    
    X_processed = pd.DataFrame(X_processed, columns=feature_names)
    
    print(f"Data processed using {encoding_method} encoding")
    print(f"Shape of X: {X_processed.shape}")
    print(f"Number of classes in y: {len(np.unique(y))}")
    
    return X_processed, y, preprocessor

In [71]:
def get_base_model(model_name, random_state=42):
    models = {
        'logistic': LogisticRegression(max_iter=20000, random_state=random_state),
        'lda': LDA(),
        'qda': QDA()
    }
    
    if model_name not in models:
        raise ValueError(f"Invalid model name. Choose from: {list(models.keys())}")
    
    return models[model_name]

In [72]:
def get_regularized_model(model_name, reg_type='ridge', C=1.0, random_state=42):
    """Models with regularization applied"""
    if reg_type not in ['ridge', 'lasso']:
        raise ValueError("reg_type must be 'ridge' or 'lasso'")
        
    # For logistic regression, apply regularization directly
    if model_name == 'logistic':
        if reg_type == 'ridge':
            return LogisticRegression(penalty='l2', C=C, solver='lbfgs', 
                                    max_iter=20000, random_state=random_state)
        else:  # lasso
            return LogisticRegression(penalty='l1', C=C, solver='liblinear', 
                                    max_iter=20000, random_state=random_state)
    
    # For LDA, map C to shrinkage parameter between 0 and 1
    elif model_name == 'lda':
        # Convert C to shrinkage: smaller C = stronger regularization
        shrinkage = 1 / (1 + C)  # This will always be between 0 and 1
        return LDA(solver='lsqr', shrinkage=shrinkage)
    
    # For QDA, use reg_param
    elif model_name == 'qda':
        # Convert C to reg_param: smaller C = stronger regularization
        reg_param = 1 / (1 + C)  # This will always be between 0 and 1
        return QDA(reg_param=reg_param)
    
    else:
        raise ValueError(f"Invalid model name. Choose from: logistic, lda, qda")

In [73]:
class ModelEvaluator:
    def __init__(self, X, y, model, random_state=42):
        self.X = X
        self.y = y
        self.model = model
        self.random_state = random_state
    
    def calculate_f1(self, y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')
    
    def holdout(self, test_size=0.2):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=self.random_state
        )
        
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        
        return {
            'f1_score': self.calculate_f1(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred),
            'test_size': test_size
        }
    
    def cross_validation(self, k):
        scores = cross_val_score(self.model, self.X, self.y, 
                               cv=k, scoring='f1_macro')
        return {
            'mean_f1': np.mean(scores),
            'std_f1': np.std(scores),
            'all_scores': scores,
            'k_folds': k
        }
    
    def loocv(self):
        loo = LeaveOneOut()
        scores = cross_val_score(self.model, self.X, self.y, 
                               cv=loo, scoring='f1_macro', n_jobs=-1)
        return {
            'mean_f1': np.mean(scores),
            'std_f1': np.std(scores)
        }
    
    def bootstrap(self, n_iterations=100, sample_size=0.8):
        from joblib import Parallel, delayed
        
        def single_bootstrap():
            # Single bootstrap iteration
            X_boot, y_boot = resample(self.X, self.y, 
                                    n_samples=int(len(self.X) * sample_size))
            X_oob = self.X.loc[~self.X.index.isin(X_boot.index)]
            y_oob = self.y.loc[~self.y.index.isin(y_boot.index)]
            
            model = clone(self.model)  # Create a fresh clone of the model
            model.fit(X_boot, y_boot)
            y_pred = model.predict(X_oob)
            return self.calculate_f1(y_oob, y_pred)
        
        # Run bootstrap iterations in parallel
        scores = Parallel(n_jobs=-1)(
            delayed(single_bootstrap)() 
            for _ in range(n_iterations)
        )
        
        scores = np.array(scores)
        return {
            'mean_f1': np.mean(scores),
            'std_f1': np.std(scores),
            'confidence_interval': (
                np.percentile(scores, 2.5),
                np.percentile(scores, 97.5)
            ),
            'n_iterations': n_iterations
        }

In [74]:
def evaluate_all_methods(model, X, y):
    evaluator = ModelEvaluator(X, y, model)
    times = {}
    results = {}
    
    for method in ['holdout', 'cv_5', 'cv_10', 'loocv', 'bootstrap']:
        start_time = time.time()
        if method == 'holdout':
            results[method] = evaluator.holdout()
        elif method == 'cv_5':
            results[method] = evaluator.cross_validation(k=5)
        elif method == 'cv_10':
            results[method] = evaluator.cross_validation(k=10)
        elif method == 'loocv':
            results[method] = evaluator.loocv()
        else:  # bootstrap
            results[method] = evaluator.bootstrap()
        times[method] = time.time() - start_time
        print(f"{method} took {times[method]:.2f} seconds")
    
    return results
    

In [75]:
def format_results(results, model_name):
    data = {
        'Method': [
            'Holdout',
            'CV (k=5)',
            'CV (k=10)',
            'LOOCV',
            'Bootstrap'
        ],
        'F1-Score': [
            results['holdout']['f1_score'],
            results['cv_5']['mean_f1'],
            results['cv_10']['mean_f1'],
            results['loocv']['mean_f1'],
            results['bootstrap']['mean_f1']
        ],
        'Std Dev': [
            None,
            results['cv_5']['std_f1'],
            results['cv_10']['std_f1'],
            results['loocv']['std_f1'],
            results['bootstrap']['std_f1']
        ]
    }
    
    df = pd.DataFrame(data)
    df = df.round(4)
    print(f"\nResults for {model_name}:")
    return df

# Logistic

In [76]:
X, y, preprocessor = load_and_preprocess_data(encoding_method='onehot')

# Step 1: Evaluate base models
print("\nEvaluating Base Models:")
print("-" * 50)

# Logistic
base_results = {}
model_name = 'logistic'
model = get_base_model(model_name)
results = evaluate_all_methods(model, X, y)
base_results[model_name] = results
print(format_results(results, f"Base {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 2: Evaluate with Ridge regularization
print("\nEvaluating with Ridge Regularization:")
print("-" * 50)
C_range = np.logspace(-4, 4, 10)
ridge_results = {}
best_ridge_C = {}

# Find best C
cv_scores = []
for C in C_range:
    model = get_regularized_model(model_name, reg_type='ridge', C=C)
    scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
    cv_scores.append(np.mean(scores))

best_ridge_C[model_name] = C_range[np.argmax(cv_scores)]
print(f"\nBest Ridge C for {model_name}: {best_ridge_C[model_name]:.4f}")

# Evaluate with best C
model = get_regularized_model(model_name, reg_type='ridge', 
                            C=best_ridge_C[model_name])
results = evaluate_all_methods(model, X, y)
ridge_results[model_name] = results
print(format_results(results, f"Ridge {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 3: Evaluate with Lasso regularization
print("\nEvaluating with Lasso Regularization:")
print("-" * 50)
lasso_results = {}
best_lasso_C = {}

# Find best C
cv_scores = []
for C in C_range:
    model = get_regularized_model(model_name, reg_type='lasso', C=C)
    scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
    cv_scores.append(np.mean(scores))

best_lasso_C[model_name] = C_range[np.argmax(cv_scores)]
print(f"\nBest Lasso C for {model_name}: {best_lasso_C[model_name]:.4f}")

# Evaluate with best C
model = get_regularized_model(model_name, reg_type='lasso', 
                            C=best_lasso_C[model_name])
results = evaluate_all_methods(model, X, y)
lasso_results[model_name] = results
print(format_results(results, f"Lasso {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 4: Compare Results
print("\nComparison Summary:")
print("-" * 50)

comparison_data = []
base_f1 = base_results[model_name]['cv_5']['mean_f1']
ridge_f1 = ridge_results[model_name]['cv_5']['mean_f1']
lasso_f1 = lasso_results[model_name]['cv_5']['mean_f1']

comparison_data.append({
    'Model': model_name.upper(),
    'Base F1': base_f1,
    'Ridge F1': ridge_f1,
    'Lasso F1': lasso_f1,
    'Ridge Improvement (%)': ((ridge_f1 - base_f1) / base_f1) * 100,
    'Lasso Improvement (%)': ((lasso_f1 - base_f1) / base_f1) * 100
})

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.round(4)
print("\nOverall Comparison:")
print(comparison_df)

Data processed using onehot encoding
Shape of X: (4860, 47)
Number of classes in y: 3

Evaluating Base Models:
--------------------------------------------------
holdout took 0.02 seconds
cv_5 took 0.12 seconds
cv_10 took 0.28 seconds
loocv took 20.97 seconds
bootstrap took 0.44 seconds

Results for Base LOGISTIC:
      Method  F1-Score  Std Dev
0    Holdout    0.9119      NaN
1   CV (k=5)    0.9122   0.0076
2  CV (k=10)    0.9116   0.0106
3      LOOCV    0.9265   0.2609
4  Bootstrap    0.9074   0.0052

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       439
           3       0.92      0.86      0.89       323
           4       0.81      0.89      0.85       210

    accuracy                           0.93       972
   macro avg       0.91      0.92      0.91       972
weighted avg       0.93      0.93      0.93       972


Evaluating with Ridge Regularization:
-------------------------------

# LDA

In [77]:
X, y, preprocessor = load_and_preprocess_data(encoding_method='onehot')

# Step 1: Evaluate base models
print("\nEvaluating Base Models:")
print("-" * 50)

# Logistic
base_results = {}
model_name = 'lda'
model = get_base_model(model_name)
results = evaluate_all_methods(model, X, y)
base_results[model_name] = results
print(format_results(results, f"Base {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 2: Evaluate with Ridge regularization
print("\nEvaluating with Ridge Regularization:")
print("-" * 50)
C_range = np.logspace(-4, 4, 10)
ridge_results = {}
best_ridge_C = {}

# Find best C
cv_scores = []
for C in C_range:
    model = get_regularized_model(model_name, reg_type='ridge', C=C)
    scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
    cv_scores.append(np.mean(scores))

best_ridge_C[model_name] = C_range[np.argmax(cv_scores)]
print(f"\nBest Ridge C for {model_name}: {best_ridge_C[model_name]:.4f}")

# Evaluate with best C
model = get_regularized_model(model_name, reg_type='ridge', 
                            C=best_ridge_C[model_name])
results = evaluate_all_methods(model, X, y)
ridge_results[model_name] = results
print(format_results(results, f"Ridge {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 3: Evaluate with Lasso regularization
print("\nEvaluating with Lasso Regularization:")
print("-" * 50)
lasso_results = {}
best_lasso_C = {}

# Find best C
cv_scores = []
for C in C_range:
    model = get_regularized_model(model_name, reg_type='lasso', C=C)
    scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
    cv_scores.append(np.mean(scores))

best_lasso_C[model_name] = C_range[np.argmax(cv_scores)]
print(f"\nBest Lasso C for {model_name}: {best_lasso_C[model_name]:.4f}")

# Evaluate with best C
model = get_regularized_model(model_name, reg_type='lasso', 
                            C=best_lasso_C[model_name])
results = evaluate_all_methods(model, X, y)
lasso_results[model_name] = results
print(format_results(results, f"Lasso {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 4: Compare Results
print("\nComparison Summary:")
print("-" * 50)

comparison_data = []
base_f1 = base_results[model_name]['cv_5']['mean_f1']
ridge_f1 = ridge_results[model_name]['cv_5']['mean_f1']
lasso_f1 = lasso_results[model_name]['cv_5']['mean_f1']

comparison_data.append({
    'Model': model_name.upper(),
    'Base F1': base_f1,
    'Ridge F1': ridge_f1,
    'Lasso F1': lasso_f1,
    'Ridge Improvement (%)': ((ridge_f1 - base_f1) / base_f1) * 100,
    'Lasso Improvement (%)': ((lasso_f1 - base_f1) / base_f1) * 100
})

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.round(4)
print("\nOverall Comparison:")
print(comparison_df)

Data processed using onehot encoding
Shape of X: (4860, 47)
Number of classes in y: 3

Evaluating Base Models:
--------------------------------------------------
holdout took 0.09 seconds
cv_5 took 0.05 seconds
cv_10 took 0.10 seconds
loocv took 5.29 seconds
bootstrap took 0.25 seconds

Results for Base LDA:
      Method  F1-Score  Std Dev
0    Holdout    0.8899      NaN
1   CV (k=5)    0.8814   0.0108
2  CV (k=10)    0.8808   0.0189
3      LOOCV    0.9019   0.2975
4  Bootstrap    0.8782   0.0080

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      0.99      0.99       439
           3       0.86      0.87      0.87       323
           4       0.81      0.82      0.81       210

    accuracy                           0.91       972
   macro avg       0.89      0.89      0.89       972
weighted avg       0.91      0.91      0.91       972


Evaluating with Ridge Regularization:
-------------------------------------

# QDA

In [93]:
X, y, preprocessor = load_and_preprocess_data(encoding_method='onehot')

# Step 1: Evaluate base models
print("\nEvaluating Base Models:")
print("-" * 50)

# Logistic
base_results = {}
model_name = 'qda'
model = get_base_model(model_name)
results = evaluate_all_methods(model, X, y)
base_results[model_name] = results
print(format_results(results, f"Base {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 2: Evaluate with Ridge regularization
print("\nEvaluating with Ridge Regularization:")
print("-" * 50)
C_range = np.logspace(-4, 4, 10)
ridge_results = {}
best_ridge_C = {}

# Find best C
cv_scores = []
for C in C_range:
    model = get_regularized_model(model_name, reg_type='ridge', C=C)
    scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
    cv_scores.append(np.mean(scores))

best_ridge_C[model_name] = C_range[np.argmax(cv_scores)]
print(f"\nBest Ridge C for {model_name}: {best_ridge_C[model_name]:.4f}")

# Evaluate with best C
model = get_regularized_model(model_name, reg_type='ridge', 
                            C=best_ridge_C[model_name])
results = evaluate_all_methods(model, X, y)
ridge_results[model_name] = results
print(format_results(results, f"Ridge {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 3: Evaluate with Lasso regularization
print("\nEvaluating with Lasso Regularization:")
print("-" * 50)
lasso_results = {}
best_lasso_C = {}

# Find best C
cv_scores = []
for C in C_range:
    model = get_regularized_model(model_name, reg_type='lasso', C=C)
    scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
    cv_scores.append(np.mean(scores))

best_lasso_C[model_name] = C_range[np.argmax(cv_scores)]
print(f"\nBest Lasso C for {model_name}: {best_lasso_C[model_name]:.4f}")

# Evaluate with best C
model = get_regularized_model(model_name, reg_type='lasso', 
                            C=best_lasso_C[model_name])
results = evaluate_all_methods(model, X, y)
lasso_results[model_name] = results
print(format_results(results, f"Lasso {model_name.upper()}"))
print("\nClassification Report (Holdout Method):")
print(results['holdout']['classification_report'])

# Step 4: Compare Results
print("\nComparison Summary:")
print("-" * 50)

comparison_data = []
base_f1 = base_results[model_name]['cv_5']['mean_f1']
ridge_f1 = ridge_results[model_name]['cv_5']['mean_f1']
lasso_f1 = lasso_results[model_name]['cv_5']['mean_f1']

comparison_data.append({
    'Model': model_name.upper(),
    'Base F1': base_f1,
    'Ridge F1': ridge_f1,
    'Lasso F1': lasso_f1,
    'Ridge Improvement (%)': ((ridge_f1 - base_f1) / base_f1) * 100,
    'Lasso Improvement (%)': ((lasso_f1 - base_f1) / base_f1) * 100
})

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.round(4)
print("\nOverall Comparison:")
print(comparison_df)

Data processed using onehot encoding
Shape of X: (4860, 47)
Number of classes in y: 3

Evaluating Base Models:
--------------------------------------------------
holdout took 0.07 seconds
cv_5 took 0.09 seconds
cv_10 took 0.13 seconds




loocv took 3.51 seconds




bootstrap took 0.26 seconds

Results for Base QDA:
      Method  F1-Score  Std Dev
0    Holdout    0.7336      NaN
1   CV (k=5)    0.7333   0.0117
2  CV (k=10)    0.7379   0.0129
3      LOOCV    0.7745   0.4179
4  Bootstrap    0.7522   0.0307

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      0.89      0.94       439
           3       0.74      0.45      0.56       323
           4       0.54      1.00      0.70       210

    accuracy                           0.77       972
   macro avg       0.76      0.78      0.73       972
weighted avg       0.82      0.77      0.76       972


Evaluating with Ridge Regularization:
--------------------------------------------------

Best Ridge C for qda: 21.5443
holdout took 0.01 seconds
cv_5 took 0.04 seconds
cv_10 took 0.09 seconds




loocv took 3.54 seconds




bootstrap took 0.26 seconds

Results for Ridge QDA:
      Method  F1-Score  Std Dev
0    Holdout    0.8991      NaN
1   CV (k=5)    0.9030   0.0094
2  CV (k=10)    0.9040   0.0087
3      LOOCV    0.9189   0.2729
4  Bootstrap    0.9030   0.0062

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      0.99      0.99       439
           3       0.93      0.81      0.87       323
           4       0.76      0.93      0.84       210

    accuracy                           0.92       972
   macro avg       0.90      0.91      0.90       972
weighted avg       0.92      0.92      0.92       972


Evaluating with Lasso Regularization:
--------------------------------------------------

Best Lasso C for qda: 21.5443
holdout took 0.01 seconds
cv_5 took 0.05 seconds
cv_10 took 0.09 seconds




loocv took 3.52 seconds




bootstrap took 0.28 seconds

Results for Lasso QDA:
      Method  F1-Score  Std Dev
0    Holdout    0.8991      NaN
1   CV (k=5)    0.9030   0.0094
2  CV (k=10)    0.9040   0.0087
3      LOOCV    0.9189   0.2729
4  Bootstrap    0.9034   0.0064

Classification Report (Holdout Method):
              precision    recall  f1-score   support

           1       1.00      0.99      0.99       439
           3       0.93      0.81      0.87       323
           4       0.76      0.93      0.84       210

    accuracy                           0.92       972
   macro avg       0.90      0.91      0.90       972
weighted avg       0.92      0.92      0.92       972


Comparison Summary:
--------------------------------------------------

Overall Comparison:
  Model  Base F1  Ridge F1  Lasso F1  Ridge Improvement (%)  \
0   QDA   0.7333     0.903     0.903                 23.144   

   Lasso Improvement (%)  
0                 23.144  
