In [1]:
%pip install -r requirements.txt
# General imports
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

# Load the three CSV files that make up the dataset.
# (The original second read ended with a stray line-continuation backslash,
# which glued it to the next statement and produced a syntax error.)
df1 = pd.read_csv('Data_Class_1.csv')
df2 = pd.read_csv('Data_Class_3.csv')
df3 = pd.read_csv('Data_Class_4.csv')

# Concatenate the 3 datasets into one frame with a fresh 0..n-1 index
df = pd.concat([df1, df2, df3], ignore_index=True)

# info() prints its summary and returns None, so it is called as a statement;
# head() is left as the last expression so the notebook renders it as a table
# instead of the plain-text repr of a (None, DataFrame) tuple.
df.info()
df.head()

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 18 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Id                                  4860 non-null   int64  
 1   Altitude                            4860 non-null   int64  
 2   Slope_Orientation                   4860 non-null   int64  
 3   Slope                               4860 non-null   int64  
 4   Horizontal_Distance_To_Water        4860 non-null   int64  
 5   Vertical_Distance_To_Water          4860 non-null   int64  
 6   Horizontal_Distance_To_Roadways     4860 non-null   int64  
 7   Shadow_Index_9h                     4860 non-null   int64  
 8   Shadow_Index_12h                    4860 non-null   int64  
 9   Shadow_Index_15h             

(None,
    Id  Altitude  Slope_Orientation  Slope  Horizontal_Distance_To_Water  \
 0  41      2699                347      3                             0   
 1  52      2739                323     25                            85   
 2  53      2696                 72      2                            30   
 3  56      2722                315     24                            30   
 4  68      2919                 13     13                            90   
 
    Vertical_Distance_To_Water  Horizontal_Distance_To_Roadways  \
 0                           0                             2096   
 1                          43                             3118   
 2                           0                             3271   
 3                          19                             3216   
 4                           6                             5321   
 
    Shadow_Index_9h  Shadow_Index_12h  Shadow_Index_15h  \
 0              213               234               159   
 1           

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns in one pass. The same encoder object
# is refit for every column, so once the loop finishes it only holds the
# mapping of the last column ('Vegetation_Type').
label_encoder = LabelEncoder()
for column_name in ('Soil_Type', 'Wilderness_Area', 'Vegetation_Type'):
    df[column_name] = label_encoder.fit_transform(df[column_name])

# Target is the vegetation type; predictors are every remaining column except 'Id'
y = df['Vegetation_Type']
X = df.drop(columns=['Vegetation_Type', 'Id'])

# Seeded 70/30 train/test split used by the Holdout method
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Display the shapes of the four resulting pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((3402, 16), (1458, 16), (3402,), (1458,))

In [3]:
from sklearn.base import clone
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import (
    LeaveOneOut,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample

# Label encode the categorical features. One encoder object is refit per
# column, so after the loop it only holds the 'Vegetation_Type' mapping.
label_encoder = LabelEncoder()
for column in ('Soil_Type', 'Wilderness_Area', 'Vegetation_Type'):
    df[column] = label_encoder.fit_transform(df[column])

# Split the dataset into features and target variable
X = df.drop(columns=['Vegetation_Type', 'Id'])
y = df['Vegetation_Type']

# Split the dataset into train and test sets for Holdout method
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the holdout data: the scaler is fit on the training split only and
# is not refit anywhere below, so it stays consistent with X_*_scaled.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate models with increased max_iter for Logistic Regression
log_reg = LogisticRegression(max_iter=20000, random_state=42)
lda = LDA()
qda = QDA()

# Resampling schemes shared by all three models
kfold_5 = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
kfold_10 = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
loo = LeaveOneOut()
bootstrap_iterations = 100


def _accuracy_and_weighted_f1(y_true, y_pred):
    """Return ``(accuracy, weighted F1)`` for one set of predictions."""
    return accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, average='weighted')


def evaluate_resampling_methods(model, X, y, holdout, kfolds, loo, n_bootstrap, n_jobs=None):
    """Score ``model`` with Holdout, k-fold CV, LOOCV and Bootstrap.

    Parameters
    ----------
    model : estimator
        Classifier to evaluate. It is fitted in place on the holdout training
        split and is left in that state; every other scheme works on clones.
    X, y : unscaled features and target used by the cross-validation schemes.
    holdout : tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``.
    kfolds : dict
        Maps the number of folds ``k`` to its CV splitter.
    loo : leave-one-out splitter.
    n_bootstrap : int
        Number of bootstrap resamples of the training split.
    n_jobs : int or None
        Forwarded to the scikit-learn cross-validation helpers.

    Returns
    -------
    dict
        Metric name -> score, with the keys 'Holdout ...', 'CV (k=...) ...',
        'LOOCV ...' and 'Bootstrap ...' (Accuracy and F1-Score for each).
    """
    X_tr, X_te, y_tr, y_te = holdout
    results = {}

    # Holdout: fit once on the training split, score on the test split.
    model.fit(X_tr, y_tr)
    results['Holdout Accuracy'], results['Holdout F1-Score'] = _accuracy_and_weighted_f1(
        y_te, model.predict(X_te)
    )

    # For CV the scaler lives inside a pipeline, so it is refit on the
    # training part of every fold. Scaling the full data set up front would
    # leak the validation fold's statistics into training.
    pipeline = make_pipeline(StandardScaler(), clone(model))

    # k-fold CV: both metrics are collected from a single pass per scheme.
    for k, splitter in kfolds.items():
        fold_scores = cross_validate(
            pipeline, X, y, cv=splitter, scoring=['accuracy', 'f1_weighted'], n_jobs=n_jobs
        )
        results[f'CV (k={k}) Accuracy'] = fold_scores['test_accuracy'].mean()
        results[f'CV (k={k}) F1-Score'] = fold_scores['test_f1_weighted'].mean()

    # LOOCV: each fold holds one sample, so a per-fold F1 is only 0 or 1 and
    # just repeats accuracy. Pool the out-of-fold predictions and score them
    # once instead; this also needs a single LOOCV pass rather than two.
    loo_predictions = cross_val_predict(pipeline, X, y, cv=loo, n_jobs=n_jobs)
    results['LOOCV Accuracy'], results['LOOCV F1-Score'] = _accuracy_and_weighted_f1(
        y, loo_predictions
    )

    # Bootstrap: refit a clone on each resample of the training split and
    # score it on the fixed test split. Cloning keeps `model` itself fitted
    # on the real training data instead of on the last resample.
    bootstrap_acc, bootstrap_f1 = [], []
    for seed in range(n_bootstrap):
        X_resampled, y_resampled = resample(X_tr, y_tr, random_state=seed)
        resampled_model = clone(model).fit(X_resampled, y_resampled)
        acc, f1 = _accuracy_and_weighted_f1(y_te, resampled_model.predict(X_te))
        bootstrap_acc.append(acc)
        bootstrap_f1.append(f1)
    results['Bootstrap Accuracy'] = sum(bootstrap_acc) / len(bootstrap_acc)
    results['Bootstrap F1-Score'] = sum(bootstrap_f1) / len(bootstrap_f1)

    return results


holdout_split = (X_train_scaled, X_test_scaled, y_train, y_test)
kfold_schemes = {5: kfold_5, 10: kfold_10}

# Run the same evaluation for each model (individual scores are in the dicts)
log_reg_results = evaluate_resampling_methods(
    log_reg, X, y, holdout_split, kfold_schemes, loo, bootstrap_iterations
)
lda_results = evaluate_resampling_methods(
    lda, X, y, holdout_split, kfold_schemes, loo, bootstrap_iterations
)
qda_results = evaluate_resampling_methods(
    qda, X, y, holdout_split, kfold_schemes, loo, bootstrap_iterations
)

# Holdout predictions of the models fitted on the training split
y_pred_log_reg = log_reg.predict(X_test_scaled)
y_pred_lda = lda.predict(X_test_scaled)
y_pred_qda = qda.predict(X_test_scaled)

# Final comparison
print("Logistic Regression Results:", log_reg_results)
print("LDA Results:", lda_results)
print("QDA Results:", qda_results)


KeyboardInterrupt: 

In [30]:
# Determine the resampling method with the highest score for Logistic Regression.
# Only the F1 entries are compared: the results dict mixes accuracy and F1
# values, and taking max() over all of them lets the choice of metric, not the
# resampling method, decide the winner.
# NOTE(review): the highest estimate is not necessarily the most reliable one;
# treat this as a summary of the scores, not a ranking of the methods' quality.
log_reg_f1_scores = {
    name: score for name, score in log_reg_results.items() if name.endswith('F1-Score')
}
best_method = max(log_reg_f1_scores, key=log_reg_f1_scores.get)
print(f"Best resampling method for Logistic Regression: {best_method}")


Best resampling method for Logistic Regression: Holdout F1-Score


In [32]:
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

def _fit_and_score_holdout(classifier):
    """Fit ``classifier`` on the scaled training split and score it on the test split.

    Returns a ``(predictions, accuracy, weighted F1)`` tuple for the test split.
    """
    classifier.fit(X_train_scaled, y_train)
    predictions = classifier.predict(X_test_scaled)
    return (
        predictions,
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='weighted'),
    )


# Ridge Classifier
ridge_clf = RidgeClassifier()
y_pred_ridge, ridge_holdout_acc, ridge_holdout_f1 = _fit_and_score_holdout(ridge_clf)

# Logistic Regression with an L1 (lasso) penalty
lasso_clf = LogisticRegression(penalty='l1', solver='saga', max_iter=20000, random_state=42)
y_pred_lasso, lasso_holdout_acc, lasso_holdout_f1 = _fit_and_score_holdout(lasso_clf)

# Logistic Regression with an L2 (ridge) penalty
ridge_logistic_clf = LogisticRegression(penalty='l2', solver='saga', max_iter=20000, random_state=42)
(
    y_pred_ridge_logistic,
    ridge_logistic_holdout_acc,
    ridge_logistic_holdout_f1,
) = _fit_and_score_holdout(ridge_logistic_clf)

# Collect the holdout scores of the three classifiers
ridge_logistic_results = dict(zip(
    ('Holdout Accuracy', 'Holdout F1-Score'),
    (ridge_logistic_holdout_acc, ridge_logistic_holdout_f1),
))
lasso_logistic_results = dict(zip(
    ('Holdout Accuracy', 'Holdout F1-Score'),
    (lasso_holdout_acc, lasso_holdout_f1),
))
ridge_classifier_results = dict(zip(
    ('Holdout Accuracy', 'Holdout F1-Score'),
    (ridge_holdout_acc, ridge_holdout_f1),
))

# Final comparison
for heading, scores in (
    ("Ridge Classifier Results:", ridge_classifier_results),
    ("Lasso Logistic Regression Results:", lasso_logistic_results),
    ("Ridge Logistic Regression Results:", ridge_logistic_results),
):
    print(heading, scores)



Ridge Classifier Results: {'Holdout Accuracy': 0.850480109739369, 'Holdout F1-Score': 0.8475563349336467}
Lasso Logistic Regression Results: {'Holdout Accuracy': 0.9279835390946503, 'Holdout F1-Score': 0.9281248633090963}
Ridge Logistic Regression Results: {'Holdout Accuracy': 0.9300411522633745, 'Holdout F1-Score': 0.9302523274468142}
