In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Data Split

In [4]:
# Load the labelled samples: 'Class' is the target, every other column is a feature.
file_path = '../data/training_data.csv'
data = pd.read_csv(file_path)

y = data['Class']
X = data.drop(columns='Class')

# Stratified 70 / 15 / 15 split: hold out 30 % first, then cut that hold-out
# in half to obtain the validation and test sets. Both calls use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=1
)

In [5]:
# Display the validation labels as a quick sanity check of the split.
y_val

9901     0
1205     0
13453    1
18544    2
21737    1
        ..
7781     2
43581    1
41061    1
13236    1
16434    2
Name: Class, Length: 7221, dtype: int64

## Train Random Forest Model

In [6]:
# Random forest with default hyperparameters; only the seed is fixed.
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X=X_train, y=y_train)

## Evaluate Model

In [7]:
# Predict the validation split with the fitted random forest, then print
# accuracy, the confusion matrix and the per-class report.
y_pred_rf = rf_model.predict(X_val)

print('Random Forest Accuracy:', accuracy_score(y_true=y_val, y_pred=y_pred_rf))
print('Random Forest Confusion Matrix:\n', confusion_matrix(y_true=y_val, y_pred=y_pred_rf))
print('Random Forest Classification Report:\n', classification_report(y_true=y_val, y_pred=y_pred_rf))

Random Forest Accuracy: 0.9993075751280986
Random Forest Confusion Matrix:
 [[1040    0    1]
 [   0 3833    0]
 [   3    1 2343]]
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1041
           1       1.00      1.00      1.00      3833
           2       1.00      1.00      1.00      2347

    accuracy                           1.00      7221
   macro avg       1.00      1.00      1.00      7221
weighted avg       1.00      1.00      1.00      7221



## Train XGBoost

In [8]:
from xgboost import XGBClassifier

In [9]:
# XGBoost classifier fitted on the current training split, with the same seed
# as the random forest and 'mlogloss' as its evaluation metric.
xgb_model_full = XGBClassifier(eval_metric='mlogloss', random_state=1)
xgb_model_full.fit(X=X_train, y=y_train)

In [10]:
# Validation-set predictions from the XGBoost model
y_pred_xgb_val = xgb_model_full.predict(X_val)

# Same three metrics that were reported for the random forest
xgb_class_report_val = classification_report(y_true=y_val, y_pred=y_pred_xgb_val)
xgb_conf_matrix_val = confusion_matrix(y_true=y_val, y_pred=y_pred_xgb_val)
xgb_acc_val = accuracy_score(y_true=y_val, y_pred=y_pred_xgb_val)

# Report
print('XGBoost Accuracy:', xgb_acc_val)
print('XGBoost Confusion Matrix:\n', xgb_conf_matrix_val)
print('XGBoost Classification Report:\n', xgb_class_report_val)

XGBoost Accuracy: 0.9994460601024788
XGBoost Confusion Matrix:
 [[1041    0    0]
 [   0 3833    0]
 [   3    1 2343]]
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1041
           1       1.00      1.00      1.00      3833
           2       1.00      1.00      1.00      2347

    accuracy                           1.00      7221
   macro avg       1.00      1.00      1.00      7221
weighted avg       1.00      1.00      1.00      7221



## Training on selected features

In [11]:
# Reduced feature matrix: keep only these five columns of the loaded data.
X_selected = data.loc[:, ['ndvi', 'ndwi', 'mtvi2', 'tgi', 'vari']]

In [12]:
# Repeat the stratified 70 / 15 / 15 split on the reduced feature matrix.
# NOTE: this rebinds X_train, X_val, X_test (and the y_* names) that the
# first split created, so from here on they refer to the five-column data.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_selected, y, test_size=0.30, stratify=y, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=1
)

# Random forest on the selected features (defaults, fixed seed)
rf_model_selected = RandomForestClassifier(random_state=1)
rf_model_selected.fit(X=X_train, y=y_train)

In [13]:
# XGBoost counterpart of the selected-feature model (same seed and eval metric
# as the full-feature XGBoost model).
xgb_model_selected = XGBClassifier(eval_metric='mlogloss', random_state=1)
xgb_model_selected.fit(X=X_train, y=y_train)

In [14]:
# Display the training frame to confirm it now holds only the selected feature columns.
X_train

Unnamed: 0,ndvi,ndwi,mtvi2,tgi,vari
12546,0.153584,0.096257,0.708875,-5265.0,-7.363637
45680,0.187829,-0.195011,0.274847,11500.0,-0.012480
34792,-0.033557,0.160350,0.242939,495.0,0.330882
12046,0.203343,0.066955,0.775280,-560.0,1.424658
9367,0.034138,-0.143295,-0.120643,1280.0,-0.175331
...,...,...,...,...,...
12780,0.346591,-0.034935,0.901159,-2230.0,6.625000
18563,0.018374,-0.073703,-0.066210,3220.0,-0.096825
38622,0.463781,-0.491449,0.545676,23520.0,-0.050755
32548,0.147436,0.045333,0.575919,2205.0,0.473684


## Validate RF

In [15]:

# Validation-set predictions from the selected-feature random forest
y_pred_rf_selected = rf_model_selected.predict(X_val)

# Accuracy, confusion matrix and per-class report
print('Random Forest Accuracy:', accuracy_score(y_true=y_val, y_pred=y_pred_rf_selected))
print('Random Forest Confusion Matrix:\n', confusion_matrix(y_true=y_val, y_pred=y_pred_rf_selected))
print('Random Forest Classification Report:\n', classification_report(y_true=y_val, y_pred=y_pred_rf_selected))

Random Forest Accuracy: 0.9962609056917324
Random Forest Confusion Matrix:
 [[1037    0    4]
 [   0 3829    4]
 [   5   14 2328]]
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1041
           1       1.00      1.00      1.00      3833
           2       1.00      0.99      0.99      2347

    accuracy                           1.00      7221
   macro avg       1.00      1.00      1.00      7221
weighted avg       1.00      1.00      1.00      7221



## Validate XGB

In [16]:
# Validation-set predictions from the selected-feature XGBoost model.
# The y_pred_xgb_val / xgb_*_val names are reused from the full-feature run.
y_pred_xgb_val = xgb_model_selected.predict(X_val)

# Same three metrics as for the random forest
xgb_class_report_val = classification_report(y_true=y_val, y_pred=y_pred_xgb_val)
xgb_conf_matrix_val = confusion_matrix(y_true=y_val, y_pred=y_pred_xgb_val)
xgb_acc_val = accuracy_score(y_true=y_val, y_pred=y_pred_xgb_val)

# Report
print('XGBoost Accuracy:', xgb_acc_val)
print('XGBoost Confusion Matrix:\n', xgb_conf_matrix_val)
print('XGBoost Classification Report:\n', xgb_class_report_val)

XGBoost Accuracy: 0.9963993906661127
XGBoost Confusion Matrix:
 [[1038    0    3]
 [   0 3827    6]
 [   4   13 2330]]
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1041
           1       1.00      1.00      1.00      3833
           2       1.00      0.99      0.99      2347

    accuracy                           1.00      7221
   macro avg       1.00      1.00      1.00      7221
weighted avg       1.00      1.00      1.00      7221



## Evaluate on the Test set to check for overfitting

In [17]:
from sklearn.metrics import roc_auc_score

In [18]:
# Hard predictions and class probabilities for the held-out test split
y_pred_test = rf_model_selected.predict(X_test)
y_prob_test = rf_model_selected.predict_proba(X_test)

# Label-based metrics
rf_acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
rf_conf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
rf_class_report_test = classification_report(y_true=y_test, y_pred=y_pred_test)

# Probability-based metric: one-vs-rest ROC AUC with weighted averaging
rf_roc_auc_test = roc_auc_score(
    y_true=y_test, y_score=y_prob_test, multi_class='ovr', average='weighted'
)

# Printed in order: accuracy, confusion matrix, classification report, ROC AUC
print(rf_acc_test, rf_conf_matrix_test, rf_class_report_test, rf_roc_auc_test, sep='\n')

0.9973691498199945
[[1039    0    2]
 [   0 3829    4]
 [   4    9 2335]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1041
           1       1.00      1.00      1.00      3833
           2       1.00      0.99      1.00      2348

    accuracy                           1.00      7222
   macro avg       1.00      1.00      1.00      7222
weighted avg       1.00      1.00      1.00      7222

0.9999677015049663


## Evaluate XGB on the test set

In [19]:
# Hard predictions and class probabilities for the held-out test split.
# y_pred_test / y_prob_test are reused here and now hold the XGBoost outputs.
y_pred_test = xgb_model_selected.predict(X_test)
y_prob_test = xgb_model_selected.predict_proba(X_test)

# Label-based metrics
xgb_acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
xgb_conf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
xgb_class_report_test = classification_report(y_true=y_test, y_pred=y_pred_test)

# Probability-based metric: one-vs-rest ROC AUC with weighted averaging
xgb_roc_auc_test = roc_auc_score(
    y_true=y_test, y_score=y_prob_test, multi_class='ovr', average='weighted'
)

# Printed in order: accuracy, confusion matrix, classification report, ROC AUC
print(xgb_acc_test, xgb_conf_matrix_test, xgb_class_report_test, xgb_roc_auc_test, sep='\n')

0.9972306840210468
[[1037    0    4]
 [   0 3826    7]
 [   2    7 2339]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1041
           1       1.00      1.00      1.00      3833
           2       1.00      1.00      1.00      2348

    accuracy                           1.00      7222
   macro avg       1.00      1.00      1.00      7222
weighted avg       1.00      1.00      1.00      7222

0.9999581491943512


## Cross Validation

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
# 5-fold cross-validated accuracy of the selected-feature XGBoost configuration,
# computed over the complete selected-feature dataset (not just the training split).
# NOTE(review): an integer cv presumably yields unshuffled stratified folds,
# unlike the shuffled train/val/test split above - confirm when comparing scores.
cv_scores = cross_val_score(
    estimator=xgb_model_selected,
    X=X_selected,
    y=y,
    scoring='accuracy',
    cv=5,
)

# Mean accuracy across the folds
print("Cross-Validation Accuracy: ", cv_scores.mean())

Cross-Validation Accuracy:  0.9831960348799196


In [43]:
# Print each fold's cross-validation accuracy as a percentage (six decimals).
# Looping over cv_scores instead of indexing folds 0-4 by hand keeps the cell
# correct if the number of folds changes; for five folds the output is the same
# as before. The average is reported by the cross-validation cell, not here.
for fold_number, fold_score in enumerate(cv_scores, start=1):
    print(f'Fold {fold_number}:  {fold_score * 100:.6f}')


Fold 1:  94.828123
Fold 2:  99.459965
Fold 3:  98.068135
Fold 4:  99.833818
Fold 5:  99.407977
