# Overview

This notebook tries to train a simple decision tree on the ratios and use it to choose the most important ratios to train other models.

## Imports


In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# # Assuming ratios_df is your DataFrame
df = pd.read_csv("../1_decision_tree/cleaned_ratios.csv")
df.head()

# Drop the 'file_path' column and separate features (X) and labels (y)
X = df.drop(columns=["file_path", "salient"])
y = df["salient"]

In [2]:
X.head()

Unnamed: 0,distance_0_1_to_distance_0_2,distance_0_3_to_distance_5_10,distance_0_4_to_distance_4_13,distance_0_7_to_distance_18_23,distance_0_9_to_distance_14_18,distance_0_10_to_distance_8_20,distance_0_11_to_distance_11_20,distance_0_16_to_distance_2_4,distance_0_17_to_distance_1_14,distance_0_19_to_distance_9_12,...,angle_1_4_14,angle_2_4_9,angle_2_5_9,angle_2_14_22,angle_3_10_15,angle_4_15_16,angle_5_8_12,angle_5_14_17,angle_6_8_20,angle_7_16_17
0,1.020027,2.098896,4.916117,2.684519,1.554847,0.726802,0.581233,7.198481,1.484318,1.039785,...,0.1042,0.107202,0.265472,0.364389,0.004095,0.469564,0.097709,0.327963,0.163143,0.183878
1,1.025078,2.212228,4.97817,2.569172,1.484482,0.695759,0.578634,6.979357,1.488992,1.128448,...,0.114629,0.132248,0.303421,0.372807,0.003113,0.494791,0.10539,0.313466,0.170611,0.170144
2,1.027181,1.832562,4.986065,2.517126,1.51036,1.300608,0.640325,9.157373,3.347487,0.797908,...,0.1386,0.174568,0.36262,0.334458,0.006785,0.471296,0.007272,0.346034,0.263366,0.273151
3,1.034087,2.237776,4.927011,2.580544,1.370526,0.672495,0.574309,9.23927,1.547455,0.98861,...,0.172974,0.194703,0.328241,0.339795,0.004166,0.370708,0.135475,0.332853,0.167066,0.162496
4,1.040316,2.18392,4.615139,2.654145,1.403396,0.683371,0.604192,8.286063,1.564145,0.924552,...,0.168337,0.186293,0.329435,0.331054,0.004886,0.49982,0.092372,0.319599,0.153758,0.171903


In [3]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: salient, dtype: int64

In [4]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define the parameter grid for the Random Forest
param_grid = {
    'n_estimators': [1, 5, 10, 20, 30, 40, 50, 100],  # Number of trees in the forest
    'max_depth': [5, 10, 20, 30, 50, 100, 200],  # Maximum depth of the tree
    'min_samples_split': [1, 2, 3, 4, 5, 10, 15],  # Minimum samples required to split an internal node
    'min_samples_leaf': [2, 3, 4, 5, 6, 7],  # Minimum samples required to be at a leaf node
}

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the GridSearchCV with the RandomForestClassifier and parameter grid
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Perform the grid search
grid_search.fit(X, y)

# Get the best estimator
best_clf = grid_search.best_estimator_

# Print the best parameters and the best score
print("Best parameters found by grid search:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Train the best classifier on the entire dataset
best_clf.fit(X, y)

# Predict on the same dataset
y_pred = best_clf.predict(X)

# Evaluate the best classifier
print("Classification Report:\n", classification_report(y, y_pred))

Fitting 5 folds for each of 2352 candidates, totalling 11760 fits


1680 fits failed out of a total of 11760.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1680 fits failed with the following error:
Traceback (most recent call last):
  File "/home/sadat/miniconda3/envs/cowbytes/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/sadat/miniconda3/envs/cowbytes/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/home/sadat/miniconda3/envs/cowbytes/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/home/sadat/miniconda3/envs/cowbytes/lib/python3.8/site-packages/sklearn/uti

Best parameters found by grid search: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy: 0.9266873139292686
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2160
           1       1.00      1.00      1.00      2123

    accuracy                           1.00      4283
   macro avg       1.00      1.00      1.00      4283
weighted avg       1.00      1.00      1.00      4283



In [8]:
grid_search.cv_results_['mean_test_score']

array([       nan,        nan,        nan, ..., 0.91034362, 0.91221087,
       0.91431286])

In [9]:
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score
import numpy as np

# Define a custom scorer for ROC AUC
roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# Perform cross-validation to obtain accuracy and ROC AUC scores
accuracy_scores = cross_val_score(best_clf, X, y, cv=kf, scoring='accuracy')
roc_auc_scores = cross_val_score(best_clf, X, y, cv=kf, scoring=roc_auc_scorer)

# Calculate mean and standard deviation for accuracy and ROC AUC
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)

mean_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores)


Mean Accuracy: 0.9267
Standard Deviation of Accuracy: 0.0055
Mean ROC AUC: 0.9795
Standard Deviation of ROC AUC: 0.0021


In [10]:

# Print the results
print("Mean Accuracy: {:.6f}".format(mean_accuracy))
print("Standard Deviation of Accuracy: {:.6f}".format(std_accuracy))
print("Mean ROC AUC: {:.6f}".format(mean_roc_auc))
print("Standard Deviation of ROC AUC: {:.6f}".format(std_roc_auc))

Mean Accuracy: 0.926687
Standard Deviation of Accuracy: 0.005488
Mean ROC AUC: 0.979540
Standard Deviation of ROC AUC: 0.002091


## XGBoost

In [13]:
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Define the parameter grid for the XGBoost classifier
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the ensemble
    'max_depth': [3, 5, 7, 9, 11],  # Maximum depth of the tree
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
    'subsample': [0.6, 0.8, 1.0],  # Fraction of samples used for fitting the trees
    'colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features used for fitting the trees
    'gamma': [0, 0.1, 0.2, 0.3]  # Minimum loss reduction required to make a further partition
}

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the GridSearchCV with the XGBClassifier and parameter grid
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),  # Avoid warnings with label encoder
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Perform the grid search
grid_search.fit(X, y)

# Get the best estimator
best_clf = grid_search.best_estimator_

# Print the best parameters and the best score
print("Best parameters found by grid search:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Train the best classifier on the entire dataset
best_clf.fit(X, y)

# Predict on the same dataset
y_pred = best_clf.predict(X)

# Evaluate the best classifier
print("Classification Report:\n", classification_report(y, y_pred))


Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

KeyboardInterrupt: 