In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
# 1. Load your CSV files
train_data = pd.read_csv('HOG_Training.csv')
test_data = pd.read_csv('HOG_Testing.csv')

In [3]:
# 2. Separate features and labels
X_train = train_data.iloc[:, 1:]  # All columns except the first one
y_train = train_data.iloc[:, 0]   # First column as target
X_test = test_data.iloc[:, 1:]    # All columns except the first one
y_test = test_data.iloc[:, 0]     # First column as target

In [4]:
# 3. Handle unseen labels in test set
test_mask = y_test.isin(y_train.unique())
X_test_filtered = X_test[test_mask]
y_test_filtered = y_test[test_mask]

In [5]:
# 4. Encode the class labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test_filtered)

In [6]:
# 5. Handle missing values (NaN) using imputation
print("Handling missing values...")
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test_filtered)

Handling missing values...


In [7]:
# 6. Train a baseline model first
print("Training baseline decision tree...")
baseline_dt = DecisionTreeClassifier(random_state=42)
baseline_dt.fit(X_train_imputed, y_train_encoded)
baseline_pred = baseline_dt.predict(X_test_imputed)
baseline_accuracy = accuracy_score(y_test_encoded, baseline_pred)
print(f"Baseline Decision Tree Accuracy: {baseline_accuracy:.4f}")

Training baseline decision tree...
Baseline Decision Tree Accuracy: 0.5671


In [8]:
# 7. Try a few strategically selected hyperparameter combinations
print("Testing specific hyperparameter combinations...")
models = [
    ("Default + Max Depth 10", DecisionTreeClassifier(max_depth=10, random_state=42)),
    ("Default + Max Depth 20", DecisionTreeClassifier(max_depth=20, random_state=42)),
    ("Entropy + Max Depth 20", DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=42)),
    ("Min Samples Split 10", DecisionTreeClassifier(min_samples_split=10, random_state=42)),
    ("Min Samples Leaf 5", DecisionTreeClassifier(min_samples_leaf=5, random_state=42)),
    ("Balanced Weights", DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ("Combined Parameters", DecisionTreeClassifier(
        criterion='entropy',
        max_depth=20,
        min_samples_split=10,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42
    ))
]

best_model = None
best_accuracy = baseline_accuracy
best_name = "Baseline"

for name, model in models:
    model.fit(X_train_imputed, y_train_encoded)
    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_name = name

print(f"\nBest model: {best_name} with accuracy: {best_accuracy:.4f}")

Testing specific hyperparameter combinations...
Default + Max Depth 10 Accuracy: 0.5372
Default + Max Depth 20 Accuracy: 0.5817
Entropy + Max Depth 20 Accuracy: 0.6071
Min Samples Split 10 Accuracy: 0.5861
Min Samples Leaf 5 Accuracy: 0.5709
Balanced Weights Accuracy: 0.5906
Combined Parameters Accuracy: 0.6020

Best model: Entropy + Max Depth 20 with accuracy: 0.6071


In [9]:
# 8. Print classification report for the best model
if best_model is not None:
    y_pred = best_model.predict(X_test_imputed)
    print("\nClassification Report:")
    print(classification_report(y_test_encoded, y_pred))

    # Feature importance analysis
    feature_importances = best_model.feature_importances_
    feature_indices = np.argsort(feature_importances)[::-1]
    print("\nTop 10 most important features:")
    for i in range(min(10, len(feature_indices))):
        print(f"Feature {feature_indices[i]}: {feature_importances[feature_indices[i]]:.4f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.35      0.48       144
           1       0.90      0.63      0.74       164
           2       0.91      0.78      0.84       152
           3       0.00      0.00      0.00         0
           4       0.64      0.36      0.46       150
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.88      0.60      0.71       164
          11       0.92      0.92      0.92       246
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.47      0.31      0.37       150
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
