In [1]:
import pandas as pd

df = pd.read_csv('../scorewater/data/final/dtdata.csv', index_col=0)

In [2]:
new_column_names = ['section', 'Height', 'width', 'perimeter', 'Length', 'Speed',
                   'WaterHeight', 'Flow', 'section_1', 'Height_1', 'width_1', 'perimeter_1',
                   'Length_1', 'Speed_1', 'WaterHeight_1', 'Flow_1', 'section_2',
                   'Height_2', 'width_2', 'perimeter_2', 'Length_2', 'Speed_2',
                   'WaterHeight_2', 'Flow_2', 'section_3', 'Height_3', 'width_3',
                   'perimeter_3', 'Length_3', 'Speed_3', 'WaterHeight_3', 'Flow_3',
                   'section_4', 'Height_4', 'width_4', 'perimeter_4', 'Length_4',
                   'Speed_4', 'WaterHeight_4', 'Flow_4', 'section_5', 'Height_5',
                   'width_5', 'perimeter_5', 'Length_5', 'Speed_5', 'WaterHeight_5',
                   'Flow_5', 'neighborhood']


# Get the current column names
current_column_names = df.columns

# Replace the names of the last 49 columns
df.columns = current_column_names[:-49].tolist() + new_column_names
df.drop('value_2', axis=1, inplace=True)

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.base import clone
from sklearn.datasets import make_classification

# Training a tree with only the features of the section to be predicted

In [4]:
X = df.drop(df.filter(regex=("(_[0-5]_[0-5]|_[2-5])")).columns, 
        axis=1)
y= X['value_y']
X.drop(['amount_rain_std_1', 'amount_rain_mean_1', 'Height_1',
        'Length_1', 'Speed_1', 'Flow_1', 'value_y',
        'section_1', 'width_1', 'perimeter_1', 'WaterHeight_1'],
       axis=1, inplace=True)

In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score

# Define the hyperparameter space
space = {
    'max_depth': hp.choice('max_depth', range(3, 10)),
    'max_features': hp.uniform('max_features', 0.65, 1),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(3, 13)),
    'min_samples_split': hp.choice('min_samples_split', range(10, 25))
}

# Objective function
def objective(params):
    classifier = DecisionTreeClassifier(**params)
    score = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1', n_jobs=-1).mean()
    return {'loss': -score, 'status': STATUS_OK}

# Run the optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

mapped_best = {
    'max_depth': best['max_depth'] + 3,  # because range starts from 3
    'max_features': best['max_features'],
    'min_samples_leaf': best['min_samples_leaf'] + 3,  # because range starts from 3
    'min_samples_split': best['min_samples_split'] + 10  # because range starts from 10
}

print(mapped_best)


100%|███████| 50/50 [00:02<00:00, 21.09trial/s, best loss: -0.645533824053137]
{'max_depth': 9, 'max_features': 0.8914389773839586, 'min_samples_leaf': 7, 'min_samples_split': 19}


In [7]:
best_classifier = DecisionTreeClassifier(**{'max_depth': 7, 'max_features': 0.794612916790575, 'min_samples_leaf': 12, 'min_samples_split': 19, 'random_state': 4})
best_classifier.fit(X_train, y_train)

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Predict using the best model
y_pred_test_expanded = best_classifier.predict(X_test)
y_prob_test_expanded = best_classifier.predict_proba(X_test)[:, 1]

# Calculate the performance metrics for the test set
test_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_test_expanded),
    'Precision': precision_score(y_test, y_pred_test_expanded),
    'Recall': recall_score(y_test, y_pred_test_expanded),
    'F1 Score': f1_score(y_test, y_pred_test_expanded),
    'ROC-AUC': roc_auc_score(y_test, y_prob_test_expanded)
}

test_metrics


{'Accuracy': 0.7486910994764397,
 'Precision': 0.7183098591549296,
 'Recall': 0.6455696202531646,
 'F1 Score': 0.6799999999999999,
 'ROC-AUC': 0.797129294755877}

In [9]:
feature_importance_dict = dict(zip(X_test.columns, best_classifier.feature_importances_))

# Sort the dictionary based on importances
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

sorted_feature_importance

[('value_0', 0.26834659383736753),
 ('value_1', 0.17602186756164576),
 ('Height', 0.09707730365177875),
 ('amount_rain_std', 0.0947790746255444),
 ('amount_rain_mean', 0.08649267823277969),
 ('WaterHeight', 0.06940819059675365),
 ('perimeter', 0.06905506613010984),
 ('Speed', 0.0638872713317141),
 ('width', 0.03334310904079411),
 ('Flow', 0.021897676509956925),
 ('Length', 0.011480154827506552),
 ('section', 0.008211013654048542),
 ('cleaning_applied_0', 0.0),
 ('cleaning_applied_1', 0.0),
 ('neighborhood', 0.0)]

# Best possible decision tree

The idea is to get the best possible tree, hyperparameter and feature - wise.

In [10]:
X = df.drop(columns=['value_y'])
y = df['value_y']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
from sklearn.feature_selection import RFE
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Define the hyperparameter space
space = {
    'max_depth': hp.choice('max_depth', range(3, 20)),
    'max_features': hp.uniform('max_features', 0.65, 0.95),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(3, 20)),
    'min_samples_split': hp.choice('min_samples_split', range(10, 35))
}

def objective(params, X_train_sub, y_train, X_test_sub, y_test):
    classifier = DecisionTreeClassifier(**params)
    classifier.fit(X_train_sub, y_train)
    preds = classifier.predict(X_test_sub)
    score = f1_score(y_test, preds)
    return {'loss': -score, 'status': STATUS_OK}

best_f1_global = float('-inf')
best_features_global = None
best_hyperparameters_global = None

# Apply RFE to get the ranking of features
estimator = DecisionTreeClassifier()
selector = RFE(estimator, n_features_to_select=1, step=1)
selector = selector.fit(X_train, y_train)
rankings = selector.ranking_
features = selector.feature_names_in_

# Sort features by their ranking
sorted_features = [f for _, f in sorted(zip(rankings, features))]

# Iterate over features based on their ranking
for i in range(1, len(sorted_features) + 1):
    selected_features = sorted_features[:i]

    # Bayesian optimization for the current feature combination
    trials = Trials()
    best_hyperparams = fmin(fn=lambda params: objective(params, X_train[selected_features], y_train, X_test[selected_features], y_test),
                            space=space, algo=tpe.suggest, max_evals=50, trials=trials, verbose=False)

    # Check the best F1 score for the current feature combination
    best_f1_current = -trials.best_trial['result']['loss']

    if best_f1_current > best_f1_global:
        best_f1_global = best_f1_current
        best_features_global = selected_features
        best_hyperparameters_global = best_hyperparams

print("Best F1 Score:", best_f1_global)
print("Best Features:", best_features_global)
print("Best Hyperparameters:", best_hyperparameters_global)


Best F1 Score: 0.787878787878788
Best Features: ['Speed', 'value_1_5', 'amount_rain_std', 'value_1_2', 'value_1', 'value_1_3', 'WaterHeight_5', 'amount_rain_mean_1', 'value_0_2', 'Length_1', 'WaterHeight_2', 'amount_rain_mean', 'Height', 'Speed_3', 'amount_rain_std_4', 'value_0_1', 'value_1_1', 'amount_rain_std_2', 'value_0_5', 'Height_1', 'amount_rain_std_5', 'value_0', 'amount_rain_mean_3', 'WaterHeight_1', 'Height_5', 'value_0_4', 'WaterHeight', 'amount_rain_std_1', 'amount_rain_mean_2', 'section', 'WaterHeight_4', 'amount_rain_std_3', 'Flow_2', 'Speed_4', 'amount_rain_mean_5', 'perimeter_1', 'Length_2', 'Length_5', 'value_1_4', 'section_2', 'Length', 'Speed_1', 'Length_4', 'value_0_3', 'amount_rain_mean_4', 'Length_3', 'Flow', 'WaterHeight_3', 'perimeter', 'Speed_2', 'cleaning_applied_0_3', 'Speed_5', 'width_1', 'cleaning_applied_0']
Best Hyperparameters: {'max_depth': 9, 'max_features': 0.6705238207004262, 'min_samples_leaf': 0, 'min_samples_split': 12}


In [13]:
mapped_best = {
    'max_depth': best_hyperparameters_global['max_depth'] + 3,  # because range starts from 3
    'max_features': best_hyperparameters_global['max_features'],
    'min_samples_leaf': best_hyperparameters_global['min_samples_leaf'] + 3,  # because range starts from 3
    'min_samples_split': best_hyperparameters_global['min_samples_split'] + 10  # because range starts from 10
}

In [14]:
best_classifier = DecisionTreeClassifier(**{'max_depth': 16, 'max_features': 0.9360168327534356, 'min_samples_leaf': 5, 'min_samples_split': 11, 'random_state': 4})
best_classifier.fit(X_train[best_features_global], y_train)

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Predict using the best model
y_pred_test_expanded = best_classifier.predict(X_test[best_features_global])
y_prob_test_expanded = best_classifier.predict_proba(X_test[best_features_global])[:, 1]

# Calculate the performance metrics for the test set
test_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_test_expanded),
    'Precision': precision_score(y_test, y_pred_test_expanded),
    'Recall': recall_score(y_test, y_pred_test_expanded),
    'F1 Score': f1_score(y_test, y_pred_test_expanded),
    'ROC-AUC': roc_auc_score(y_test, y_prob_test_expanded)
}

test_metrics

{'Accuracy': 0.6387434554973822,
 'Precision': 0.5757575757575758,
 'Recall': 0.4810126582278481,
 'F1 Score': 0.5241379310344828,
 'ROC-AUC': 0.7137771247739602}

```
['amount_rain_std', 'value_1_5', 'value_1_3', 'value_1_2', 'altmax', 'value_1', 'Velocitat', 'amount_rain_mean_1', 'Longitud_1']

{'max_depth': 16, 'max_features': 0.9360168327534356, 'min_samples_leaf': 5, 'min_samples_split': 11}

{'Accuracy': 0.774869109947644,
 'Precision': 0.725,
 'Recall': 0.7341772151898734,
 'F1 Score': 0.7295597484276729,
 'ROC-AUC': 0.8261188969258589}
 ```

In [16]:
feature_importance_dict = dict(zip(best_features_global, best_classifier.feature_importances_))

# Sort the dictionary based on importances
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

sorted_feature_importance

[('value_1_5', 0.17584661274412683),
 ('value_1_2', 0.06956306503854688),
 ('value_1_3', 0.06052757184879837),
 ('value_1_1', 0.04662567350461004),
 ('Height', 0.03996996385755103),
 ('amount_rain_mean', 0.03949048100729776),
 ('value_0_2', 0.03488305887582792),
 ('Speed_1', 0.029088090009819666),
 ('Length_1', 0.028185133376082146),
 ('WaterHeight_2', 0.026476977955074872),
 ('amount_rain_std_1', 0.026291712303949926),
 ('Height_5', 0.024660908395500027),
 ('Speed', 0.024478024592347058),
 ('value_0_5', 0.02352228996297853),
 ('value_1', 0.021090348268464346),
 ('amount_rain_mean_2', 0.020099648325705207),
 ('Length', 0.0179686906173269),
 ('value_0_1', 0.016576157050927916),
 ('amount_rain_mean_5', 0.016203884995347095),
 ('Speed_2', 0.01609756901960785),
 ('value_1_4', 0.016061429125964366),
 ('Speed_4', 0.015400717975520308),
 ('amount_rain_std', 0.015321389411004882),
 ('value_0_3', 0.014594775090337003),
 ('Length_2', 0.013672034718322934),
 ('WaterHeight_5', 0.012123307134006304