# Train a Decision Tree model

## Prepare data

In [2]:
import os
import pickle
import warnings

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Location of the merged source dataset (read below as a ';'-separated CSV).
INPUT_DATA_FOLDER = "data/source"
ALL_MERGED_DATA = "all_merged_data.csv"

# Destination of the pickled model.
MODEL_FOLDER = "model"
MODEL_FILE_NAME = "decision_trees_model.pkl"

# Columns used as model inputs; the commented-out entries are currently excluded.
FEATURES_TO_INCLUDE = [
    'city_resolvedAddress',
    'day_temp',
    'day_humidity',
    'hour_windspeed',
    'hour_conditions',
    'event_start_hour',
    # 'event_num_regions',
    # 'event_num_alarms_24h',
    # 'vectors'
]
TARGET_FEATURE = 'is_alarm'

# Load dataset from a CSV file
df = pd.read_csv(f"{INPUT_DATA_FOLDER}/{ALL_MERGED_DATA}", sep=";")

# Separate the target variable from the input features
features_raw = df[FEATURES_TO_INCLUDE]
target_raw = df[TARGET_FEATURE]

# Coerce everything to numeric; values that cannot be parsed become NaN.
# The target is a single Series, so use the vectorised call instead of an
# element-by-element apply.
X = features_raw.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(target_raw, errors='coerce')

# Coercion followed by fillna(0) silently replaces every non-numeric value
# (e.g. text labels) with 0. Report how many originally present values each
# feature lost so the data loss is visible instead of hidden.
# NOTE(review): if the text-valued columns are meant to contribute to the
# model they need an explicit encoding; the values fed to the model are left
# unchanged here because any consumer of the saved model would have to apply
# the same encoding — confirm before changing.
coercion_losses = (features_raw.notna() & X.isna()).sum()
lossy_columns = coercion_losses[coercion_losses > 0]
if not lossy_columns.empty:
    warnings.warn(
        "pd.to_numeric(errors='coerce') discarded non-numeric feature values, "
        "which are then filled with 0 (lost values per column): "
        f"{lossy_columns.to_dict()}"
    )

# Replace NaN values with default (new objects rather than in-place mutation,
# so re-running the cell is idempotent).
X = X.fillna(0)
y = y.fillna(0)

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head(3)


Unnamed: 0,city_resolvedAddress,day_temp,day_humidity,hour_windspeed,hour_conditions,event_start_hour
12714,0.0,23.1,64.5,11.2,0.0,0.0
130784,0.0,5.6,56.6,21.6,0.0,0.0
125855,0.0,10.1,93.4,15.5,0.0,0.0


## Train model

In [3]:
# Fit a decision tree with no depth limit on the training split; the fixed
# random_state keeps the fitted tree reproducible between runs.
model = DecisionTreeClassifier(random_state=42, max_depth=None).fit(X_train, y_train)

# Predict on the held-out split and report the fraction of correct predictions.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.7905920394179691


## Save model

In [4]:
# open(..., 'wb') creates the file but not missing parent directories, so make
# sure the output folder exists before writing (no-op when it already does).
os.makedirs(MODEL_FOLDER, exist_ok=True)

# Serialize the fitted model to disk with pickle.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
with open(f"{MODEL_FOLDER}/{MODEL_FILE_NAME}", 'wb') as f:
    pickle.dump(model, f)

## Calculate confusion matrix

In [5]:
# Confusion matrix for the test split, reusing the y_pred computed when the
# model was evaluated (predicting again with the same model and X_test would
# give the same values).
# Rows correspond to the true labels and columns to the predicted labels.
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

Confusion Matrix:
[[25358  3778]
 [ 4382  5449]]
