# Train a LogisticRegression model

## Prepare data

In [5]:
import os
import pickle

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
# --- Input: location of the merged dataset (read as a ';'-separated CSV) ---
INPUT_DATA_FOLDER = "data/source"
ALL_MERGED_DATA = "all_merged_data.csv"

# --- Output: where the pickled model is written ---
MODEL_FOLDER = "model"
MODEL_FILE_NAME = "logistic_regression_model.pkl"

# --- Columns used for modelling ---
# Kept as a list because it is used to select columns from the DataFrame.
# Currently disabled candidates: 'event_start_hour', 'vectors'
FEATURES_TO_INCLUDE = [
    "city_resolvedAddress",
    "day_temp",
    "day_humidity",
    "hour_windspeed",
    "hour_conditions",
    "city",
    "num_regions",
    "hours_last_day",
]

# Column the model is trained to predict
TARGET_FEATURE = "is_alarm"

# Load the merged dataset and order it chronologically by the hourly epoch index.
# Method chaining is used instead of inplace=True so no frame is mutated in place.
df = (
    pd.read_csv(f"{INPUT_DATA_FOLDER}/{ALL_MERGED_DATA}", sep=";")
    .set_index('hour_datetimeEpoch')
    .sort_index()
)

# Separate the input features from the target variable
X_raw = df[FEATURES_TO_INCLUDE]

# Convert every feature column to a number; values that cannot be parsed become NaN
X_numeric = X_raw.apply(pd.to_numeric, errors='coerce')

# Surface features that held data but lost ALL of it during coercion: after the
# fillna(0) below such a column is a constant 0 and carries no information.
# NOTE(review): in the displayed sample city_resolvedAddress, hour_conditions and
# city are all 0.0 - presumably text columns that need an explicit encoding; confirm.
coerced_away = [
    column for column in X_raw.columns
    if X_raw[column].notna().any() and X_numeric[column].isna().all()
]
if coerced_away:
    print(f"WARNING: non-numeric features coerced to NaN and filled with 0: {coerced_away}")

# Replace NaN values with the default (0)
X = X_numeric.fillna(0)

# Same conversion for the target, done with the vectorized call on the whole Series
y = pd.to_numeric(df[TARGET_FEATURE], errors='coerce').fillna(0)

# Split chronologically: take the last of the two TimeSeriesSplit folds, so the
# training rows are the earliest rows and the test rows are the most recent ones.
tss = TimeSeriesSplit(n_splits=2)
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Sanity checks on the split
assert len(X_train) + len(X_test) == len(X), "last fold should cover every row"
assert X_train.index.max() <= X_test.index.min(), "training rows must not be later than test rows"

# Disabled alternatives kept for reference:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train[X_train['vectors'] > 0.0].head(3)
X_train.head(3)


Unnamed: 0_level_0,city_resolvedAddress,day_temp,day_humidity,hour_windspeed,hour_conditions,city,num_regions,hours_last_day
hour_datetimeEpoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1645653600,0.0,2.6,83.7,15.5,0.0,0.0,0.0,0.0
1645653600,0.0,3.3,80.8,10.8,0.0,0.0,0.0,0.0
1645653600,0.0,4.8,68.3,3.2,0.0,0.0,0.0,0.0


## Train model

In [6]:
# Hyperparameter search, left disabled because it takes a long time to run.
# It resulted in best C = 0.00014563484775012445 and best penalty l2 (which is the default).
# NOTE(review): the search below is fitted on the full X, y (which includes the rows
# later used as X_test) and GridSearchCV is given no `cv` argument - confirm whether
# it should use X_train / y_train and a time-aware splitter such as TimeSeriesSplit.
# NOTE(review): 'l1' is part of the grid but LogisticRegression() is created with
# default arguments - verify that the default solver supports the l1 penalty.
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV
# import numpy as np

# logistic = LogisticRegression()

# pipe = Pipeline(steps=[('logistic', logistic)])

# C = np.logspace(-4, 4, 50)
# penalty = ['l1', 'l2']
# parameters = dict(logistic__C=C,
#                     logistic__penalty=penalty)

# clf = GridSearchCV(pipe, parameters, verbose=10)

# clf.fit(X, y)
# print('Best Penalty:', clf.best_estimator_.get_params()['logistic__penalty'])
# print('Best C:', clf.best_estimator_.get_params()['logistic__C'])
# print(); print(clf.best_estimator_.get_params()['logistic'])

In [7]:
# Value of C passed to LogisticRegression; this is the best C reported by the
# disabled grid-search cell.
best_c = 0.00014563484775012445

# Build the classifier and fit it on the training split in one step
# (fit() returns the estimator itself).
model = LogisticRegression(C=best_c).fit(X_train, y_train)

# Score the fitted model on the held-out split and report it
score = model.score(X_test, y_test)
print(f"Accuracy: {score}")


Accuracy: 0.9176901352948903


## Save model

In [8]:
# Make sure the output folder exists; open(..., 'wb') raises FileNotFoundError
# when the parent directory is missing (e.g. on a fresh checkout).
os.makedirs(MODEL_FOLDER, exist_ok=True)

# Serialize the fitted estimator to disk.
# NOTE: pickle files must only be loaded back from trusted sources.
with open(f"{MODEL_FOLDER}/{MODEL_FILE_NAME}", 'wb') as model_file:
    pickle.dump(model, model_file)

## Calculate confusion matrix

In [9]:
from sklearn.metrics import confusion_matrix

# Predict labels for the held-out split and tabulate them against the true labels.
# In the resulting matrix, rows are the true classes and columns the predicted ones.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Heading and matrix on separate lines
print('Confusion Matrix:', cm, sep='\n')

Confusion Matrix:
[[50033   686]
 [ 4327  5858]]
