# Train model Stochastic Gradient Descent

## Prepare data

In [2]:
import pandas as pd
import pickle
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

INPUT_DATA_FOLDER = "data/source"
ALL_MERGED_DATA = "all_merged_data.csv"

MODEL_FOLDER = "model"
MODEL_FILE_NAME = "stochastic_gradient_descent_model.pkl"

FEATURES_TO_INCLUDE = [
    'day_datetime',
    'day_temp',
    'day_humidity',
    'hour_windspeed',
    'hour_conditions',
    'city',
    'event_start_hour',
    #  'event_num_regions',
    #  'event_num_alarms_24h',
    'vectors'
]
TARGET_FEATURE = 'is_alarm'

# Load dataset from a CSV file
df = pd.read_csv(f"{INPUT_DATA_FOLDER}/{ALL_MERGED_DATA}", sep=";")

# Separate the target variable from the input features
X = df[FEATURES_TO_INCLUDE]
y = df[TARGET_FEATURE]

# Convert all columns to float
X = X.apply(pd.to_numeric, errors='coerce')
y = y.apply(pd.to_numeric, errors='coerce')

# Replace NaN values with default
X.fillna(0, inplace=True)
y.fillna(0, inplace=True)

# Split the dataset into training and testing sets
tss = TimeSeriesSplit(2)
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]


## Train model

In [3]:
# Runs for quite a time
# Results in best alpha 0.001
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

sgd = SGDClassifier()

pipe = Pipeline(steps=[('sgd', sgd)])

alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5]
parameters = dict(sgd__alpha=alphas)

clf = GridSearchCV(pipe, parameters, verbose=10)

clf.fit(X, y)
print('Best alpha:', clf.best_estimator_.get_params()['sgd__alpha'])
print(); print(clf.best_estimator_.get_params()['sgd'])

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5; 1/11] START sgd__alpha=1e-05...........................................
[CV 1/5; 1/11] END ............sgd__alpha=1e-05;, score=0.474 total time=  14.3s
[CV 2/5; 1/11] START sgd__alpha=1e-05...........................................
[CV 2/5; 1/11] END ............sgd__alpha=1e-05;, score=0.748 total time=  15.4s
[CV 3/5; 1/11] START sgd__alpha=1e-05...........................................
[CV 3/5; 1/11] END ............sgd__alpha=1e-05;, score=0.252 total time=  15.9s
[CV 4/5; 1/11] START sgd__alpha=1e-05...........................................
[CV 4/5; 1/11] END ............sgd__alpha=1e-05;, score=0.251 total time=  17.2s
[CV 5/5; 1/11] START sgd__alpha=1e-05...........................................
[CV 5/5; 1/11] END ............sgd__alpha=1e-05;, score=0.748 total time=  10.9s
[CV 1/5; 2/11] START sgd__alpha=0.0001..........................................
[CV 1/5; 2/11] END ...........sgd__alpha=0.0001;

In [9]:
# Train an SGD model
model = SGDClassifier(loss='squared_error', penalty='l2', alpha=0.001, max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.800920793298842


## Save model

In [3]:
with open(f"{MODEL_FOLDER}/{MODEL_FILE_NAME}", 'wb') as f:
    pickle.dump(model, f)

## Calculate confusion matrix

In [4]:
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

Confusion Matrix:
[[52015     0]
 [12929     0]]
