In [1]:
import pandas as pd

data = pd.read_csv("test.csv",low_memory=False)
data.columns

Index(['Station_code', 'Date', 'Hour', 'Dd', 'Fh', 'Ff', 'Fx', 'T', 'T10n',
       'Td', 'Sq', 'Q', 'Dr', 'Rh', 'P', 'Vv', 'N', 'U', 'Ww', 'Ix', 'M', 'R',
       'S', 'O', 'Y', 'Incident_ID', 'Incident_Starttime', 'Incident_Endtime',
       'Incident_Duration', 'Incident_Priority', 'Service_Area',
       'Municipality', 'Damage_Type', 'LON', 'LAT', 'Incident_Endtime_Hour',
       'Incident_Duration_Hour', 'Incident_Starttime_Minute',
       'Incident_Endtime_Minute', 'Incident_Duration_Minute', 'Deployment_ID',
       'Vehicle_Type', 'Vehicle_Role', 'Fire_Station',
       'Fire_Station_Service_Status', 'Driving_Time_To_Incident', 'Vehicle'],
      dtype='object')

In [2]:

relevant_columns = [
    'Dd', 'Fh', 'Ff', 'Fx', 'Dr', 'Rh', 'P', 'Vv', 'N', 'U', 'M', 'R', 'S', 'O', 'Y',
    'Incident_Duration', 'Service_Area', 'Municipality', 'Damage_Type', 'LON', 'LAT', 'Date', 'Hour'
]

filtered_data = data[relevant_columns]

filtered_data.head()  

Unnamed: 0,Dd,Fh,Ff,Fx,Dr,Rh,P,Vv,N,U,...,O,Y,Incident_Duration,Service_Area,Municipality,Damage_Type,LON,LAT,Date,Hour
0,260,40.0,30,60,0,0,10246,57.0,8.0,93,...,0.0,0.0,,,,,,,2005-01-01,1
1,230,30.0,30,60,0,0,10244,58.0,8.0,91,...,0.0,0.0,,,,,,,2005-01-01,2
2,230,40.0,30,50,0,0,10241,40.0,1.0,94,...,0.0,0.0,,,,,,,2005-01-01,3
3,220,40.0,40,50,0,0,10239,12.0,0.0,96,...,0.0,0.0,,,,,,,2005-01-01,4
4,230,40.0,40,50,0,0,10237,14.0,3.0,97,...,0.0,0.0,,,,,,,2005-01-01,5


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Creating a binary target variable 'Incident_Occurred'
# 1 indicates an incident occurred, 0 indicates no incident
data['Incident_Occurred'] = np.where(data['Incident_ID'].notna(), 1, 0)



In [4]:

data['Incident_Occurred'] = np.where(data['Incident_Duration'].notna(), 1, 0)


incident_data = data[data['Incident_Occurred'] == 1]
non_incident_data = data[data['Incident_Occurred'] == 0]


non_incident_sample = non_incident_data.sample(n=len(incident_data), random_state=42)


balanced_data = pd.concat([incident_data, non_incident_sample])



In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report


weather_features = ['Fh','Ff', 'Fx', 'Dr', 'Rh', 'T', 'Vv', 'P', 'N', 'U']
X = balanced_data[weather_features]
y = balanced_data['Incident_Occurred']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

model.feature_importances_


array([0.19209939, 0.13989459, 0.24164725, 0.01724393, 0.02854956,
       0.10226454, 0.05787913, 0.11849309, 0.02569267, 0.07623585])

In [15]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("ROC AUC:", roc_auc)
print("\nClassification Report:\n", classification_rep)

Accuracy: 0.9031906190346333
ROC AUC: 0.9034124858656192

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.92      0.90      1800
           1       0.92      0.89      0.90      1867

    accuracy                           0.90      3667
   macro avg       0.90      0.90      0.90      3667
weighted avg       0.90      0.90      0.90      3667



In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"{name} - Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


lr_pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=42))


lr_pipeline.fit(X_train, y_train)


y_pred = lr_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print(f"Logistic Regression (Scaled) - Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")


Logistic Regression (Scaled) - Accuracy: 0.8467, ROC AUC: 0.8472


In [9]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report


xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')


xgb_classifier.fit(X_train, y_train)



In [10]:

y_pred_xgb = xgb_classifier.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_pred_xgb)
classification_rep_xgb = classification_report(y_test, y_pred_xgb)


print("XGBoost - Accuracy:", accuracy_xgb)
print("XGBoost - ROC AUC:", roc_auc_xgb)
print("\nXGBoost - Classification Report:\n", classification_rep_xgb)


XGBoost - Accuracy: 0.8748295609490047
XGBoost - ROC AUC: 0.8750220198774028

XGBoost - Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.89      0.87      1800
           1       0.89      0.86      0.88      1867

    accuracy                           0.87      3667
   macro avg       0.87      0.88      0.87      3667
weighted avg       0.88      0.87      0.87      3667



In [12]:
xgb_classifier.feature_importances_

array([0.05976155, 0.62091374, 0.05737747, 0.04804856, 0.04981291,
       0.04537668, 0.0409399 , 0.0382539 , 0.03951531], dtype=float32)

In [13]:
balanced_data.columns

Index(['Station_code', 'Date', 'Hour', 'Dd', 'Fh', 'Ff', 'Fx', 'T', 'T10n',
       'Td', 'Sq', 'Q', 'Dr', 'Rh', 'P', 'Vv', 'N', 'U', 'Ww', 'Ix', 'M', 'R',
       'S', 'O', 'Y', 'Incident_ID', 'Incident_Starttime', 'Incident_Endtime',
       'Incident_Duration', 'Incident_Priority', 'Service_Area',
       'Municipality', 'Damage_Type', 'LON', 'LAT', 'Incident_Endtime_Hour',
       'Incident_Duration_Hour', 'Incident_Starttime_Minute',
       'Incident_Endtime_Minute', 'Incident_Duration_Minute', 'Deployment_ID',
       'Vehicle_Type', 'Vehicle_Role', 'Fire_Station',
       'Fire_Station_Service_Status', 'Driving_Time_To_Incident', 'Vehicle',
       'Incident_Occurred'],
      dtype='object')