## City Event Severity Classification

##### Load csv data as pandas dataframe

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('../../COR-Flood-Project/Dados/Sistema Comando/comando_history_apiv2_clean.csv')

  df = pd.read_csv('../../COR-Flood-Project/Dados/Sistema Comando/comando_history_apiv2_clean.csv')


##### Data types

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102726 entries, 0 to 102725
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   tipo        102722 non-null  object 
 1   pop_id      102722 non-null  float64
 2   latitude    90787 non-null   float64
 3   inicio      102726 non-null  object 
 4   titulo      102710 non-null  object 
 5   fim         102726 non-null  object 
 6   aviso_id    66390 non-null   float64
 7   descricao   102660 non-null  object 
 8   informe_id  102720 non-null  float64
 9   gravidade   102712 non-null  object 
 10  id          102726 non-null  int64  
 11  longitude   90811 non-null   float64
 12  status      102726 non-null  object 
 13  bairro      61348 non-null   object 
 14  prazo       61446 non-null   object 
 15  pop_titulo  102663 non-null  object 
dtypes: float64(5), int64(1), object(10)
memory usage: 12.5+ MB


##### Missing values

In [4]:
df.isna().sum().to_frame('NAN Values')

Unnamed: 0,NAN Values
tipo,4
pop_id,4
latitude,11939
inicio,0
titulo,16
fim,0
aviso_id,36336
descricao,66
informe_id,6
gravidade,14


In [5]:
(df == 0).sum().to_frame('Zero Values').T

Unnamed: 0,tipo,pop_id,latitude,inicio,titulo,fim,aviso_id,descricao,informe_id,gravidade,id,longitude,status,bairro,prazo,pop_titulo
Zero Values,0,16,0,0,0,0,58073,0,0,0,1,0,0,0,0,0


##### Data type mapping

In [6]:
# categorical variables
'''
tipo
pop_id
aviso_id
informe_id - id
gravidade
id - id
status - constant
bairro 
prazo
pop_titulo - redundant to pop_id
day_of_week - engineered
month - engineered
hour - engineered
'''
# numerical variables
'''
latitude
longitude
duration - engineered
distance_to_center - engineered
'''

'\nlatitude\nlongitude\nduration - engineered\ndistance_to_center - engineered\n'

##### Data Preparation

In [7]:
# Drop rows with missing values for coordinates
# df = df.dropna(subset=['latitude', 'longitude'])

df['inicio'] = pd.to_datetime(df['inicio'])
df['fim'] = pd.to_datetime(df['fim'])

# Feature engineering ----------

# Extract relevant temporal features
df['day_of_week'] = df['inicio'].dt.dayofweek
df['month'] = df['inicio'].dt.month
df['hour'] = df['inicio'].dt.hour
df['duration'] = (df['fim'] - df['inicio']).dt.total_seconds()

# Feature engineering for geospatial data
latitude_ref, longitude_ref = -22.9005252, -43.1987181
df['distance_to_center'] = ((df['latitude'] - latitude_ref)**2 + (df['longitude'] - longitude_ref)**2)**0.5

# Handling missing values --------

# Filling missing values for numerical columns
df['latitude'].fillna(df['latitude'].mean(), inplace=True)
df['longitude'].fillna(df['longitude'].mean(), inplace=True)
df['distance_to_center'].fillna(df['distance_to_center'].mean(), inplace=True)
df['duration'].fillna(df['duration'].mean(), inplace=True)

# Filling missing values for categorical columns (Except for the target variable)
df['tipo'].fillna(df['prazo'].mode()[0], inplace=True)
df['pop_id'].fillna(-1, inplace=True)
df['aviso_id'].fillna(-1, inplace=True)
df['bairro'].fillna('Unknown', inplace=True)
df['prazo'].fillna(df['prazo'].mode()[0], inplace=True)

# Handling Categorical variables ---------

# Convert categorical variables to 'category' data type
# df['tipo'] = df['tipo'].astype('category')
df['pop_id'] = df['pop_id'].astype('category') # from integer
df['aviso_id'] = df['aviso_id'].astype('category') # from integer

# Encode string categorical variables into numerical format
df['tipo'] = df['tipo'].astype('category').cat.codes
df['bairro'] = df['bairro'].astype('category').cat.codes
df['prazo'] = df['prazo'].astype('category').cat.codes
df['gravidade'] = df['gravidade'].astype('category').cat.codes

# Drop rows where target viariable is missing
df.dropna(subset=['gravidade'], inplace=True)

# Feature selection based on relevance (Excluding redundant variables)
selected_features = ['latitude', 'longitude', 'duration', 'distance_to_center', 'gravidade', 'tipo', 'pop_id', 'aviso_id', 'bairro', 'prazo', 'day_of_week', 'month', 'hour']
df = df[selected_features]  # Include the target variable in the selected features

# Display the updated DataFrame
print(df.head())

    latitude  longitude  duration  distance_to_center  gravidade  tipo pop_id  \
0 -22.918343 -43.291408    7548.0            0.117066          1     1    2.0   
1 -22.918343 -43.291408    1114.0            0.117066          1     1    4.0   
2 -22.918343 -43.291408    7114.0            0.117066          1     1    4.0   
3 -22.918343 -43.291408    6600.0            0.117066          1     1    1.0   
4 -22.918343 -43.291408    5083.0            0.117066          1     1    2.0   

  aviso_id  bairro  prazo  day_of_week  month  hour  
0      0.0     353      0            0      2    12  
1      0.0     353      0            0      2    12  
2      0.0     353      0            1      2    14  
3      0.0     353      0            1      2    15  
4      0.0     353      0            1      2    15  


##### Fit and evaluate classification models on the training data

In [57]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

# Assuming 'df' is your DataFrame with the dataset
# Assuming 'gravidade' is the target variable, and other columns are predictors

# Split the data into features (X) and target variable (y)
X = df.drop('gravidade', axis=1)
y = df['gravidade']

# Apply undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.5, random_state=42)

# Display class counts
class_counts = pd.DataFrame([pd.value_counts(i) for i in [y_train.values, y_test.values]], index=['train', 'test']).T.sort_index().T
class_counts.index.name = 'class'
display(class_counts)

# Model 1: Logistic Regression
logreg_model = LogisticRegression(max_iter=5000)
logreg_model.fit(X_train, y_train)
logreg_predictions = logreg_model.predict(X_test)

# Model 2: Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

# Model 3: Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluate models
models = [logreg_model, dt_model, rf_model]
predictions = [logreg_predictions, dt_predictions, rf_predictions]
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest']

for model, pred, name in zip(models, predictions, model_names):
    accuracy = accuracy_score(y_test, pred)
    report = classification_report(y_test, pred)
    confusion_mat = confusion_matrix(y_test, pred)
    
    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", confusion_mat)
    print("\n")


Unnamed: 0_level_0,-1,0,1,2,3,4
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
train,5,9,6,6,7,9
test,9,5,8,8,7,5


--- Logistic Regression ---
Accuracy: 0.3571
Classification Report:
               precision    recall  f1-score   support

          -1       0.80      0.44      0.57         9
           0       0.00      0.00      0.00         5
           1       0.50      0.38      0.43         8
           2       0.44      0.50      0.47         8
           3       0.33      0.14      0.20         7
           4       0.21      0.60      0.32         5

    accuracy                           0.36        42
   macro avg       0.38      0.34      0.33        42
weighted avg       0.43      0.36      0.36        42

Confusion Matrix:
 [[4 0 0 0 0 5]
 [0 0 1 3 0 1]
 [1 0 3 1 1 2]
 [0 4 0 4 0 0]
 [0 1 1 1 1 3]
 [0 0 1 0 1 3]]


--- Decision Tree ---
Accuracy: 0.4286
Classification Report:
               precision    recall  f1-score   support

          -1       1.00      0.33      0.50         9
           0       0.25      0.40      0.31         5
           1       0.40      0.25      0.31       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
