## City Event Severity Linear Regression

##### Load csv data as pandas dataframe

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('../../COR-Flood-Project/Dados/Sistema Comando/comando_history_apiv2_clean.csv')

  df = pd.read_csv('../../COR-Flood-Project/Dados/Sistema Comando/comando_history_apiv2_clean.csv')


##### Data types

In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102726 entries, 0 to 102725
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   tipo        102722 non-null  object 
 1   pop_id      102722 non-null  float64
 2   latitude    90787 non-null   float64
 3   inicio      102726 non-null  object 
 4   titulo      102710 non-null  object 
 5   fim         102726 non-null  object 
 6   aviso_id    66390 non-null   float64
 7   descricao   102660 non-null  object 
 8   informe_id  102720 non-null  float64
 9   gravidade   102712 non-null  object 
 10  id          102726 non-null  int64  
 11  longitude   90811 non-null   float64
 12  status      102726 non-null  object 
 13  bairro      61348 non-null   object 
 14  prazo       61446 non-null   object 
 15  pop_titulo  102663 non-null  object 
dtypes: float64(5), int64(1), object(10)
memory usage: 12.5+ MB


In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102726 entries, 0 to 102725
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   latitude            102726 non-null  float64 
 1   longitude           102726 non-null  float64 
 2   duration            102726 non-null  float64 
 3   distance_to_center  102726 non-null  float64 
 4   gravidade           102726 non-null  int8    
 5   tipo                102726 non-null  int8    
 6   pop_id              102726 non-null  category
 7   aviso_id            102726 non-null  category
 8   bairro              102726 non-null  int16   
 9   prazo               102726 non-null  int8    
 10  day_of_week         102726 non-null  int32   
 11  month               102726 non-null  int32   
 12  hour                102726 non-null  int32   
dtypes: category(2), float64(4), int16(1), int32(3), int8(3)
memory usage: 5.4 MB


##### Missing values

In [90]:
df.isna().sum().to_frame('NAN Values')

Unnamed: 0,NAN Values
tipo,4
pop_id,4
latitude,11939
inicio,0
titulo,16
fim,0
aviso_id,36336
descricao,66
informe_id,6
gravidade,14


In [92]:
(df == 0).sum().to_frame('Zero Values').T

Unnamed: 0,tipo,pop_id,latitude,inicio,titulo,fim,aviso_id,descricao,informe_id,gravidade,id,longitude,status,bairro,prazo,pop_titulo
Zero Values,0,16,0,0,0,0,58073,0,0,0,1,0,0,0,0,0


##### Data type mapping

In [96]:
# categorical variables
'''
tipo
pop_id
aviso_id
informe_id - id
gravidade
id - id
status - constant
bairro 
prazo
pop_titulo - redundant to pop_id
day_of_week - engineered
month - engineered
hour - engineered
'''
# numerical variables
'''
latitude
longitude
duration - engineered
distance_to_center - engineered
'''

'\nlatitude\nlongitude\nduration - engineered\ndistance_to_center - engineered\n'

##### Data Preparation

In [97]:
# Drop rows with missing values for coordinates
# df = df.dropna(subset=['latitude', 'longitude'])

df['inicio'] = pd.to_datetime(df['inicio'])
df['fim'] = pd.to_datetime(df['fim'])

# Feature engineering ----------

# Extract relevant temporal features
df['day_of_week'] = df['inicio'].dt.dayofweek
df['month'] = df['inicio'].dt.month
df['hour'] = df['inicio'].dt.hour
df['duration'] = (df['fim'] - df['inicio']).dt.total_seconds()

# Feature engineering for geospatial data
latitude_ref, longitude_ref = -22.9005252, -43.1987181
df['distance_to_center'] = ((df['latitude'] - latitude_ref)**2 + (df['longitude'] - longitude_ref)**2)**0.5

# Handling missing values --------

# Filling missing values for numerical columns
df['latitude'].fillna(df['latitude'].mean(), inplace=True)
df['longitude'].fillna(df['longitude'].mean(), inplace=True)
df['distance_to_center'].fillna(df['distance_to_center'].mean(), inplace=True)
df['duration'].fillna(df['duration'].mean(), inplace=True)

# Filling missing values for categorical columns (Except for the target variable)
df['tipo'].fillna(df['prazo'].mode()[0], inplace=True)
df['pop_id'].fillna(-1, inplace=True)
df['aviso_id'].fillna(-1, inplace=True)
df['bairro'].fillna('Unknown', inplace=True)
df['prazo'].fillna(df['prazo'].mode()[0], inplace=True)
# df['gravidade'].fillna(df['gravidade'].mode()[0], inplace=True) # target variable

# Handling Categorical variables ---------

# Convert categorical variables to 'category' data type
# df['tipo'] = df['tipo'].astype('category')
df['pop_id'] = df['pop_id'].astype('category') # from integer
df['aviso_id'] = df['aviso_id'].astype('category') # from integer
# df['bairro'] = df['tipo'].astype('category')
# df['prazo'] = df['prazo'].astype('category')
# df['gravidade'] = df['gravidade'].astype('category') # target variable

# Encode string categorical variables into numerical format
df['tipo'] = df['tipo'].astype('category').cat.codes
df['bairro'] = df['bairro'].astype('category').cat.codes
df['prazo'] = df['prazo'].astype('category').cat.codes
df['gravidade'] = df['gravidade'].astype('category').cat.codes

# Drop rows where target viariable is missing
df.dropna(subset=['gravidade'], inplace=True)

# Feature selection based on relevance (Excluding redundant variables)
selected_features = ['latitude', 'longitude', 'duration', 'distance_to_center', 'gravidade', 'tipo', 'pop_id', 'aviso_id', 'bairro', 'prazo', 'day_of_week', 'month', 'hour']
df = df[selected_features]  # Include the target variable in the selected features

# Display the updated DataFrame
print(df.head())

    latitude  longitude  duration  distance_to_center  gravidade  tipo pop_id  \
0 -22.918343 -43.291408    7548.0            0.117066          1     1    2.0   
1 -22.918343 -43.291408    1114.0            0.117066          1     1    4.0   
2 -22.918343 -43.291408    7114.0            0.117066          1     1    4.0   
3 -22.918343 -43.291408    6600.0            0.117066          1     1    1.0   
4 -22.918343 -43.291408    5083.0            0.117066          1     1    2.0   

  aviso_id  bairro  prazo  day_of_week  month  hour  
0      0.0     353      0            0      2    12  
1      0.0     353      0            0      2    12  
2      0.0     353      0            1      2    14  
3      0.0     353      0            1      2    15  
4      0.0     353      0            1      2    15  


##### Fit regression models on the training data

In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

# Assuming 'gravidade' is the target variable, and other relevant features are predictors
X = df.drop(['gravidade'], axis=1)
y = df['gravidade']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Simple Linear Regression (Baseline Model)
simple_model = LinearRegression()
simple_model.fit(X_train, y_train)

In [99]:
# 2. Polynomial Regression (Non-linear Effects)
poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_model.fit(X_train, y_train)

In [100]:
# 3. Regularized Regression (Lasso Regression)
lasso_model = Lasso(alpha=0.01)  # Adjust alpha for regularization strength
lasso_model.fit(X_train, y_train)

##### Evaluate models' predictions on the test data

In [101]:
# Evaluate the models on the test set
simple_predictions = simple_model.predict(X_test)
poly_predictions = poly_model.predict(X_test)
lasso_predictions = lasso_model.predict(X_test)

# Calculate evaluation metrics
simple_mse = mean_squared_error(y_test, simple_predictions)
poly_mse = mean_squared_error(y_test, poly_predictions)
lasso_mse = mean_squared_error(y_test, lasso_predictions)

simple_r2 = r2_score(y_test, simple_predictions)
poly_r2 = r2_score(y_test, poly_predictions)
lasso_r2 = r2_score(y_test, lasso_predictions)

# Print results
print("**Results of Training Three Linear Regression Models:**\n")

# 1. Simple Linear Regression
print("1. Simple Linear Regression (Baseline Model):")
print(f"Coefficients: {simple_model.coef_}")
print(f"Intercept: {simple_model.intercept_}")
print(f"Mean Squared Error (MSE): {simple_mse}")
print(f"R-squared: {simple_r2}\n")

# 2. Polynomial Regression
print("2. Polynomial Regression (Non-linear Effects):")
print(f"Mean Squared Error (MSE): {poly_mse}")
print(f"R-squared: {poly_r2}\n")

# 3. Regularized Regression (Lasso Regression)
print("3. Regularized Regression (Lasso Regression):")
print(f"Coefficients after Regularization: {lasso_model.coef_}")
print(f"Mean Squared Error (MSE): {lasso_mse}")
print(f"R-squared: {lasso_r2}\n")

**Results of Training Three Linear Regression Models:**

1. Simple Linear Regression (Baseline Model):
Coefficients: [ 3.25792891e-02  3.89117175e-01  6.44087821e-08  2.71994965e-01
 -1.36426935e-01  2.17265623e-03  1.14163703e-06 -7.85828896e-05
  3.65065963e-02 -7.97387473e-03 -2.83566924e-03 -9.82238400e-04]
Intercept: 18.868882570252648
Mean Squared Error (MSE): 0.2195502672371177
R-squared: 0.005581851167200114

2. Polynomial Regression (Non-linear Effects):
Mean Squared Error (MSE): 0.2201560392249258
R-squared: 0.0028381029298987226

3. Regularized Regression (Lasso Regression):
Coefficients after Regularization: [ 0.00000000e+00  0.00000000e+00 -5.45429613e-09 -0.00000000e+00
 -0.00000000e+00  2.11496329e-03  1.17887300e-06 -6.81035133e-05
  0.00000000e+00 -5.10791725e-03 -2.01050048e-03 -5.85253565e-04]
Mean Squared Error (MSE): 0.22005287026309325
R-squared: 0.00330539039592459



##### Classification metrics

In [104]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Round predictions for classification
simple_class_predictions = simple_predictions.round().astype(int)
poly_class_predictions = poly_predictions.round().astype(int)
lasso_class_predictions = lasso_predictions.round().astype(int)

# Evaluate classification metrics
simple_accuracy = accuracy_score(y_test, simple_class_predictions)
poly_accuracy = accuracy_score(y_test, poly_class_predictions)
lasso_accuracy = accuracy_score(y_test, lasso_class_predictions)

# In multiclass classification, precision, recall, and f1_score can be calculated per class (average='weighted' is common)
simple_precision = precision_score(y_test, simple_class_predictions, average='weighted')
poly_precision = precision_score(y_test, poly_class_predictions, average='weighted')
lasso_precision = precision_score(y_test, lasso_class_predictions, average='weighted')

simple_recall = recall_score(y_test, simple_class_predictions, average='weighted')
poly_recall = recall_score(y_test, poly_class_predictions, average='weighted')
lasso_recall = recall_score(y_test, lasso_class_predictions, average='weighted')

simple_f1 = f1_score(y_test, simple_class_predictions, average='weighted')
poly_f1 = f1_score(y_test, poly_class_predictions, average='weighted')
lasso_f1 = f1_score(y_test, lasso_class_predictions, average='weighted')

# Confusion matrices
simple_conf_matrix = confusion_matrix(y_test, simple_class_predictions)
poly_conf_matrix = confusion_matrix(y_test, poly_class_predictions)
lasso_conf_matrix = confusion_matrix(y_test, lasso_class_predictions)

# Print classification results
print("**Results of Training Three Linear Regression Models (Classification Metrics):**\n")

# 1. Simple Linear Regression
print("1. Simple Linear Regression (Baseline Model):")
print(f"Accuracy: {simple_accuracy}")
print(f"Precision: {simple_precision}")
print(f"Recall: {simple_recall}")
print(f"F1 Score: {simple_f1}")
print(f"Confusion Matrix:\n{simple_conf_matrix}\n")

# 2. Polynomial Regression
print("2. Polynomial Regression (Non-linear Effects):")
print(f"Accuracy: {poly_accuracy}")
print(f"Precision: {poly_precision}")
print(f"Recall: {poly_recall}")
print(f"F1 Score: {poly_f1}")
print(f"Confusion Matrix:\n{poly_conf_matrix}\n")

# 3. Regularized Regression (Lasso Regression)
print("3. Regularized Regression (Lasso Regression):")
print(f"Accuracy: {lasso_accuracy}")
print(f"Precision: {lasso_precision}")
print(f"Recall: {lasso_recall}")
print(f"F1 Score: {lasso_f1}")
print(f"Confusion Matrix:\n{lasso_conf_matrix}\n")


**Results of Training Three Linear Regression Models (Classification Metrics):**

1. Simple Linear Regression (Baseline Model):
Accuracy: 0.9406697167331841
Precision: 0.8848595159788888
Recall: 0.9406697167331841
F1 Score: 0.9119114997769041
Confusion Matrix:
[[    0     0     3     0     0     0]
 [    0     0    46     0     0     0]
 [    0     0 19327     0     0     0]
 [    0     0     5     0     0     0]
 [    0     0  1151     0     0     0]
 [    0     0    14     0     0     0]]

2. Polynomial Regression (Non-linear Effects):
Accuracy: 0.9389662221356955
Precision: 0.8850231736267538
Recall: 0.9389662221356955
F1 Score: 0.9111970363133454
Confusion Matrix:
[[    0     0     3     0     0     0]
 [    0     0    44     2     0     0]
 [    0     2 19292    32     0     1]
 [    0     0     5     0     0     0]
 [    0     0  1147     4     0     0]
 [    0     0    14     0     0     0]]

3. Regularized Regression (Lasso Regression):
Accuracy: 0.9406697167331841
Precision: 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
