In [None]:
!python -m pip install pandas
!python -m pip install matplotlib
!python -m pip install seaborn 
!python -m pip install catboost
!python -m pip install scikit-learn

In [11]:
import pandas as pd

In [12]:
# Load the cleaned Miami accident records; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='Data/Miami_Accidents(Clean).csv')

In [13]:
# Show the column names of the loaded frame before choosing which ones to keep.
df.columns



Index(['ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
       'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Street',
       'Zipcode', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset'],
      dtype='object')

In [14]:
# Columns kept for modelling: everything in the loaded file except the 'ID'
# row identifier. The order below is the column order of the resulting frame.
columns = [
    # prediction target
    'Severity',
    # time, location and free-text description of the accident
    'Start_Time', 'End_Time',
    'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng',
    'Distance(mi)', 'Description', 'Street', 'Zipcode',
    # weather measurements
    'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)',
    'Weather_Condition',
    # nearby road-feature flags
    'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
    'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
    'Turning_Loop',
    # day / night indicator
    'Sunrise_Sunset',
]

# Keep only the listed columns.
df = df.loc[:, columns]

Prepare the data for use in the models

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Separate the target (accident severity) from the predictor columns.
y = df['Severity']
X = df.drop(columns='Severity')

# Partition the predictors by dtype: object/bool columns are one-hot encoded,
# float64/int64 columns are standardised.
# NOTE(review): read_csv is called without date parsing, so 'Start_Time' and
# 'End_Time' presumably load as object dtype and, together with the free-text
# 'Description', end up one-hot encoded as exact strings - confirm this is
# intended, since unseen timestamps/descriptions can never match a category.
categorical_cols = list(X.select_dtypes(include=['object', 'bool']).columns)
numerical_cols = list(X.select_dtypes(include=['float64', 'int64']).columns)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the categorical columns. Categories are learned from the
# training rows only; values not seen during fitting are ignored at transform
# time (handle_unknown='ignore') instead of raising.
onehot_encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train[categorical_cols])
X_train_encoded = onehot_encoder.transform(X_train[categorical_cols])
X_test_encoded = onehot_encoder.transform(X_test[categorical_cols])

# Standardise the numeric columns with statistics computed on the training rows only.
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train_scaled = scaler.transform(X_train[numerical_cols])
X_test_scaled = scaler.transform(X_test[numerical_cols])


In [16]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack


# Build the final design matrices: the sparse one-hot block followed by the
# scaled numeric block, stacked column-wise.
X_train_final = hstack((X_train_encoded, X_train_scaled))
X_test_final = hstack((X_test_encoded, X_test_scaled))

# Multinomial logistic regression over the severity classes.
# `multi_class='multinomial'` is intentionally not passed: the parameter is
# deprecated in recent scikit-learn releases, and with the lbfgs solver and
# more than two classes the default already fits a multinomial model.
# max_iter is raised from the default so lbfgs has room to converge on the
# wide one-hot feature space.
multinomial_model = LogisticRegression(solver='lbfgs', max_iter=1000)
multinomial_model.fit(X_train_final, y_train)

# Mean accuracy on the training rows and on the held-out rows.
train_score = multinomial_model.score(X_train_final, y_train)
test_score = multinomial_model.score(X_test_final, y_test)

print(f'Train Accuracy: {train_score}')
print(f'Test Accuracy: {test_score}')

# Predicted severity class for every held-out row.
predictions = multinomial_model.predict(X_test_final)


Train Accuracy: 0.9932613179228523
Test Accuracy: 0.9711263579188107


In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import hstack

# A single hand-written accident record to score with the fitted model.
# It must provide every feature column the encoder and scaler were fitted on.
# NOTE(review): the first row of `df` shows timestamps stored as
# 'YYYY-MM-DD HH:MM:SS'; the values below use a different format. If
# Start_Time/End_Time are among `categorical_cols`, they are matched as exact
# strings and will be treated as unknown categories - confirm this is intended.
new_data = {
    'Start_Time': '1/1/2021 8:00',
    'End_Time': '1/1/2021 9:00',
    'Start_Lat': 25.3453,
    'Start_Lng': -80.2354,
    'End_Lat': 25.7234,
    'End_Lng': -80.2343,
    'Distance(mi)': 0.5,
    'Description': 'At SR-953/42nd Ave/Le Jeune Rd - Accident.',
    'Street': 'I95 S',
    'Zipcode': '33179',
    'Temperature(F)': 89,
    'Wind_Chill(F)': 79.486101,
    'Humidity(%)': 78,
    'Pressure(in)': 29.94,
    'Visibility(mi)': 10,
    'Wind_Speed(mph)': 10,
    'Precipitation(in)': 0.009458,
    'Weather_Condition': 'Overcast',
    'Bump': False,
    'Crossing': False,
    'Give_Way': False,
    'Junction': False,
    'No_Exit': True,
    'Railway': False,
    'Roundabout': False,
    'Station': False,
    'Stop': False,
    'Traffic_Calming': False,
    'Traffic_Signal': False,
    'Turning_Loop': False,
    'Sunrise_Sunset': 'Day'
}

# One-row frame with the same column names as the training features.
new_data_df = pd.DataFrame([new_data])

# The boolean flags are deliberately left as real booleans. The encoder was
# fitted on the training frame's object/bool columns, so casting them to the
# strings 'True'/'False' would match none of the learned categories and, with
# handle_unknown='ignore', every flag would be silently encoded as zeros.
new_data_encoded = onehot_encoder.transform(new_data_df[categorical_cols])

# Scale the numeric features with the statistics learned on the training rows;
# the cast to float also converts numeric values that were supplied as strings.
new_data_scaled = scaler.transform(new_data_df[numerical_cols].astype(float))

# Combine encoded and scaled features in the same order used for training.
new_data_final = hstack([new_data_encoded, new_data_scaled])

predicted_severity = multinomial_model.predict(new_data_final)

print('The severity prediction is:', predicted_severity[0])


The severity prediction is: 3


In [18]:
# Display the first record as the cell's output value (no print() wrapper
# needed in a notebook), for comparison with hand-built prediction inputs.
df.iloc[0]

Severity                                                      2
Start_Time                                  2016-11-30 16:35:52
End_Time                                    2016-11-30 16:35:52
Start_Lat                                              25.78601
Start_Lng                                             -80.25809
End_Lat                                                25.78336
End_Lng                                               -80.26911
Distance(mi)                                               0.71
Description          At SR-953/42nd Ave/Le Jeune Rd - Accident.
Street                                           Dolphin Expy W
Zipcode                                                   33126
Temperature(F)                                             78.1
Wind_Chill(F)                                         78.486101
Humidity(%)                                                76.0
Pressure(in)                                              29.96
Visibility(mi)                          

In [19]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import hstack

# A hand-written record whose location, description and weather values mirror
# the first row of `df`, used as a sanity check of the fitted model.
# NOTE(review): the first row of `df` shows Start_Time/End_Time as
# '2016-11-30 16:35:52'; the timestamps below differ in both value and format.
# If those columns are among `categorical_cols`, they are matched as exact
# strings and will be treated as unknown categories - confirm this is intended.
new_data = {
    'Start_Time': '11/30/2016 16:40:31',
    'End_Time': '11/30/2016 17:10:19',
    'Start_Lat': 25.78601,
    'Start_Lng': -80.25809,
    'End_Lat': 25.78336,
    'End_Lng': -80.26911,
    'Distance(mi)': 0.71,
    'Description': 'At SR-953/42nd Ave/Le Jeune Rd - Accident.',
    'Street': 'Dolphin Expy W',
    'Zipcode': '33126',
    'Temperature(F)': 78.1,
    'Wind_Chill(F)': 78.486101,
    'Humidity(%)': 76,
    'Pressure(in)': 29.96,
    'Visibility(mi)': 10,
    'Wind_Speed(mph)': 11.5,
    'Precipitation(in)': 0.009293,
    'Weather_Condition': 'Mostly Cloudy',
    'Bump': False,
    'Crossing': False,
    'Give_Way': False,
    'Junction': False,
    'No_Exit': False,
    'Railway': False,
    'Roundabout': False,
    'Station': False,
    'Stop': False,
    'Traffic_Calming': False,
    'Traffic_Signal': False,
    'Turning_Loop': False,
    'Sunrise_Sunset': 'Day'
}

# One-row frame with the same column names as the training features.
new_data_df = pd.DataFrame([new_data])

# Encode the categorical features. The boolean flags are deliberately left as
# real booleans: the encoder was fitted on the training frame's object/bool
# columns, so casting them to the strings 'True'/'False' would match none of
# the learned categories and, with handle_unknown='ignore', every flag would
# be silently encoded as zeros.
new_data_encoded = onehot_encoder.transform(new_data_df[categorical_cols])

# Scale the numeric features. `numerical_cols` was derived from the predictors
# only, so it does not include the 'Severity' target. The cast to float also
# converts numeric values that were supplied as strings.
new_data_scaled = scaler.transform(new_data_df[numerical_cols].astype(float))

# Combine encoded and scaled features in the same order used for training.
new_data_final = hstack([new_data_encoded, new_data_scaled])

predicted_severity = multinomial_model.predict(new_data_final)

print('The severity prediction is:', predicted_severity[0])


The severity prediction is: 3
