In [1]:
!python -m pip install pandas
!python -m pip install scikit-learn




[notice] A new release of pip available: 22.3 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





Data preparation: load the cleaned accident data for modelling

In [2]:
import pandas as pd
# Load the cleaned Miami accidents table and discard the 'ID' column in one step.
df = pd.read_csv("Data/Miami_Accidents(Clean).csv").drop(columns='ID')


Model: logistic regression pipeline predicting Severity

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Build, train, and evaluate a Severity classifier on the accident table `df`.
#
# Steps: declare the feature groups, wire them into a ColumnTransformer,
# chain that with a logistic regression, split 80/20, fit, and report metrics.

# Feature groups consumed by the preprocessor. Any column of `df` that is not
# listed in one of these groups is discarded (remainder='drop' below).
numerical_cols = [
    'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)',
    'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)',
]

categorical_cols = ['Street', 'Zipcode', 'Weather_Condition', 'Sunrise_Sunset']

# Kept as a bare string (not a list): CountVectorizer expects a 1-D sequence
# of documents, which is what a scalar column selector hands it.
text_col = 'Description'

# Per-group preprocessing.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # zero-mean / unit-variance scaling
        # handle_unknown='ignore': categories unseen during fit encode as all zeros
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('txt', CountVectorizer(), text_col),  # sparse bag-of-words counts
    ],
    remainder='drop',  # columns not named above never reach the classifier
)

# Preprocessing + classifier in a single estimator, so predict() on raw rows
# applies exactly the transformations learned during fit().
# `multi_class` is intentionally not passed: the argument is deprecated in
# scikit-learn 1.5+, and the default solver already fits a multinomial model
# when the target has more than two classes.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Features are everything except the target column.
X = df.drop(columns='Severity')
y = df['Severity']

# Hold out 20% of the rows for evaluation; random_state fixes the split.
# NOTE(review): the split is not stratified. If Severity is heavily imbalanced,
# rare classes may get very few test rows — consider stratify=y after confirming
# every class has at least two samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit preprocessing and classifier on the training split only.
pipeline.fit(X_train, y_train)

# Score the held-out split: overall accuracy plus per-class precision/recall/F1.
predictions = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))


Accuracy: 0.9790900922976394
Classification Report:
               precision    recall  f1-score   support

           1       0.29      0.29      0.29         7
           2       0.98      1.00      0.99     23641
           3       0.60      0.25      0.36       512
           4       0.99      0.90      0.94       326

    accuracy                           0.98     24486
   macro avg       0.71      0.61      0.64     24486
weighted avg       0.97      0.98      0.98     24486



Confusion matrix

In [4]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict the held-out split again, then show the per-class metrics followed
# by the raw confusion counts (rows = true class, columns = predicted class).
y_pred = pipeline.predict(X_test)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print(class_report)
print("Confusion matrix:\n", conf_matrix)

              precision    recall  f1-score   support

           1       0.29      0.29      0.29         7
           2       0.98      1.00      0.99     23641
           3       0.60      0.25      0.36       512
           4       0.99      0.90      0.94       326

    accuracy                           0.98     24486
   macro avg       0.71      0.61      0.64     24486
weighted avg       0.97      0.98      0.98     24486

Confusion matrix:
 [[    2     2     3     0]
 [    5 23548    85     3]
 [    0   382   130     0]
 [    0    32     0   294]]


Predictions

1.

In [5]:
# Print the row at position 1225; its Severity is the reference value for the
# hand-built prediction example that follows.
print(df.iloc[1225, :])

Severity                                                             4
Start_Time                                         2016-10-17 23:23:59
End_Time                                           2016-10-17 23:23:59
Start_Lat                                                    25.785738
Start_Lng                                                   -80.175632
End_Lat                                                       25.78575
End_Lng                                                      -80.17564
Distance(mi)                                                     0.001
Description          Closed at Trooper Robert G Smith Brg - Road cl...
Street                                                MacArthur Cswy W
Zipcode                                                          33132
Temperature(F)                                                    80.1
Wind_Chill(F)                                                78.486101
Humidity(%)                                                       64.0
Pressu

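Expected prediction: 4, the Severity of row 1225 shown above. Note that this row comes from the same dataset that was split for training, so the model may already have seen it during fitting.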

In [6]:
import pandas as pd

# One-row input built by hand from the values of df.iloc[1225]; every value is
# wrapped in a list so pd.DataFrame produces a single-row frame.
new_data = {
    # timing
    'Start_Time': ['2016-10-17 23:23:59'], 'End_Time': ['2016-10-17 23:23:59'],
    # location
    'Start_Lat': [25.785738], 'Start_Lng': [-80.175632],
    'End_Lat': [25.78575], 'End_Lng': [-80.17564],
    'Distance(mi)': [0.001],
    'Description': ['Closed at Trooper Robert G Smith Brg - Road closed due to accident.'],
    'Street': ['MacArthur Cswy W'],
    # NOTE(review): Zipcode is given as a string — confirm the training column
    # has the same dtype; an unmatched category is encoded as all zeros.
    'Zipcode': ['33132'],
    # weather
    'Temperature(F)': [80.1], 'Wind_Chill(F)': [78.486101], 'Humidity(%)': [64.0],
    'Pressure(in)': [29.93], 'Visibility(mi)': [10.0], 'Wind_Speed(mph)': [6.9],
    'Precipitation(in)': [0.009293],
    'Weather_Condition': ['Partly Cloudy'],
    # road-feature flags
    'Bump': [False], 'Crossing': [True], 'Give_Way': [False], 'Junction': [False],
    'No_Exit': [True], 'Railway': [False], 'Roundabout': [False], 'Station': [False],
    'Stop': [False], 'Traffic_Calming': [False], 'Traffic_Signal': [False],
    'Turning_Loop': [False],
    'Sunrise_Sunset': ['Night'],
}

# Run the single-row frame through the fitted pipeline and report the class.
new_data_df = pd.DataFrame(data=new_data)
predicted_severity = pipeline.predict(new_data_df)
print('The predicted severity is:', predicted_severity[0])


The predicted severity is: 4


2.

In [7]:
# Print the first row; its Severity is the reference value for the hand-built
# prediction example that follows.
print(df.iloc[0, :])

Severity                                                      2
Start_Time                                  2016-11-30 16:35:52
End_Time                                    2016-11-30 16:35:52
Start_Lat                                              25.78601
Start_Lng                                             -80.25809
End_Lat                                                25.78336
End_Lng                                               -80.26911
Distance(mi)                                               0.71
Description          At SR-953/42nd Ave/Le Jeune Rd - Accident.
Street                                           Dolphin Expy W
Zipcode                                                   33126
Temperature(F)                                             78.1
Wind_Chill(F)                                         78.486101
Humidity(%)                                                76.0
Pressure(in)                                              29.96
Visibility(mi)                          

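Expected prediction: 2, the Severity of row 0 shown above. Note that this row comes from the same dataset that was split for training, so the model may already have seen it during fitting.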

In [8]:
import pandas as pd

# One-row input built by hand to mirror df.iloc[0]; every value is wrapped in a
# list so pd.DataFrame produces a single-row frame.
new_data = {
    # timing — these values differ from the timestamps printed for row 0, but
    # neither column is listed in the preprocessor's feature groups, so both
    # are dropped before the classifier sees the row.
    'Start_Time': ['11/30/2016 16:40:31'], 'End_Time': ['11/30/2016 17:10:19'],
    # location
    'Start_Lat': [25.78601], 'Start_Lng': [-80.25809],
    'End_Lat': [25.78336], 'End_Lng': [-80.26911],
    'Distance(mi)': [0.71],
    'Description': ['At SR-953/42nd Ave/Le Jeune Rd - Accident.'],
    'Street': ['Dolphin Expy W'],
    # NOTE(review): Zipcode is given as a string — confirm the training column
    # has the same dtype; an unmatched category is encoded as all zeros.
    'Zipcode': ['33126'],
    # weather
    'Temperature(F)': [78.1], 'Wind_Chill(F)': [78.486101], 'Humidity(%)': [76],
    'Pressure(in)': [29.96], 'Visibility(mi)': [10], 'Wind_Speed(mph)': [11.5],
    'Precipitation(in)': [0.009293],
    'Weather_Condition': ['Mostly Cloudy'],
    # road-feature flags
    'Bump': [False], 'Crossing': [False], 'Give_Way': [False], 'Junction': [False],
    'No_Exit': [False], 'Railway': [False], 'Roundabout': [False], 'Station': [False],
    'Stop': [False], 'Traffic_Calming': [False], 'Traffic_Signal': [False],
    'Turning_Loop': [False],
    'Sunrise_Sunset': ['Day'],
}

# Run the single-row frame through the fitted pipeline and report the class.
new_data_df = pd.DataFrame(data=new_data)
predicted_severity = pipeline.predict(new_data_df)
print('The predicted severity is:', predicted_severity[0])


The predicted severity is: 2
