In [None]:
!python -m pip install pandas
!python -m pip install scikit-learn
!python -m pip install imblearn
!python -m pip install xgboost

In [4]:
import pandas as pd
df = pd.read_csv(r"C:\Users\Juliana\OneDrive\Documents\GitHub\Group3_AI4ALL-Project\Data\Miami_Accidents(Clean).csv")
df = df.drop('ID', axis=1)

Model

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming 'df' is your DataFrame which already contains your data.

# Define numerical, categorical, and text columns for preprocessing
numerical_cols = ['Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 
                  'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 
                  'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']

categorical_cols = ['Street', 'Zipcode', 'Weather_Condition', 'Sunrise_Sunset']

text_col = 'Description'  # Text column for feature extraction

# Set up the ColumnTransformer for preprocessing
# This will apply different transformations to different types of data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Normalize numerical data
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),  # Convert categorical data to one-hot encoded vectors
        ('txt', CountVectorizer(), text_col)  # Convert text data into a matrix of token counts
    ],
    remainder='drop'  # Ignore columns that are not specified
)

# Create the pipeline with preprocessing, SMOTE, and XGBClassifier
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),  # First apply the preprocessor defined above
    ('smote', SMOTE(random_state=42)),  # Then apply SMOTE to handle class imbalance
    ('classifier', XGBClassifier(  # Finally, use XGBClassifier to train the model
        n_estimators=100,
        scale_pos_weight=3,  # Adjust this based on the ratio of majority to minority class instances
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    ))
])

# Prepare features and target variable
X = df.drop('Severity', axis=1)  # Features
y = df['Severity'] - 1  # Target variable, adjust classes to start from 0

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using the pipeline
model_pipeline.fit(X_train, y_train)

# Make predictions (remember to adjust back the predictions to original class labels)
predictions_adjusted = model_pipeline.predict(X_test)
predictions = predictions_adjusted + 1  # Adjust predictions to match original class labels

# Evaluate the model using the original class labels
print("Accuracy:", accuracy_score(y_test + 1, predictions))  # Adjust y_test to match original labels for comparison
print("Classification Report:\n", classification_report(y_test + 1, predictions))


Parameters: { "scale_pos_weight" } are not used.



Accuracy: 0.977088948787062
Classification Report:
               precision    recall  f1-score   support

           1       0.45      0.71      0.56         7
           2       0.99      0.99      0.99     23641
           3       0.50      0.61      0.55       512
           4       0.94      0.90      0.92       326

    accuracy                           0.98     24486
   macro avg       0.72      0.80      0.75     24486
weighted avg       0.98      0.98      0.98     24486



Confusion matrix

In [8]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred =model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.71      0.56         7
           1       0.99      0.99      0.99     23641
           2       0.50      0.61      0.55       512
           3       0.94      0.90      0.92       326

    accuracy                           0.98     24486
   macro avg       0.72      0.80      0.75     24486
weighted avg       0.98      0.98      0.98     24486

Matriz de Confusión:
 [[    5     1     1     0]
 [    6 23314   303    18]
 [    0   198   313     1]
 [    0    30     3   293]]


Predictions 

In [None]:
1.

In [None]:
print(df.iloc[1225])

should be 4

In [10]:
import pandas as pd

# Datos para predecir
new_data_to_predict = {
    'Start_Time': ['2016-10-17 23:23:59'],
    'End_Time': ['2016-10-17 23:23:59'],
    'Start_Lat': [25.785738],
    'Start_Lng': [-80.175632],
    'End_Lat': [25.78575],
    'End_Lng': [-80.17564],
    'Distance(mi)': [0.001],
    'Description': ['Closed at Trooper Robert G Smith Brg - Road closed due to accident.'],
    'Street': ['MacArthur Cswy W'],
    'Zipcode': ['33132'],
    'Temperature(F)': [80.1],
    'Wind_Chill(F)': [78.486101],
    'Humidity(%)': [64.0],
    'Pressure(in)': [29.93],
    'Visibility(mi)': [10.0],
    'Wind_Speed(mph)': [6.9],
    'Precipitation(in)': [0.009293],
    'Weather_Condition': ['Partly Cloudy'],
    'Bump': [False],
    'Crossing': [True],
    'Give_Way': [False],
    'Junction': [False],
    'No_Exit': [True],
    'Railway': [False],
    'Roundabout': [False],
    'Station': [False],
    'Stop': [False],
    'Traffic_Calming': [False],
    'Traffic_Signal': [False],
    'Turning_Loop': [False],
    'Sunrise_Sunset': ['Night']
}


new_data_df = pd.DataFrame(new_data_to_predict)

predicted_severity = model_pipeline.predict(new_data_df)


predicted_severity = predicted_severity + 1

# 
print('The predicted severity is:', predicted_severity[0])


The predicted severity is: 4


2.

print(df.iloc[0])

should be 2

In [11]:
import pandas as pd

new_data_to_predict = {
    'Start_Time': ['11/30/2016 16:40:31'],
    'End_Time': ['11/30/2016 17:10:19'],
    'Start_Lat': [25.78601],
    'Start_Lng': [-80.25809],
    'End_Lat': [25.78336],
    'End_Lng': [-80.26911],
    'Distance(mi)': [0.71],
    'Description': ['At SR-953/42nd Ave/Le Jeune Rd - Accident.'],
    'Street': ['Dolphin Expy W'],
    'Zipcode': ['33126'],
    'Temperature(F)': [78.1],
    'Wind_Chill(F)': [78.486101],
    'Humidity(%)': [76],
    'Pressure(in)': [29.96],
    'Visibility(mi)': [10],
    'Wind_Speed(mph)': [11.5],
    'Precipitation(in)': [0.009293],
    'Weather_Condition': ['Mostly Cloudy'],
    'Bump': [False],
    'Crossing': [False],
    'Give_Way': [False],
    'Junction': [False],
    'No_Exit': [False],
    'Railway': [False],
    'Roundabout': [False],
    'Station': [False],
    'Stop': [False],
    'Traffic_Calming': [False],
    'Traffic_Signal': [False],
    'Turning_Loop': [False],
    'Sunrise_Sunset': ['Day']
}

# Convert the dictionary into a DataFrame
new_data_df = pd.DataFrame(new_data_to_predict)

# Using the pipeline to make the forecast
predicted_severity = model_pipeline.predict(new_data_df)

# Adjust the prediction to the original class scale, if needed
predicted_severity = predicted_severity + 1  

print('The predicted severity is:', predicted_severity[0])


The predicted severity is: 3
