In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

crashes_cleaned = pd.read_csv('../data/crashes_cleaned.csv')
people_cleaned = pd.read_csv('../data/people_cleaned.csv')

In [6]:
crashes_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442382 entries, 0 to 442381
Columns: 128 entries, Unnamed: 0 to MOST_SEVERE_INJURY_REPORTED, NOT EVIDENT
dtypes: float64(2), int64(124), object(2)
memory usage: 432.0+ MB


In [9]:
col_names = ['CRASH_RECORD_ID', 'AGE', 'SEX_M']

people_cleaned = people_cleaned[col_names]

people_cleaned['CRASH_RECORD_ID'].value_counts()

CRASH_RECORD_ID
31ecf6862c691ff12d3856213b902c146b07337b42a5692e3a176a66d684d221028bb5118ef6d67a313bcaed9e97bee1855cb1f5e8650f49e8dc17663475a1ee    61
13026c7fb51566d9ca487a093e38c6f5621c2ec25be48c306b6574983b61daeee589524b96bb2bfe66ddd0f695c8d2bf3ab0297558528e9c7a70363c763d6bd1    50
3eda323ea45cd6e2b459bf5ba570dcf74e71f3fe1aa449231a47fd1dd20ce71de888840d420dc54b61ca643159b46494979dad05e407d8138438a675c615575d    48
1829f52c1281a0396ef94692331b3dc530bc4be5a54cd55e94c24a5e5e49b800fbcf9f24dabe4c8277c8964ad05aadc89e90fd94021959d6dff5fad55480d595    46
c727dc759107cf17b2e8141149347128bb4bc26b026c7805562206c7c5761c543dd7cc0e47fc11379455a2ecbb2847c3d1744d6feb78f276d9a457e9beeb6121    45
                                                                                                                                    ..
1fc027744abf5678572904c8fb22aed533ad8876b6e8e12f7e81fb766d5fa6435b06a9b5deec2621b711ceb338aae7bf1fbfa3529da6bb64167e67dfab8c34b9     1
5bd03fd838daf3447fa553dd6c093a97d18141d

In [8]:
combined = crashes_cleaned.merge(people_cleaned, on='CRASH_RECORD_ID', how='inner')

combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055832 entries, 0 to 1055831
Columns: 130 entries, Unnamed: 0 to SEX_M
dtypes: float64(4), int64(124), object(2)
memory usage: 1.0+ GB


In [33]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.11.0 imblearn-0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [34]:

from imblearn.over_sampling import SMOTE

le = LabelEncoder()

crashes_cleaned['PRIM_CONTRIBUTORY_CAUSE'] = le.fit_transform(crashes_cleaned['PRIM_CONTRIBUTORY_CAUSE'])

X = crashes_cleaned.drop(columns = ['CRASH_RECORD_ID', 'Unnamed: 0', 'PRIM_CONTRIBUTORY_CAUSE', 'LONGITUDE', 'LATITUDE', 
                                    'WEATHER_CONDITION_CLEAR', 'ALIGNMENT_STRAIGHT AND LEVEL', 'ROADWAY_SURFACE_COND_SNOW OR SLUSH', 
                                    'ROADWAY_SURFACE_COND_WET', 'ROAD_DEFECT_UNKNOWN', 'MOST_SEVERE_INJURY_NO INDICATION OF INJURY'])
y = crashes_cleaned['PRIM_CONTRIBUTORY_CAUSE']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
ss = StandardScaler()

ss.fit(X_train_resampled)

X_train_scaled = ss.transform(X_train_resampled)
X_test_scaled = ss.transform(X_test)

In [36]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter = 1000)

model.fit(X_train_scaled, y_train_resampled)

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, log_loss, f1_score, ConfusionMatrixDisplay, roc_auc_score

y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
#precision = precision_score(y_test, y_pred)
#recall = recall_score(y_test, y_pred)
#f1 = f1_score(y_test, y_pred)
#auc_score = roc_auc_score(y_test, y_pred)

print('Accuracy: ', accuracy)
#print('Precision: ' + precision)
#print('Recall: ' + recall)
#print('F1 score: ' + f1)
#print('ROC-AUC: ' + auc_score)





Accuracy:  0.8407718181489385


In [None]:
from sklearn.model_selection import GridSearchCV
le = LabelEncoder()

crashes_cleaned['PRIM_CONTRIBUTORY_CAUSE'] = le.fit_transform(crashes_cleaned['PRIM_CONTRIBUTORY_CAUSE'])

X2 = crashes_cleaned.drop(columns = ['CRASH_RECORD_ID', 'Unnamed: 0', 'PRIM_CONTRIBUTORY_CAUSE', 'LONGITUDE', 'LATITUDE', 
                                    'WEATHER_CONDITION_CLEAR', 'ALIGNMENT_STRAIGHT AND LEVEL', 'ROADWAY_SURFACE_COND_SNOW OR SLUSH', 
                                    'ROADWAY_SURFACE_COND_WET', 'ROAD_DEFECT_UNKNOWN', 'MOST_SEVERE_INJURY_NO INDICATION OF INJURY'])

y = crashes_cleaned['PRIM_CONTRIBUTORY_CAUSE']

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, random_state=0)

ss = StandardScaler()

ss.fit(X_train)

X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

# Define the hyperparameter grid for tuning
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs', 'sag', 'saga'],
    'class_weight': [None, 'balanced'],
    'multi_class': ['ovr', 'multinomial'],
    'max_iter': [100, 500, 1000],
    'tol': [1e-4, 1e-3, 1e-2],
}

# Create a grid search object
grid_search = GridSearchCV(model, param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the final model with the best hyperparameters
final_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', **best_params)
final_model.fit(X_train_scaled, y_train)

# Evaluate the final model on the test set
test_accuracy = final_model.score(X_test, y_test)


