# Setup

In [25]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline


# import data
data = pd.read_csv("cleaned_data.csv")
data.head()

X = data.loc[:,data.columns!="Accident Type"]
Y = data.loc[:,"Accident Type"]

# creating testing and training data
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=.25)

print("X-train shape: ", x_train.shape)
print("Y-train shape: ", y_train.shape)
print("X-test shape: ", x_test.shape)
print("Y-test shape: ", y_test.shape)

# turn conf matrix in df for visualization
def conf_matrix_to_df(conf_matrix, target_names):
    return pd.DataFrame(conf_matrix, columns=target_names, index=target_names)

X-train shape:  (162075, 137)
Y-train shape:  (162075,)
X-test shape:  (54025, 137)
Y-test shape:  (54025,)
['Derailment' 'Side collision' 'Hwy-rail crossing' 'Other impacts'
 'Other (describe in narrative)' 'Rear end collision'
 'Fire/violent rupture' 'Obstruction' 'Raking collision'
 'Head on collision' 'Broken train collision' 'RR grade crossing'
 'Explosion-detonation' nan]


# Modeling

## Random Forest

### Simple Random Forest Classifier

Using a random forest to classify 'Accident Type'. 
<br>Fitting the intial model with no paramter adjustements besides n_estimator (# of trees in the forest) and calculating the accuracy with the testing data.

#### Test

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

cols = ['Time', 'Accident Type', 'State Abbreviation', 'Track Type', 'Track Class', 'Train Speed', 'Accident Cause', 'Reporting Railroad Name', 'Other Railroad Name', 'Maintenance Railroad Name', 'Weather Condition']
X = pd.DataFrame(data.loc[:,cols])
Y = pd.DataFrame(data.loc[:,'Accident Type'])

for column in X.columns:
    X[column] = X[column].fillna('NA')
for column in Y.columns:
    Y[column] = Y[column].fillna('NA')

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=.25)

print("X-train shape: ", x_train.shape)
print("Y-train shape: ", y_train.shape)
print("X-test shape: ", x_test.shape)
print("Y-test shape: ", y_test.shape)

x_train.dtypes

#encoding catgorical data
ohe = OneHotEncoder()
oe = OrdinalEncoder()
ohe_cols = ['State Abbreviation', 'Track Type', 'Accident Cause', 'Reporting Railroad Name', 'Other Railroad Name', 'Maintenance Railroad Name', 'Weather Condition']
oe_cols = ['Time', 'Track Class']
column_transform = make_column_transformer(
    (ohe, ohe_cols),
    (oe, oe_cols)
)

# rf pipeline
rf_test = RandomForestClassifier()
rf_pipeline = make_pipeline(column_transform, rf_test)

rf_pipeline.fit(x_train, y_train)
pred = rf_pipeline.predict(x_test)
conf_mat = confusion_matrix(y_test, pred)
conf_mat_df = conf_matrix_to_df(conf_mat,data['Accident Type'].unique())
conf_mat_df

X-train shape:  (162075, 11)
Y-train shape:  (162075, 1)
X-test shape:  (54025, 11)
Y-test shape:  (54025, 1)


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']

In [3]:
train, test = train_test_split(data, test_size=.25)

x_train = train.loc[:,train.columns!="Accident Type"]
y_train = train.loc[:,"Accident Type"]
#print(x_train.head())
#print(y_train.head())

x_test = test.loc[:,train.columns!="Accident Type"]
y_test = test.loc[:,"Accident Type"]
#print(x_test.head())
#print(y_test.head())

n_est = 50

# creating and fitting model
simple_rf = RandomForestClassifier(n_estimators=n_est)
#simple_rf.fit(x_train,y_train)

# throwing error due to strings in data, going to need to use One_Hot or Label Encoding in sklearn
# to make strings numerical if we want to use RF approach

#simple_rf.score(x_test,y_test)

ValueError: could not convert string to float: 'Seaboard Coast Line Railroad'

Getting confusion matrix and classification report (precision, recall, accuracy, etc.)

In [None]:
'''simple_rf_pred = simple_rf.predict(x_test)

conf_mat = confusion_matrix(y_test,simple_rf_pred)
conf_mat

classification_report(y_test, simple_rf_pred)'''