In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [12]:
# import data
data = pd.read_csv("data_nn.csv")

# Convert Total Damage Cost from numbers with commas to numbers
data['Total Damage Cost'] = data['Total Damage Cost'].str.replace(',', '')
data['Total Damage Cost'] = data['Total Damage Cost'].astype(float)
data.head()


Unnamed: 0,Accident Year,Accident Month,Maintenance Railroad Name,State Name,Visibility,Train Speed,Total Damage Cost
0,17.0,6.0,NORTHEAST ILLINOIS REGIONAL COMMUTER RAIL CORP...,ILLINOIS,Day,10.0,132013.0
1,17.0,6.0,NORTHEAST ILLINOIS REGIONAL COMMUTER RAIL CORP...,ILLINOIS,Day,0.0,132013.0
2,81.0,4.0,Conrail,NEW YORK,Day,0.0,4235.0
3,7.0,1.0,Cargill Elevator,LOUISIANA,Day,4.0,9986.0
4,17.0,10.0,Illinois Central Railroad Company,ILLINOIS,Dark,0.0,13231.0


In [13]:
# Since Maintenance Railroad Name, State Name, and Visibilty are categorical variables, we need to convert them to numerical values. We will use one-hot encoding to do this. We will also drop the ID column since it is not needed for the model.
# We save the column values for later when we graph the results.
data = pd.get_dummies(data, columns=['Maintenance Railroad Name', 'State Name', 'Visibility'])
data.head()

Unnamed: 0,Accident Year,Accident Month,Train Speed,Total Damage Cost,Maintenance Railroad Name_3M,Maintenance Railroad Name_4 BROTHERS RAIL MAINTENANCE INC.,Maintenance Railroad Name_84 Lumber,Maintenance Railroad Name_A & R BULK,"Maintenance Railroad Name_A & R TRANSPORT, INC.",Maintenance Railroad Name_A E STALEY,...,State Name_VERMONT,State Name_VIRGINIA,State Name_WASHINGTON,State Name_WEST VIRGINIA,State Name_WISCONSIN,State Name_WYOMING,Visibility_Dark,Visibility_Dawn,Visibility_Day,Visibility_Dusk
0,17.0,6.0,10.0,132013.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,17.0,6.0,0.0,132013.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,81.0,4.0,0.0,4235.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,7.0,1.0,4.0,9986.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,17.0,10.0,0.0,13231.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [9]:
# split data into training and testing
# We are going to try and predict Total Damage Cost
# We will use all other columns as features
X = data.drop("Total Damage Cost", axis=1)
y = data["Total Damage Cost"]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# create model
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=1000)

# train model
mlp.fit(x_train, y_train)

# predict
predictions = mlp.predict(x_test)

# accuracy
accuracy_score(y_test, predictions)


ValueError: Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Finding features that were most important
importances = mlp.coefs_[0]
importances

# Plotting the features
plt.figure(figsize=(16,8))
plt.bar(x_train.columns, importances)
plt.xticks(rotation=90)
plt.show()