# Cleaning

## Setup

In [13]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# preprocessing/pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.compose import make_column_selector as selector


# import data
full_data = pd.read_csv("Rail_Equipment_Accident_Incident_Data.csv")
print(full_data.shape)
full_data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Reporting Railroad Code,Reporting Railroad Name,Report Year,Accident Number,PDF Link,Accident Year,Accident Month,Other Railroad Code,Other Railroad Name,Other Accident Number,...,Other Railroad SMT Grouping,Other Parent Railroad Company Code,Other Parent Railroad Company Name,Other Railroad Holding Company,Maintenance Railroad Company Grouping,Maintenance Railroad Class,Maintenance Railroad SMT Grouping,Maintenance Parent Railroad Company Code,Maintenance Parent Railroad Company Name,Maintenance Railroad Holding Company
0,NS,Norfolk Southern Railway Company,2016.0,120068,https://safetydata.fra.dot.gov/Officeofsafety/...,16.0,4.0,,,,...,,,,,,Class 1,SMT-3 - Norfolk Southern,NS,Norfolk Southern Railway Company,Norfolk Southern Railway Company
1,NS,Norfolk Southern Railway Company,2016.0,120068,https://safetydata.fra.dot.gov/Officeofsafety/...,16.0,4.0,,,,...,,,,,,Class 1,SMT-3 - Norfolk Southern,NS,Norfolk Southern Railway Company,Norfolk Southern Railway Company
2,CR,Conrail,1981.0,420001,https://safetydata.fra.dot.gov/Officeofsafety/...,81.0,4.0,,,,...,,,,,,Class 3,,CRSH,Consolidated Rail Corporation,Not Assigned
3,NS,Norfolk Southern Railway Company,2016.0,120161,https://safetydata.fra.dot.gov/Officeofsafety/...,16.0,4.0,,,,...,,,,,,Class 1,SMT-3 - Norfolk Southern,NS,Norfolk Southern Railway Company,Norfolk Southern Railway Company
4,NS,Norfolk Southern Railway Company,2016.0,120161,https://safetydata.fra.dot.gov/Officeofsafety/...,16.0,4.0,,,,...,,,,,,Class 1,SMT-3 - Norfolk Southern,NS,Norfolk Southern Railway Company,Norfolk Southern Railway Company


## Dropping Features

In [16]:
features_to_drop = ["Reporting Railroad Code", #code variables/features
                 "PDF Link",
                 "Other Railroad Code",
                 "Maintenance Railroad Code",
                 "Accident Type Code",
                 "State Code",
                 "County Code",
                 "Visibility Code",
                 "Weather Condition Code",
                 "Track Type Code",
                 "Train Direction Code",
                 "Equipment Type Code",
                 "Signalization Code",
                 "Method of Operation Code",
                 "Remote Control Locomotive Code",
                 "Primary Accident Cause Code",
                 "Contributing Accident Cause Code",
                 "Accident Cause Code",
                 "Class Code",
                 "Reporting Parent Railroad Company Code",
                 "Other Parent Railroad Company Code",
                 "Division Code",
                 "Maintenance Parent Railroad Company Code", #identification variables/features
                 "Accident Number",
                 "First Car Initials",
                 "Report Key",
                 "Incident Key",
                 "Train Number",
                 "Maintenance Accident Number",
                 "State Name" #have state abbrev
                 
]

data = full_data.drop(features_to_drop,axis=1)
data.shape

(216100, 130)

In [17]:
# data["Accident Cause"].head(5)
s = "JOB SHOVING 8 CARS INTO 121 TRACK DERAILED TWO CARS AND TWO ENGINES JUST EAST OF 121 SWITCH"
# find the column that contains s
data.columns[data.isin([s]).any()]

Index([], dtype='object')

In [18]:
# Dropping additional columns here that we won't initially need
data = data.drop(["Accident Cause"],axis=1)

## Feature Engineering/Cleaning

In [6]:
# supposedly these are columns with multiple data types, need further examination
df_test = data.iloc[:,[22,36,37,46,47,48,49,50,51,52,53,54,56,112,113,118]]
df_test.head()

Unnamed: 0,Subdivision,Track Class,Track Density,Signalization,Method of Operation,Adjunct Code 1,Adjunct Code Name 1,Adjunct Code 2,Adjunct Code Name 2,Adjunct Code 3,Adjunct Code Name 3,Remote Control Locomotive,First Car Number,Special Study 2,Latitude,Class
0,SYSTEM,1,,Not Signaled,Other Than Main Track,,,,,,,Not a remotely controlled operation,203.0,000-000-000,41.884035,No
1,SYSTEM,1,,Not Signaled,Other Than Main Track,,,,,,,,,000-000-000,41.884035,No
2,,1,,,,,,,,,,,900064.0,,,1L
3,,1,,,,,,,,,,Not a remotely controlled operation,2000.0,,0.0,1L
4,CHICAGO,1,,Not Signaled,Other Than Main Track,,,,,,,Not a remotely controlled operation,668680.0,000-000-000,41.054254,Cl


### Handling NAs

In [40]:
# Handling NaNs first 
# remove columns with > 50,000 NaNs
# set remaining NaNs to mean for numerical data, encode "NA" for categorical
na_counts = data.isna().sum()
cols_to_drop = na_counts[na_counts > 50000].index.tolist()
data.drop(cols_to_drop, axis=1, inplace=True)

#split data into numerical and categorical data to process

# quanititative features
num_data = data.select_dtypes(include=['float64', 'int64'])
#num_data.fillna(num_data.mean(), inplace=True)
print(num_data.isna().sum().sort_values(ascending=False))

# qualititative features
cat_data = data.select_dtypes(include=['object'])
cat_data.fillna('NA', inplace=True)
cat_data = cat_data.astype(str)
#print(cat_data.isna().sum().sort_values(ascending=False))
#print(num_data.dtypes)
new_dat = pd.concat([num_data,cat_data], axis=1)
new_dat.head()

Hours Engineers On Duty                   45952
Firemen On Duty                           40014
Brakemen On Duty                          33774
Conductors On Duty                        27498
Engineers On Duty                         25561
First Car Position                        13357
Train Speed                                   3
Report Year                                   1
Derailed Loaded Passenger Cars                1
Derailed Empty Freight Cars                   1
Derailed Empty Passenger Cars                 1
Derailed Cabooses                             1
Equipment Damage Cost                         1
Track Damage Cost                             1
Total Damage Cost                             1
Railroad Employees Killed                     1
Cabooses                                      1
Railroad Employees Injured                    1
Passengers Killed                             1
Passengers Injured                            1
Others Killed                           

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Unnamed: 0,Report Year,Accident Year,Accident Month,Maintenance Accident Year,Maintenance Accident Month,Day,Hazmat Cars,Hazmat Cars Damaged,Hazmat Released Cars,Persons Evacuated,...,Joint Track Class,Class,Reporting Railroad Class,Reporting Railroad SMT Grouping,Reporting Parent Railroad Company Name,Reporting Railroad Holding Company,Maintenance Railroad Class,Maintenance Railroad SMT Grouping,Maintenance Parent Railroad Company Name,Maintenance Railroad Holding Company
0,2016.0,16.0,4.0,16.0,4.0,9.0,0.0,0.0,0.0,0.0,...,1,CL,Class 1,SMT-3 - Norfolk Southern,Norfolk Southern Railway Company,Norfolk Southern Railway Company,Class 1,SMT-3 - Norfolk Southern,Norfolk Southern Railway Company,Norfolk Southern Railway Company
1,2016.0,16.0,4.0,16.0,4.0,9.0,4.0,0.0,0.0,0.0,...,1,CL,Class 1,SMT-3 - Norfolk Southern,Norfolk Southern Railway Company,Norfolk Southern Railway Company,Class 1,SMT-3 - Norfolk Southern,Norfolk Southern Railway Company,Norfolk Southern Railway Company
2,1981.0,81.0,4.0,81.0,4.0,6.0,0.0,0.0,0.0,0.0,...,1,1L,Class 3,,Consolidated Rail Corporation,Not Assigned,Class 3,,Consolidated Rail Corporation,Not Assigned
3,2016.0,16.0,4.0,16.0,4.0,16.0,0.0,0.0,0.0,0.0,...,1,CL,Class 1,SMT-3 - Norfolk Southern,Norfolk Southern Railway Company,Norfolk Southern Railway Company,Class 1,SMT-3 - Norfolk Southern,Norfolk Southern Railway Company,Norfolk Southern Railway Company
4,2016.0,16.0,4.0,16.0,4.0,16.0,0.0,0.0,0.0,0.0,...,1,CL,Class 1,SMT-3 - Norfolk Southern,Norfolk Southern Railway Company,Norfolk Southern Railway Company,Class 1,SMT-3 - Norfolk Southern,Norfolk Southern Railway Company,Norfolk Southern Railway Company


### Preprocessing Pipeline 

In [22]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude=object)),
        ("cat", categorical_transformer, selector(dtype_include=object)),
    ]
)



In [31]:
ohe = OneHotEncoder()
ohe.fit_transform(cat_data)

<216100x401079 sparse matrix of type '<class 'numpy.float64'>'
	with 7563500 stored elements in Compressed Sparse Row format>

In [41]:
# Testing Preprocessing Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

def conf_matrix_to_df(conf_matrix, target_names):
    return pd.DataFrame(conf_matrix, columns=target_names, index=target_names)

rf_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)

# data for test
X = pd.DataFrame(new_dat.drop('Accident Type',axis=1))
Y = pd.DataFrame(new_dat.loc[:,'Accident Type'])

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=.25)

# testing pipeline with RF
rf_pipe.fit(x_train,y_train)
pred = rf_pipe.predict(x_test)
conf_mat = confusion_matrix(y_test, pred)
conf_mat_df = conf_matrix_to_df(conf_mat,data['Accident Type'].unique())
conf_mat_df

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Unnamed: 0,Derailment,Side collision,Hwy-rail crossing,Other impacts,Other (describe in narrative),Rear end collision,Fire/violent rupture,Obstruction,Raking collision,Head on collision,Broken train collision,RR grade crossing,Explosion-detonation,NaN
Derailment,10,157,0,0,0,3,0,0,0,5,0,0,0,13
Side collision,0,34416,0,2,1,42,0,8,14,227,0,2,7,186
Hwy-rail crossing,0,17,0,1,0,1,0,0,0,0,0,0,0,0
Other impacts,0,419,0,238,0,23,0,3,39,25,0,0,0,30
Other (describe in narrative),0,370,0,1,79,16,0,3,0,10,0,0,17,81
Rear end collision,0,420,0,0,1,2310,0,5,6,5,0,0,0,2
Fire/violent rupture,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Obstruction,0,431,0,6,0,167,0,226,61,29,0,3,3,5
Raking collision,0,603,0,2,0,14,0,7,416,69,0,0,1,4
Head on collision,0,2964,0,2,2,23,0,7,22,1909,0,1,8,180


# Cleaning for Neural Network

In [7]:
data_nn = data

# Accident Year, Accident Month, Maintenence Railroad name, State Name, Visibility, Train Speed, Total Damage Cost
data_nn = data_nn[["Accident Year", "Accident Month", "Maintenance Railroad Name", "State Name", "Visibility", "Train Speed", "Total Damage Cost"]]


# Save the data to a csv file
data_nn.to_csv("data_nn.csv", index=False)


## Regularization

## Saving

In [7]:
# save data to file
data.to_csv("cleaned_data.csv",index=False) # index=False to not save the index column

In [8]:
# saves all column names to separate file in order to find different variables
with open("column_names.txt", "w") as f:
    for s in data.columns:
        f.write(s + "\n")