In [927]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [980]:
# Load the labelled training data; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")

In [981]:
# Parse the crash timestamp once and derive every calendar feature from that
# single parsed Series (the string column was previously parsed three times).
crash_timestamp = pd.to_datetime(df['Crash Date/Time'])
df['Dates'] = crash_timestamp.dt.date   # calendar date (datetime.date objects)
df['Time'] = crash_timestamp.dt.time    # time of day (datetime.time objects)
df['Day'] = crash_timestamp.dt.day      # day of the month

In [982]:
# Build one DatetimeIndex from the 'Dates' column and read both the year and
# the month off it.
crash_dates = pd.DatetimeIndex(df['Dates'])
df['Year'] = crash_dates.year
df['Month'] = crash_dates.month

In [983]:
# List the column labels, including the derived Dates/Time/Day/Year/Month columns.
df.keys()

Index(['x', 'Local Case Number', 'Agency Name', 'ACRS Report Type',
       'Crash Date/Time', 'Route Type', 'Road Name', 'Cross-Street Type',
       'Cross-Street Name', 'Off-Road Description', 'Municipality',
       'Related Non-Motorist', 'Collision Type', 'Weather',
       'Surface Condition', 'Light', 'Traffic Control',
       'Driver Substance Abuse', 'Non-Motorist Substance Abuse', 'Person ID',
       'Injury Severity', 'Circumstance', 'Drivers License State',
       'Vehicle ID', 'Vehicle Damage Extent', 'Vehicle First Impact Location',
       'Vehicle Second Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Vehicle Continuing Dir', 'Vehicle Going Dir',
       'Speed Limit', 'Driverless Vehicle', 'Parked Vehicle', 'Vehicle Year',
       'Vehicle Make', 'Vehicle Model', 'Equipment Problems', 'Latitude',
       'Longitude', 'Location', 'Fault', 'Dates', 'Time', 'Day', 'Year',
       'Month'],
      dtype='object')

In [984]:
# Show per-column non-null counts and dtypes to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51490 entries, 0 to 51489
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   x                               51490 non-null  object 
 1   Local Case Number               45037 non-null  object 
 2   Agency Name                     51490 non-null  object 
 3   ACRS Report Type                51490 non-null  object 
 4   Crash Date/Time                 51490 non-null  object 
 5   Route Type                      46522 non-null  object 
 6   Road Name                       46988 non-null  object 
 7   Cross-Street Type               46471 non-null  object 
 8   Cross-Street Name               46968 non-null  object 
 9   Off-Road Description            6950 non-null   object 
 10  Municipality                    7567 non-null   object 
 11  Related Non-Motorist            3648 non-null   object 
 12  Collision Type                  

In [985]:
# Columns excluded from the training features. They are listed by name (these
# are the labels at positions 0, 1, 4, 6, 8, 9, 10, 11, 18, 19, 21, 23, 27, 34,
# 36, 40, 42 and 43 of the frame) so the selection stays correct if the column
# order changes, and a missing column raises a KeyError instead of silently
# dropping a different one.
train_drop_columns = [
    'x', 'Local Case Number', 'Crash Date/Time', 'Road Name',
    'Cross-Street Name', 'Off-Road Description', 'Municipality',
    'Related Non-Motorist', 'Non-Motorist Substance Abuse', 'Person ID',
    'Circumstance', 'Vehicle ID', 'Vehicle Body Type', 'Vehicle Year',
    'Vehicle Model', 'Location', 'Dates', 'Time',
]
df = df.drop(columns=train_drop_columns)

In [986]:
# Keep only rows with no missing value in any remaining column
# (row-wise, "any" are the dropna defaults).
df = df.dropna()

In [987]:
# Row/column count after dropping columns and incomplete rows.
df.shape

(23918, 29)

In [988]:
# Column labels that remain after the drop; these feed the encoders below.
df.keys()

Index(['Agency Name', 'ACRS Report Type', 'Route Type', 'Cross-Street Type',
       'Collision Type', 'Weather', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse', 'Injury Severity',
       'Drivers License State', 'Vehicle Damage Extent',
       'Vehicle First Impact Location', 'Vehicle Second Impact Location',
       'Vehicle Movement', 'Vehicle Continuing Dir', 'Vehicle Going Dir',
       'Speed Limit', 'Driverless Vehicle', 'Parked Vehicle', 'Vehicle Make',
       'Equipment Problems', 'Latitude', 'Longitude', 'Fault', 'Day', 'Year',
       'Month'],
      dtype='object')

In [989]:
# Disabled debugging aid: prints the number of distinct values in each column of
# the train and test frames. NOTE(review): df_test is only loaded further down,
# so the second loop cannot be enabled at this position in the notebook.
# for col in df:
#     print(col, " ",len(df[col].unique()))

# for col in df_test:
#     print(col, " ",len(df_test[col].unique()))
    

In [990]:
def one_hot_encode(df):
    """Return a copy of ``df`` with the nominal crash columns one-hot encoded.

    Each listed column is replaced by one indicator column per distinct value
    present in ``df`` (named ``<column>_<value>``); all other columns are kept
    unchanged. The input frame is not modified.

    Because the indicator columns depend on the values present in ``df``, two
    frames encoded separately can end up with different column sets.
    """
    nominal_columns = [
        'Agency Name', 'ACRS Report Type', 'Speed Limit', 'Driverless Vehicle',
        'Parked Vehicle', 'Route Type', 'Cross-Street Type', 'Collision Type',
        'Weather', 'Surface Condition', 'Traffic Control',
        'Driver Substance Abuse', 'Light', 'Vehicle Continuing Dir',
        'Vehicle Going Dir', 'Vehicle First Impact Location',
        'Vehicle Second Impact Location', 'Vehicle Movement',
        'Vehicle Damage Extent',
    ]
    return pd.get_dummies(df, columns=nominal_columns)

def ranking_encode(df, encoders=None):
    """Integer-encode the high-cardinality categorical columns.

    The columns 'Drivers License State', 'Vehicle Make', 'Equipment Problems'
    and 'Injury Severity' are replaced by integer codes. Codes are the position
    of each value in the sorted list of distinct non-missing values, i.e. the
    order is alphabetical, not a domain ranking.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns above. It is not modified.
    encoders : dict, optional
        Mapping ``column name -> list of categories``. For a column that is
        absent from the dict, the categories are learned from ``df`` and stored
        in the dict; for a column that is present, the stored categories are
        reused. Passing the same dict when encoding the training frame and then
        the test frame gives both frames identical codes. When omitted, every
        call learns its own categories from ``df`` alone.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the four columns holding int64 codes. Missing
        values, and values not in the category list, are encoded as -1.
    """
    label_columns = ['Drivers License State', 'Vehicle Make',
                     'Equipment Problems', 'Injury Severity']
    # Work on a copy so the caller's frame keeps its original string values.
    encoded = df.copy()
    for column in label_columns:
        if encoders is not None and column in encoders:
            categories = encoders[column]
        else:
            categories = sorted(encoded[column].dropna().unique())
            if encoders is not None:
                encoders[column] = categories
        # Categorical codes follow the order of `categories`; anything outside
        # that list (including NaN) gets -1 instead of raising.
        codes = pd.Categorical(encoded[column], categories=categories).codes
        encoded[column] = codes.astype('int64')
    return encoded

In [991]:
# Replace the nominal columns of the training frame with indicator columns,
# then print the (truncated) result for a visual check.
df = one_hot_encode(df)
print(df)

              Injury Severity Drivers License State Vehicle Make  \
0      SUSPECTED MINOR INJURY                    MD          GMC   
1          NO APPARENT INJURY                    MD         FORD   
3             POSSIBLE INJURY                    MD         CHEV   
4      SUSPECTED MINOR INJURY                    MD          KIA   
5          NO APPARENT INJURY                    MD       TOYOTA   
...                       ...                   ...          ...   
51480         POSSIBLE INJURY                    WA        HONDA   
51484      NO APPARENT INJURY                    MD         SPAR   
51487  SUSPECTED MINOR INJURY                    MD      PONTIAC   
51488         POSSIBLE INJURY                    MD         FORD   
51489      NO APPARENT INJURY                    MD         MACK   

      Equipment Problems   Latitude  Longitude  Fault  Day  Year  Month  ...  \
0              NO MISUSE  39.263378 -77.344203      1    5  2017      6  ...   
1              NO MISUS

In [992]:
# Turn the remaining string columns of the training frame into integer codes,
# then print the (truncated) result for a visual check.
df = ranking_encode(df)
print(df)

       Injury Severity  Drivers License State  Vehicle Make  \
0                    3                     25           216   
1                    1                     25           157   
3                    2                     25            75   
4                    3                     25           352   
5                    1                     25           667   
...                ...                    ...           ...   
51480                2                     59           249   
51484                1                     25           588   
51487                3                     25           537   
51488                2                     25           157   
51489                1                     25           387   

       Equipment Problems   Latitude  Longitude  Fault  Day  Year  Month  ...  \
0                       4  39.263378 -77.344203      1    5  2017      6  ...   
1                       4  39.068913 -77.063227      1   22  2020      2  ...   


In [993]:
# Feature matrix: every column except the 'Fault' target, as a NumPy array.
feature_mask = df.columns != 'Fault'
X_train = df.loc[:, feature_mask].values
# Target vector.
Y_train = df['Fault'].values

# Report the shapes of both arrays.
for split in (X_train, Y_train):
    print(split.shape)

(23918, 203)
(23918,)


In [995]:
# Load the test data (no 'Fault' column); the path is relative to the working directory.
df_test = pd.read_csv("test.csv")

In [996]:
# Row/column count of the raw test frame.
df_test.shape

(77235, 42)

In [997]:
# Column labels of the raw test frame, for comparison with the training columns.
df_test.columns

Index(['Id', 'Report Number', 'Local Case Number', 'Agency Name',
       'ACRS Report Type', 'Crash Date/Time', 'Route Type', 'Road Name',
       'Cross-Street Type', 'Cross-Street Name', 'Off-Road Description',
       'Municipality', 'Related Non-Motorist', 'Collision Type', 'Weather',
       'Surface Condition', 'Light', 'Traffic Control',
       'Driver Substance Abuse', 'Non-Motorist Substance Abuse', 'Person ID',
       'Injury Severity', 'Circumstance', 'Drivers License State',
       'Vehicle ID', 'Vehicle Damage Extent', 'Vehicle First Impact Location',
       'Vehicle Second Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Vehicle Continuing Dir', 'Vehicle Going Dir',
       'Speed Limit', 'Driverless Vehicle', 'Parked Vehicle', 'Vehicle Year',
       'Vehicle Make', 'Vehicle Model', 'Equipment Problems', 'Latitude',
       'Longitude', 'Location'],
      dtype='object')

In [998]:
# Parse the crash timestamp once and derive every calendar feature from that
# single parsed Series (previously the strings were parsed three times and the
# resulting dates re-parsed twice more).
test_timestamp = pd.to_datetime(df_test['Crash Date/Time'])
df_test['Dates'] = test_timestamp.dt.date   # calendar date (datetime.date objects)
df_test['Time'] = test_timestamp.dt.time    # time of day (datetime.time objects)
df_test['Day'] = test_timestamp.dt.day      # day of the month
df_test['Year'] = test_timestamp.dt.year
df_test['Month'] = test_timestamp.dt.month

In [999]:
# Keep the row identifiers before the 'Id' column is dropped; they are written
# next to the predictions in the submission file.
ids = df_test['Id']

In [1000]:
# Show per-column non-null counts and dtypes of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77235 entries, 0 to 77234
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Id                              77235 non-null  int64  
 1   Report Number                   77235 non-null  object 
 2   Local Case Number               67162 non-null  object 
 3   Agency Name                     77235 non-null  object 
 4   ACRS Report Type                77235 non-null  object 
 5   Crash Date/Time                 77235 non-null  object 
 6   Route Type                      69768 non-null  object 
 7   Road Name                       70166 non-null  object 
 8   Cross-Street Type               69755 non-null  object 
 9   Cross-Street Name               70164 non-null  object 
 10  Off-Road Description            7069 non-null   object 
 11  Municipality                    8498 non-null   object 
 12  Related Non-Motorist            

In [1001]:
# Positional indices into df_test.columns of the columns to discard:
#   0 Id, 1 Report Number, 2 Local Case Number, 5 Crash Date/Time, 7 Road Name,
#   9 Cross-Street Name, 10 Off-Road Description, 11 Municipality,
#   12 Related Non-Motorist, 19 Non-Motorist Substance Abuse, 20 Person ID,
#   22 Circumstance, 24 Vehicle ID, 28 Vehicle Body Type, 35 Vehicle Year,
#   37 Vehicle Model, 41 Location, 42 Dates, 43 Time
cols = [0, 1, 2, 5, 7, 9, 10, 11, 12, 19, 20, 22, 24, 28, 35, 37, 41, 42, 43]

In [1002]:
# Columns excluded from the test features, listed by name so the selection does
# not depend on column positions and a missing column raises a KeyError instead
# of silently dropping a different one.
test_drop_columns = [
    'Id', 'Report Number', 'Local Case Number', 'Crash Date/Time', 'Road Name',
    'Cross-Street Name', 'Off-Road Description', 'Municipality',
    'Related Non-Motorist', 'Non-Motorist Substance Abuse', 'Person ID',
    'Circumstance', 'Vehicle ID', 'Vehicle Body Type', 'Vehicle Year',
    'Vehicle Model', 'Location', 'Dates', 'Time',
]
df_test = df_test.drop(columns=test_drop_columns)

In [1003]:
# Column labels that remain in the test frame after the drop.
df_test.keys()


Index(['Agency Name', 'ACRS Report Type', 'Route Type', 'Cross-Street Type',
       'Collision Type', 'Weather', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse', 'Injury Severity',
       'Drivers License State', 'Vehicle Damage Extent',
       'Vehicle First Impact Location', 'Vehicle Second Impact Location',
       'Vehicle Movement', 'Vehicle Continuing Dir', 'Vehicle Going Dir',
       'Speed Limit', 'Driverless Vehicle', 'Parked Vehicle', 'Vehicle Make',
       'Equipment Problems', 'Latitude', 'Longitude', 'Day', 'Year', 'Month'],
      dtype='object')

In [1004]:
# Row/column count of the test frame after the drop (no rows are removed from test).
df_test.shape

(77235, 28)

In [1005]:
# NOTE(review): repeats the shape check from the previous cell; one of the two can be removed.
df_test.shape

(77235, 28)

In [1006]:
# One-hot encode the nominal columns of the test frame and print the result.
# NOTE(review): the indicator columns are derived from the values present in
# df_test alone, so they are not guaranteed to match the training columns.
df_test = one_hot_encode(df_test)
print(df_test)

              Injury Severity Drivers License State  Vehicle Make  \
0          NO APPARENT INJURY                    MD         LEXUS   
1          NO APPARENT INJURY                    MD    MITSUBISHI   
2          NO APPARENT INJURY                    MD  FRIEGHTLINER   
3          NO APPARENT INJURY                    MD          FORD   
4          NO APPARENT INJURY                   NaN          UTIL   
...                       ...                   ...           ...   
77230      NO APPARENT INJURY                    MD    PETERBUILT   
77231      NO APPARENT INJURY                    DC        TOYOTA   
77232  SUSPECTED MINOR INJURY                    MD         MAZDA   
77233      NO APPARENT INJURY                    MD          CADI   
77234      NO APPARENT INJURY                    MD          NISS   

      Equipment Problems   Latitude  Longitude  Day  Year  Month  \
0              NO MISUSE  38.988440 -77.127668   16  2015     12   
1              NO MISUSE  39.094075

In [1007]:
# Integer-encode the remaining string columns of the test frame.
# NOTE(review): called without anything learned from the training frame, so the
# integer assigned to a given string here need not equal its training code.
df_test = ranking_encode(df_test)

In [1008]:
# Fit the forest on the encoded training matrix. A fixed random_state makes the
# run reproducible; n_jobs=-1 builds the 5000 trees on all available cores.
classifier = RandomForestClassifier(n_estimators=5000, n_jobs=-1, random_state=42)
classifier.fit(X_train, Y_train)

# Train and test were one-hot encoded separately, so the test frame can lack
# indicator columns that exist in training, carry extra ones, or order them
# differently. Realign it to exactly the feature columns X_train was built from:
# indicators missing from test become 0, columns unseen in training are dropped.
feature_columns = df.columns[df.columns != 'Fault']
X_test = df_test.reindex(columns=feature_columns, fill_value=0).values
y_pred = classifier.predict(X_test)

In [1009]:
# Wrap the predictions in a single-column frame named 'Fault'. This rebinds
# y_pred from the raw prediction array to a DataFrame.
y_pred = pd.DataFrame(data = np.array(y_pred), columns = ['Fault'])
# Place the saved ids next to the predictions; concat pairs rows by index label.
sub = pd.concat([ids, y_pred], axis=1)
# Write the submission without the index column.
sub.to_csv("submission.csv", index=False)