In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
sns.set()

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

In [2]:
df = pd.read_csv('train.csv',index_col=False)
df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-2017855,MapQuest,201.0,2,2018-07-19 20:30:23,2018-07-19 21:14:11,34.153896,-118.275482,,,...,False,False,False,False,False,False,Night,Day,Day,Day
1,A-3340193,Bing,,2,2020-12-27 13:22:48,2020-12-27 15:02:42,40.261747,-75.25002,40.263001,-75.251072,...,False,False,False,False,False,False,Day,Day,Day,Day
2,A-3274372,Bing,,2,2020-12-19 20:27:52,2020-12-19 22:23:39,29.980875,-90.073829,29.981676,-90.073111,...,False,False,False,False,False,False,Night,Night,Night,Night
3,A-2782559,Bing,,3,2016-09-27 17:29:27,2016-09-27 23:29:27,39.01887,-77.10289,39.01958,-77.10708,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-3722269,Bing,,2,2020-02-11 19:22:00,2020-02-11 23:22:00,45.74394,-120.17567,45.74394,-120.17567,...,False,False,False,False,False,False,Night,Night,Night,Night


In [3]:
df['Severity'] = (df['Severity'] > 2) * 1
df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-2017855,MapQuest,201.0,0,2018-07-19 20:30:23,2018-07-19 21:14:11,34.153896,-118.275482,,,...,False,False,False,False,False,False,Night,Day,Day,Day
1,A-3340193,Bing,,0,2020-12-27 13:22:48,2020-12-27 15:02:42,40.261747,-75.25002,40.263001,-75.251072,...,False,False,False,False,False,False,Day,Day,Day,Day
2,A-3274372,Bing,,0,2020-12-19 20:27:52,2020-12-19 22:23:39,29.980875,-90.073829,29.981676,-90.073111,...,False,False,False,False,False,False,Night,Night,Night,Night
3,A-2782559,Bing,,1,2016-09-27 17:29:27,2016-09-27 23:29:27,39.01887,-77.10289,39.01958,-77.10708,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-3722269,Bing,,0,2020-02-11 19:22:00,2020-02-11 23:22:00,45.74394,-120.17567,45.74394,-120.17567,...,False,False,False,False,False,False,Night,Night,Night,Night


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2962779 entries, 0 to 2962778
Data columns (total 49 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   TMC                    float64
 3   Severity               int32  
 4   Start_Time             object 
 5   End_Time               object 
 6   Start_Lat              float64
 7   Start_Lng              float64
 8   End_Lat                float64
 9   End_Lng                float64
 10  Distance.mi.           float64
 11  Description            object 
 12  Number                 float64
 13  Street                 object 
 14  Side                   object 
 15  City                   object 
 16  County                 object 
 17  State                  object 
 18  Zipcode                object 
 19  Country                object 
 20  Timezone               object 
 21  Airport_Code           object 
 22  Weather_Timestamp 

In [6]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
df['End_Time'] = pd.to_datetime(df['End_Time'], errors='coerce')

# Extract year, month, day, hour and weekday
df['month']=df['Start_Time'].dt.strftime('%b')
df['Hour']=df['Start_Time'].dt.hour
df['Weekday']=df['Start_Time'].dt.strftime('%a')

# Extract the amount of time in the unit of minutes for each accident, round to the nearest integer
timed='Time_Duration(min)'
df[timed]=round((df['End_Time']-df['Start_Time'])/np.timedelta64(1,'m'))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2962779 entries, 0 to 2962778
Data columns (total 53 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   ID                     object        
 1   Source                 object        
 2   TMC                    float64       
 3   Severity               int32         
 4   Start_Time             datetime64[ns]
 5   End_Time               datetime64[ns]
 6   Start_Lat              float64       
 7   Start_Lng              float64       
 8   End_Lat                float64       
 9   End_Lng                float64       
 10  Distance.mi.           float64       
 11  Description            object        
 12  Number                 float64       
 13  Street                 object        
 14  Side                   object        
 15  City                   object        
 16  County                 object        
 17  State                  object        
 18  Zipcode               

In [7]:
# Check if there is any negative time_duration values
df[timed][df[timed]<=0]

Series([], Name: Time_Duration(min), dtype: float64)

In [9]:
# Drop the rows with td<0

neg_outliers=df[timed]<=0

# Set outliers to NAN
df[neg_outliers] = np.nan

# Drop rows with negative td
df.dropna(subset=[timed],axis=0,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2962779 entries, 0 to 2962778
Data columns (total 53 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   ID                     object        
 1   Source                 object        
 2   TMC                    float64       
 3   Severity               float64       
 4   Start_Time             datetime64[ns]
 5   End_Time               datetime64[ns]
 6   Start_Lat              float64       
 7   Start_Lng              float64       
 8   End_Lat                float64       
 9   End_Lng                float64       
 10  Distance.mi.           float64       
 11  Description            object        
 12  Number                 float64       
 13  Street                 object        
 14  Side                   object        
 15  City                   object        
 16  County                 object        
 17  State                  object        
 18  Zipcode               

In [11]:
# Remove outliers for Time_Duration(min): n * standard_deviation (n=3), backfill with median

n=3

median = df[timed].median()
std = df[timed].std()
outliers = (df[timed] - median).abs() > std*n

# Set outliers to NAN
df[outliers] = np.nan

# Fill NAN with median
df[timed].fillna(median, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2962779 entries, 0 to 2962778
Data columns (total 53 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   ID                     object        
 1   Source                 object        
 2   TMC                    float64       
 3   Severity               float64       
 4   Start_Time             datetime64[ns]
 5   End_Time               datetime64[ns]
 6   Start_Lat              float64       
 7   Start_Lng              float64       
 8   End_Lat                float64       
 9   End_Lng                float64       
 10  Distance.mi.           float64       
 11  Description            object        
 12  Number                 float64       
 13  Street                 object        
 14  Side                   object        
 15  City                   object        
 16  County                 object        
 17  State                  object        
 18  Zipcode               

In [None]:
df['Severity']

In [12]:
# Print time_duration information
print('Max time to clear an accident: {} minutes or {} hours or {} days; Min to clear an accident td: {} minutes.'.format(df[timed].max(),round(df[timed].max()/60), round(df[timed].max()/60/24), df[timed].min()))

Max time to clear an accident: 10588.0 minutes or 176 hours or 7 days; Min to clear an accident td: 1.0 minutes.


In [17]:
feature_lst=['Source','TMC','Severity','Start_Lng','Start_Lat',
             'Distance.mi.','Side','City','County','State','Timezone',
             'Temperature.F.','Humidity...','Pressure.in.', 
             'Visibility.mi.', 'Wind_Direction','Weather_Condition',
             'Amenity','Bump','Crossing','Give_Way','Junction','No_Exit',
             'Railway','Roundabout','Station','Stop','Traffic_Calming',
             'Traffic_Signal','Turning_Loop','Sunrise_Sunset','Hour',
             'Weekday', 'Time_Duration(min)']

In [18]:
# Select the dataset to include only the selected features
df_sel=df[feature_lst].copy()

In [20]:
df_sel.to_csv('new_cleaned_train.csv')

In [22]:
# Check missing values
df_sel.isna().mean()

Source                0.0
TMC                   0.0
Severity              0.0
Start_Lng             0.0
Start_Lat             0.0
Distance.mi.          0.0
Side                  0.0
City                  0.0
County                0.0
State                 0.0
Timezone              0.0
Temperature.F.        0.0
Humidity...           0.0
Pressure.in.          0.0
Visibility.mi.        0.0
Wind_Direction        0.0
Weather_Condition     0.0
Amenity               0.0
Bump                  0.0
Crossing              0.0
Give_Way              0.0
Junction              0.0
No_Exit               0.0
Railway               0.0
Roundabout            0.0
Station               0.0
Stop                  0.0
Traffic_Calming       0.0
Traffic_Signal        0.0
Turning_Loop          0.0
Sunrise_Sunset        0.0
Hour                  0.0
Weekday               0.0
Time_Duration(min)    0.0
dtype: float64

In [23]:
df_sel.dropna(subset=df_sel.columns[df_sel.isnull().mean()!=0], how='any', axis=0, inplace=True)
df_sel.shape

(1846028, 34)

In [24]:
df_ohe = pd.get_dummies(df_sel,drop_first=True)

In [30]:
df_sel = df_ohe
df_sel.columns

Index(['TMC', 'Severity', 'Start_Lng', 'Start_Lat', 'Distance.mi.',
       'Temperature.F.', 'Humidity...', 'Pressure.in.', 'Visibility.mi.',
       'Amenity',
       ...
       'Weather_Condition_Widespread Dust', 'Weather_Condition_Wintry Mix',
       'Weather_Condition_Wintry Mix / Windy', 'Sunrise_Sunset_Night',
       'Weekday_Mon', 'Weekday_Sat', 'Weekday_Sun', 'Weekday_Thu',
       'Weekday_Tue', 'Weekday_Wed'],
      dtype='object', length=11669)

In [32]:
df_sel_chunk1, drop = train_test_split(df_sel, test_size=0.40, random_state=42)

MemoryError: Unable to allocate 20.0 GiB for an array with shape (11645, 1846028) and data type uint8

In [31]:
df_sel.to_csv('new_cleaned_train.csv')

KeyboardInterrupt: 

In [28]:
X = df_sel.drop(columns=['Severity'],inplace=True)
Y = df_sel['Severity']

MemoryError: Unable to allocate 20.0 GiB for an array with shape (11645, 1846028) and data type uint8

In [37]:
#select only 70% of the data and perform shits on that
#X_sub, x_test, Y_sub, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

In [20]:
# Generate dummies for categorical data
df_dummy = pd.get_dummies(X,drop_first=True)

In [20]:
X

Unnamed: 0,Start_Lng,Start_Lat,Distance.mi.,Wind_Speed.mph.,Hour,Time_Duration(min)
0,-118.275482,34.153896,0.000,5.8,20.0,44.0
1,-75.250020,40.261747,0.103,6.0,13.0,100.0
2,-90.073829,29.980875,0.070,12.0,20.0,116.0
3,-77.102890,39.018870,0.230,5.8,17.0,360.0
4,-120.175670,45.743940,0.000,20.0,19.0,240.0
...,...,...,...,...,...,...
2962774,-76.559240,43.060840,0.000,3.5,14.0,30.0
2962775,-96.562760,32.784410,0.000,17.0,8.0,45.0
2962776,-93.292750,44.419400,0.000,8.0,7.0,33.0
2962777,-122.121000,47.253830,0.000,13.0,8.0,29.0


In [21]:
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.2, random_state=42) 

In [None]:
num_neigh = np.arange(1,10)
acc_tr = np.empty(9)
acc_te = np.empty(9)

#trying out knn for diff values of num_neigh
for n in num_neigh:
    knn_model = KNeighborsClassifier(n_neighbors=n)
    knn_model.fit(X_tr,Y_tr)
    print("now going for ", str(n))
    acc_tr[n-1] = knn_model.score(X_tr,Y_tr)
    acc_te[n-1] = knn_model.score(X_te,Y_te)

In [20]:
dt_entropy = DecisionTreeClassifier(max_depth=8, criterion='entropy', random_state=1)


# Fit dt_entropy to the training set
dt_entropy.fit(X_tr, Y_tr)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [23]:
y_hat= dt_entropy.predict(X_te)

# Evaluate accuracy_entropy
accuracy_entropy = accuracy_score(Y_te, y_hat)


# Print accuracy_entropy
print('[Decision Tree -- entropy] accuracy_score: {:.3f}.'.format(accuracy_entropy))


[Decision Tree -- entropy] accuracy_score: 0.740.


In [18]:
rand_forest =RandomForestClassifier(n_estimators=40)

#Train the model using the training sets y_pred=clf.predict(X_test)
rand_forest.fit(X_tr,Y_tr)

y_hat2=rand_forest.predict(X_te)


# Get the accuracy score
acc=accuracy_score(Y_te, y_hat2)
acc

0.8323013736025114

In [19]:
# testing for Kaggle
kaggle_df = pd.read_csv('test.csv')
kaggle_df

Unnamed: 0,ID,Source,TMC,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance.mi.,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.010,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-5,MapQuest,201.0,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.010,...,False,False,False,False,True,False,Day,Day,Day,Day
2,A-7,MapQuest,201.0,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,,,0.000,...,False,False,False,False,False,False,Day,Day,Day,Day
3,A-14,MapQuest,201.0,2016-02-08 08:37:07,2016-02-08 09:07:07,39.790760,-84.241547,,,0.010,...,False,False,False,False,True,False,Day,Day,Day,Day
4,A-22,MapQuest,201.0,2016-02-08 10:24:27,2016-02-08 10:54:27,39.773346,-84.224686,,,0.000,...,False,False,False,False,False,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269757,A-4239383,Bing,,2019-08-23 15:24:47,2019-08-23 15:54:05,34.629370,-118.153000,34.62937,-118.1530,0.000,...,False,False,False,False,False,False,Day,Day,Day,Day
1269758,A-4239389,Bing,,2019-08-23 15:45:43,2019-08-23 16:14:31,34.043650,-118.443700,34.04934,-118.4484,0.476,...,False,False,False,False,False,False,Day,Day,Day,Day
1269759,A-4239400,Bing,,2019-08-23 17:43:56,2019-08-23 18:12:27,33.741700,-117.837100,33.73917,-117.8300,0.443,...,False,False,False,False,False,False,Day,Day,Day,Day
1269760,A-4239402,Bing,,2019-08-23 18:03:25,2019-08-23 18:32:01,34.002480,-117.379400,33.99888,-117.3709,0.543,...,False,False,False,False,False,False,Day,Day,Day,Day


In [20]:
kaggle_df['Start_Time'] = pd.to_datetime(kaggle_df['Start_Time'], errors='coerce')
kaggle_df['End_Time'] = pd.to_datetime(kaggle_df['End_Time'], errors='coerce')

# Extract year, month, day, hour and weekday
kaggle_df['month']=kaggle_df['Start_Time'].dt.strftime('%b')
kaggle_df['Hour']=kaggle_df['Start_Time'].dt.hour
kaggle_df['Weekday']=kaggle_df['Start_Time'].dt.strftime('%a')

# Extract the amount of time in the unit of minutes for each accident, round to the nearest integer
td='Time_Duration(min)'
kaggle_df[td]=round((kaggle_df['End_Time']-kaggle_df['Start_Time'])/np.timedelta64(1,'m'))
kaggle_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1269762 entries, 0 to 1269761
Data columns (total 52 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   ID                     1269762 non-null  object        
 1   Source                 1269762 non-null  object        
 2   TMC                    814829 non-null   float64       
 3   Start_Time             1269762 non-null  datetime64[ns]
 4   End_Time               1269762 non-null  datetime64[ns]
 5   Start_Lat              1269762 non-null  float64       
 6   Start_Lng              1269762 non-null  float64       
 7   End_Lat                454933 non-null   float64       
 8   End_Lng                454933 non-null   float64       
 9   Distance.mi.           1269762 non-null  float64       
 10  Description            1269761 non-null  object        
 11  Number                 463175 non-null   float64       
 12  Street                 12697

In [25]:
feature_lst_test=['Start_Lng','Start_Lat',
             'Distance.mi.','State',
             'Temperature.F.','Humidity...',
             'Visibility.mi.','Amenity',
             'Crossing','Junction','No_Exit','Railway',
             'Station','Stop','Traffic_Signal',
             'Hour','Weekday', 'Time_Duration(min)']

In [26]:
# Select the dataset to include only the selected features
kag_sel=kaggle_df[feature_lst_test].copy()

In [28]:
kag_dummy = pd.get_dummies(kag_sel,drop_first=True)

In [34]:
null_df = kag_dummy.isna().sum().to_frame()
null_df.sort_values(by=0, ascending=False)

#fill Visibility.mi., Humidity..., Temperature.F.
list_nulls = ['Visibility.mi.', 'Humidity...', 'Temperature.F.']
for i in list_nulls:
    kag_dummy[i] = kag_dummy[i].fillna(kag_dummy[i].median())

kag_dummy.isna().sum().to_frame()

Unnamed: 0,0
Start_Lng,0
Start_Lat,0
Distance.mi.,0
Temperature.F.,0
Humidity...,0
...,...
Weekday_Sat,0
Weekday_Sun,0
Weekday_Thu,0
Weekday_Tue,0


In [35]:
kag_te = rand_forest.predict(kag_dummy)
kag_te

array([0., 0., 0., ..., 0., 0., 0.])

In [36]:
kag_te.to_csv('test_results.csv')

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

# Trying a different set of columns

In [13]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TMC,1901648.0,208.348254,21.236047,200.0,201.0,201.0,201.0,406.0
Severity,2962779.0,2.30515,0.533466,1.0,2.0,2.0,3.0,4.0
Start_Lat,2962779.0,36.39779,4.964845,24.55527,33.52025,35.8255,40.18439,49.0022
Start_Lng,2962779.0,-95.466527,17.35451,-124.6238,-117.3566,-90.01895,-80.84698,-67.11317
End_Lat,1061131.0,36.898592,5.166983,24.57018,33.854,37.35016,40.72576,49.075
End_Lng,1061131.0,-98.597794,18.49527,-124.4978,-118.2073,-94.39028,-80.87304,-67.10924
Distance.mi.,2962779.0,0.336481,1.616485,0.0,0.0,0.0,0.089,441.75
Number,1081417.0,6131.773288,12495.01351,1.0,898.0,2893.0,7328.0,990415.0
Temperature.F.,2899590.0,61.488264,18.523924,-89.0,49.0,63.0,75.2,203.0
Wind_Chill.F.,1634781.0,54.903686,22.720811,-89.0,38.0,58.0,73.0,189.0


In [10]:
new_sub = df[['Severity','Start_Lat','Start_Lng','Distance.mi.','Traffic_Signal','Wind_Speed.mph.']]
new_sub.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
new_sub.isna().sum()

Severity                1
Start_Lat               1
Start_Lng               1
Distance.mi.            1
Traffic_Signal          1
Wind_Speed.mph.    229331
dtype: int64

In [12]:
#drop the rows with missing values
new_sub.dropna(subset=['Wind_Speed.mph.'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
X = new_sub.drop(columns=['Severity'],inplace=False)
Y = new_sub['Severity']

X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.2, random_state=42) 

In [15]:
rand_forest =RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
rand_forest.fit(X_tr,Y_tr)

y_hat2=rand_forest.predict(X_te)


# Get the accuracy score
acc=accuracy_score(Y_te, y_hat2)
acc

0.8199530854690154