In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Train Dataset**

In [2]:
import pandas as pd 
import datetime
import numpy as np
from sklearn.preprocessing import LabelEncoder
import numpy as np; np.random.seed(0)
import os
import matplotlib.pyplot as plt
import xml.etree.ElementTree as Xet

In [3]:
# Read the training CSV and build `dTrain`, the working frame that the
# following cells transform; end with a preview of the first rows.
train_csv_path = "../input/car-crashes-severity-prediction/train.csv"
data_train = pd.read_csv(train_csv_path)
dTrain = pd.DataFrame(data_train)
dTrain.head()

In [4]:
# Remove the 'Bump' and 'Roundabout' columns from the working frame.
# errors='ignore' keeps the cell safe to re-run: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
dTrain = dTrain.drop(columns=['Bump', 'Roundabout'], errors='ignore')

In [5]:
dTrain.groupby('Side').count()

In [6]:
dTrain.head()

In [7]:
# Cast each road-feature flag column to an integer dtype, one column at a time
# (presumably these arrive as booleans from the CSV - confirm against the data).
for flag_column in ('Crossing', 'Give_Way', 'Junction', 'No_Exit',
                    'Railway', 'Stop', 'Amenity'):
    dTrain[flag_column] = dTrain[flag_column].astype('int')
dTrain.head()

In [8]:
# Replace the 'Side' labels with integer codes; `enc` holds the fitted encoder.
enc = LabelEncoder()
dTrain['Side'] = enc.fit_transform(dTrain['Side'])

In [9]:
dTrain.groupby('Side').count()

In [10]:
dTrain.dtypes

In [11]:
# Parse the raw 'timestamp' column into pandas datetimes.
parsed_timestamps = pd.to_datetime(dTrain['timestamp'])
dTrain['timestamp'] = parsed_timestamps

In [12]:
dTrain.head()

In [13]:
# Keep only the calendar date of each timestamp in a separate 'date' column.
dTrain['date'] = dTrain['timestamp'].dt.date

In [14]:
dTrain.head()

In [15]:
dTrain.dtypes

In [16]:
# Extract the hour-of-day from the timestamp as an integer column.
hour_of_day = dTrain['timestamp'].dt.hour
dTrain['Hour'] = hour_of_day
dTrain.head()

In [17]:
# Turn each hour value into a clock-style string by appending ":00:00".
hour_text = dTrain['Hour'].map(str)
dTrain['Hour'] = hour_text + ':00:00'

In [18]:
# Parse the hour strings with the %H:%M:%S format and keep only the time-of-day part.
hour_as_datetime = pd.to_datetime(dTrain['Hour'], format='%H:%M:%S')
dTrain['Hour'] = hour_as_datetime.dt.time

In [19]:
# Render the time-of-day values back to text.
hour_strings = dTrain['Hour'].astype(str)
dTrain["Hour"] = hour_strings

In [20]:
dTrain.head()

In [21]:
# Inspect the column dtypes ('date' holds datetime.date objects and 'Hour' holds strings at this point).
dTrain.dtypes

In [22]:
# Recover the integer hour from the "HH:MM:SS" strings.
# `str.split(..., expand=True)` returns a two-column DataFrame (hour, remainder).
# Assigning that whole frame to the single 'Hour' column raises ValueError on
# current pandas, so pick the first piece explicitly before casting to int.
hour_pieces = dTrain["Hour"].str.split(":", n=1, expand=True)
dTrain["Hour"] = hour_pieces[0].astype(int)
dTrain.head()

# **Weather Dataset**

In [23]:
# Read the weather CSV into `dWeather` and preview the first rows.
weather_csv_path = "../input/car-crashes-severity-prediction/weather-sfcsv.csv"
data_weather = pd.read_csv(weather_csv_path)
dWeather = pd.DataFrame(data_weather)
dWeather.head()

In [24]:
# Convert the calendar components to text so they can be joined into a date string.
for date_part in ('Year', 'Month', 'Day'):
    dWeather[date_part] = dWeather[date_part].map(str)

In [25]:
# Join the parts as "Year-Month-Day" text, parse it, and keep only the calendar date.
weather_date_text = (
    dWeather['Year'].astype(str)
    + '-' + dWeather['Month']
    + '-' + dWeather['Day']
)
dWeather["date"] = pd.to_datetime(weather_date_text).dt.date
dWeather.head()

In [26]:
# Force every weather condition to text (a missing value becomes the string 'nan').
dWeather['Weather_Condition'] = dWeather['Weather_Condition'].map(str)

In [27]:
# The separate calendar components are no longer needed once 'date' exists.
dWeather = dWeather.drop(['Year', 'Month', 'Day'], axis=1)

In [28]:
dWeather.head()

In [29]:
# Save the cleaned weather table (row index included) next to the notebook.
weather_out_path = 'weather.csv'
dWeather.to_csv(weather_out_path)

# **Holiday Dataset**

In [30]:
cols = ["date", "description"]

# Read the holidays XML: every child of the root element contributes one record
# built from the text of its <date> and <description> children.
xmlparse = Xet.parse('../input/car-crashes-severity-prediction/holidays.xml')
root = xmlparse.getroot()
rows = [
    {"date": holiday.find("date").text,
     "description": holiday.find("description").text}
    for holiday in root
]

dholidays = pd.DataFrame(rows, columns=cols)

# Save the parsed table as CSV (dates are still raw text at this point).
dholidays.to_csv('holidays.csv')

In [31]:
# Parse the holiday date text and keep only the calendar date, the same
# representation used for the 'date' column of the train and weather frames.
holiday_dates = pd.to_datetime(dholidays['date'])
dholidays['date'] = holiday_dates.dt.date
dholidays.head()

In [32]:
dholidays.dtypes

# **Merging Datasets**

> **Merge Train with Weather by date and hour**


In [33]:
# Attach the weather reading for each accident's (date, Hour).
# A left join repeats a training row once per matching weather row, so keep a
# single reading per (date, Hour) first; otherwise accidents would be duplicated
# whenever the weather table holds several rows for the same hour.
weather_per_hour = dWeather.drop_duplicates(subset=["date", "Hour"], keep="first")
dTrain_dWeather = dTrain.merge(
    weather_per_hour,
    on=["date", "Hour"],
    how="left",
    validate="many_to_one",  # fail loudly if the right-hand keys are not unique
)
# The merge must only add columns, never rows.
assert len(dTrain_dWeather) == len(dTrain), "weather merge changed the number of training rows"

In [34]:
dTrain_dWeather.head()

> **Merge Train & Weather with Holidays by date**

In [35]:
# Left-join the holiday descriptions onto the train+weather frame by calendar date.
dfinal = pd.merge(dTrain_dWeather, dholidays, how='left', on='date')

In [36]:
dfinal.head()

In [37]:
# Impute missing weather readings with the mean of the SAME column.
# 'Temperature(F)' used to be filled with the mean of 'Precipitation(in)'
# (a copy-paste slip), which put precipitation-scale values into temperatures.
for weather_column in ('Wind_Chill(F)', 'Precipitation(in)', 'Temperature(F)'):
    column_mean = dfinal[weather_column].mean()
    dfinal[weather_column] = dfinal[weather_column].fillna(column_mean)

dfinal.head()

In [38]:
# Collapse the holiday description into a 0/1 flag:
# missing descriptions become 0, anything else becomes 1.
holiday_text = dfinal['description'].fillna(0)
dfinal["description"] = holiday_text.ne(0).astype('int64')

In [39]:
# Encode the weather condition labels as integers; `enc` holds the fitted encoder.
# After the left merge, rows without a weather match can hold NaN while the
# rest hold strings. Casting to str first gives LabelEncoder a uniformly typed
# column (NaN becomes the label 'nan') instead of a mixed str/float one.
enc = LabelEncoder()
dfinal['Weather_Condition'] = enc.fit_transform(dfinal['Weather_Condition'].astype(str))

In [40]:
dfinal.head()

In [41]:
# Store the calendar date as plain text.
dfinal['date'] = dfinal['date'].map(str)

In [42]:
dfinal.head()

In [43]:
# Save the merged, cleaned dataset (row index included) next to the notebook.
final_out_path = 'finalDataset.csv'
dfinal.to_csv(final_out_path)

# **Data Splitting**

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
train_df, val_df = train_test_split(dfinal, test_size=0.2, random_state=42) # Try adding `stratify` here

# Features are every column except the row ID and the target; the target is 'Severity'.
non_feature_columns = ['ID', 'Severity']
X_train, y_train = train_df.drop(columns=non_feature_columns), train_df['Severity']
X_val, y_val = val_df.drop(columns=non_feature_columns), val_df['Severity']

In [45]:
# Restrict both splits to the numeric columns used by the baseline model.
# This is template scaffolding: remove it once real feature engineering is in place.
selected_features = ['Lat', 'Stop', 'Hour']
X_train, X_val = X_train[selected_features], X_val[selected_features]

# **Model Training**

In [46]:
from sklearn.ensemble import RandomForestClassifier

# Build a depth-2 random forest with a fixed seed and fit it on the training split in one step.
classifier = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

In [47]:
# Score the fitted classifier on the held-out validation split and report it.
val_accuracy = classifier.score(X_val, y_val)
print("The accuracy of the classifier on the validation set is ", val_accuracy)

In [48]:
dfinal.shape

# **Submission File Generation**


In [49]:
# Load the competition test set and preview it.
test_csv_path = '../input/car-crashes-severity-prediction/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

In [50]:
X_test = test_df.drop(columns=['ID'])

# The classifier was fitted on X_train (['Lat', 'Stop', 'Hour']); predicting on
# ['Lat', 'Lng', 'Distance(mi)'] feeds it different features, which is either
# rejected by scikit-learn or silently yields meaningless predictions.
# Rebuild the training-time features for the test rows instead.
# NOTE(review): assumes test.csv carries the same raw 'timestamp' and 'Stop'
# columns as train.csv - confirm against the file.
X_test['Hour'] = pd.to_datetime(X_test['timestamp']).dt.hour
X_test['Stop'] = X_test['Stop'].astype('int')

# Use exactly the columns (and column order) the model was trained on, so this
# cell keeps working when the training feature list changes; a feature that has
# not been derived for the test frame fails here with a clear KeyError.
X_test = X_test[list(X_train.columns)]

y_test_predicted = classifier.predict(X_test)

test_df['Severity'] = y_test_predicted

test_df.head()

In [51]:
test_df.shape

In [52]:
# Write the two-column submission file without the row index.
submission = test_df[['ID', 'Severity']]
submission.to_csv('kaggle_submission.csv', index=False)