In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(current_dir, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing the required libraries
import xml.etree.ElementTree as Xet
def xml_toDF(path):
    """Parse an XML file into a two-column DataFrame.

    Every direct child of the XML root becomes one row.  The text of the
    child's ``<date>`` element is stored under ``"timestamp"`` and the text of
    its ``<description>`` element under ``"description"``.

    Parameters
    ----------
    path : str or file-like
        Location of the XML document, passed straight to ``ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``["timestamp", "description"]``, one row per root child, in
        document order.  A value is ``None`` when the corresponding element is
        missing or has no text.
    """
    cols = ["timestamp", "description"]

    def _child_text(node, tag):
        # ``find`` returns None for a missing tag; guard it so one incomplete
        # entry does not abort the whole parse with an AttributeError.
        child = node.find(tag)
        return None if child is None else child.text

    root = Xet.parse(path).getroot()
    rows = [
        {
            "timestamp": _child_text(entry, "date"),
            "description": _child_text(entry, "description"),
        }
        for entry in root
    ]

    return pd.DataFrame(rows, columns=cols)

In [None]:
# Load the training set and the test set from the competition input directory.
train_full = pd.read_csv('../input/car-crashes-severity-prediction/train.csv')
test_df = pd.read_csv('../input/car-crashes-severity-prediction/test.csv')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
train_full.info()
test_df.info()

In [None]:

from datetime import datetime
def generate_holidays_column(df, holidays_path='/kaggle/input/car-crashes-severity-prediction/holidays.xml'):
    """Build the ``holidays`` feature for every row of ``df``.

    NOTE: despite the column name, the value is 1 for an ordinary working day
    (not Saturday/Sunday and not a date with a described entry in the holidays
    file) and 0 for weekends and listed holidays.  This polarity is what the
    feature has always contained, so it is kept as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'ID'`` and ``'timestamp'``.  The date is taken as the
        text before the first space of the timestamp
        (assumes it starts with ``YYYY-MM-DD`` — TODO confirm for new data).
    holidays_path : str, optional
        XML file read through ``xml_toDF``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['ID', 'holidays']``, one row per input row, in input order,
        with a fresh default index.  ``df`` itself is not modified.
    """
    # Calendar date of each record.
    dates = df['timestamp'].astype(str).str.split(' ').str[0]

    # True on Monday-Friday.
    is_weekday = ~pd.to_datetime(dates).dt.day_name().isin(['Sunday', 'Saturday'])

    # Dates that carry a description in the holidays file.  A membership test
    # (rather than a merge) cannot multiply rows when the file repeats a date.
    hol = xml_toDF(holidays_path)
    described_dates = set(hol.loc[hol['description'].notnull(), 'timestamp'])
    is_holiday = dates.isin(described_dates)

    working_day = (is_weekday & ~is_holiday).astype(int)

    # Combine positionally so the result does not depend on df's index labels.
    return pd.DataFrame({'ID': df['ID'].to_numpy(), 'holidays': working_day.to_numpy()})

# Holiday feature for the training and the test set (copies keep the raw frames untouched).
hd_df = generate_holidays_column(train_full.copy())
t1 = generate_holidays_column(test_df.copy())

# info() prints directly and returns None; print(info()) added a stray "None".
hd_df.info()
t1.info()
t1.head()


In [None]:
# Correlation matrix of the holiday feature frame, drawn as a heatmap.
f, ax = plt.subplots(figsize=(15, 10))
corrmatrix = hd_df.corr()
sns.heatmap(corrmatrix, square=True, vmax=.8, ax=ax)

In [None]:
def dayOrNight(column , index):
    """Derive daytime and rush-hour flags from timestamps.

    Parameters
    ----------
    column : iterable
        Timestamps formatted ``'%Y-%m-%d %H:%M:%S'``, optionally followed by
        ``.<fraction>`` (everything from the first ``.`` on is ignored).
    index : array-like
        One identifier per timestamp; stored positionally in the ``ID`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``['ID', 'dOrN', 'rush']`` with a default index:
        ``dOrN`` is 1 for hours 6-19 inclusive, else 0;
        ``rush`` is 1 for hours 7-9 or 16-19 inclusive, else 0.
    """
    # Parse with pandas rather than ``datetime.strptime`` so the function does
    # not depend on whether the notebook bound ``datetime`` to the module or to
    # the class.
    stamps = pd.Series(list(column), dtype=object).astype(str).str.split(".").str[0]
    hours = pd.to_datetime(stamps, format='%Y-%m-%d %H:%M:%S').dt.hour

    is_day = (hours >= 6) & (hours <= 19)
    is_rush = ((hours >= 7) & (hours <= 9)) | ((hours >= 16) & (hours <= 19))

    # np.asarray drops any index labels on ``index`` so IDs are assigned by
    # position instead of being index-aligned (which yields NaN on mismatch).
    return pd.DataFrame({
        "ID": np.asarray(index),
        "dOrN": is_day.astype(int),
        "rush": is_rush.astype(int),
    })
# Key both frames by the data's own 'ID' column: the later merges join on 'ID',
# and the training call previously passed the positional row index instead.
don = dayOrNight(train_full['timestamp'].copy(), train_full['ID'])
t2  = dayOrNight(test_df['timestamp'].copy(), test_df['ID'])
 

In [None]:
from sklearn.impute import SimpleImputer

# Imputer configured to replace NaN entries with each column's most frequent value.
# NOTE(review): not referenced in any of the visible cells — confirm it is used
# somewhere before relying on or removing it.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Hand-assigned numeric score for each weather description.
# 'Cloudy / Windy' used to be listed twice (50, then 30); Python keeps the last
# value, so the effective score of 30 is what is recorded here.
_WEATHER_SCORES = {
    'Rain': 200,
    'Heavy Rain': 250,
    'Light Thunderstorms and Rain': 400,
    'Fog': 80,
    'Shallow Fog': 40,
    'Mostly Cloudy / Windy': 70,
    'Partly Cloudy / Windy': 50,
    'Haze': 60,
    'Smoke': 40,
    'Partly Cloudy': 70,
    'Mostly Cloudy': 90,
    'Overcast': 0,
    'Fair': 0,
    'Clear': 0,
    'Scattered Clouds': 15,
    'Light Rain': 300,
    'Fair / Windy': 40,
    'Cloudy / Windy': 30,
    'Light Rain / Windy': 350,
    'Light Drizzle': 60,
    'Mist': 110,
    'Rain / Windy': 200,
    'Patches of Fog': 50,
    'Squalls': 500,
    'Fog / Windy': 100,
}


def extract_weather(df, weather_path='../input/car-crashes-severity-prediction/weather-sfcsv.csv'):
    """Join hourly weather onto the accident records and make the result numeric.

    Steps:
      1. Reduce each accident timestamp to an hour key ``'YYYY-MM-DD HH'``.
      2. Build the same key for the weather table from its Year/Month/Day/Hour
         columns, keep the first weather row per key, and left-join it on.
      3. Drop ``timestamp, Bump, No_Exit, Give_Way, Side, Roundabout``; cast
         ``Stop, Crossing, Junction, Railway, Amenity`` to int.
      4. Fill missing numeric values with the column mean of the frame being
         processed (so train and test are each filled with their own means).
      5. Replace ``Weather_Condition`` by its score from ``_WEATHER_SCORES``;
         unknown or missing conditions get the mean score of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Accident records containing ``timestamp`` and the columns named above.
        Not modified.
    weather_path : str, optional
        CSV with the hourly weather observations.

    Returns
    -------
    pandas.DataFrame
        New frame, one row per input row, in input order.
    """
    # The first 13 characters are 'YYYY-MM-DD HH'.  Slicing from the left (not
    # [0:-6]) also works when the timestamp carries fractional seconds.
    accidents = df.assign(timestamp=df['timestamp'].astype(str).str[:13])

    weather = pd.read_csv(weather_path)
    year = weather['Year'].astype(str)
    month, day, hour = (
        weather[part].astype(str).str.zfill(2) for part in ('Month', 'Day', 'Hour')
    )
    weather_date = year + "-" + month + "-" + day + " " + hour

    weather = pd.concat([weather, weather_date], axis=1)
    # Positional rename: its purpose here is to name the appended key column
    # 'timestamp'.  Assumes the CSV has exactly these 12 columns in this
    # order — TODO confirm against the file.
    weather.columns = ['Year', 'Day', 'Month', 'Hour', 'Weather_Condition', 'Wind_Chill(F)',
                       'Precipitation(in)', 'Temperature(F)', 'Humidity(%)', 'Wind_Speed(mph)',
                       'Visibility(mi)', 'Selected', 'timestamp']
    weather = (
        weather
        .drop_duplicates(subset='timestamp')
        .drop(['Year', 'Day', 'Month', 'Hour', 'Selected', 'Precipitation(in)'], axis=1)
    )

    merged = pd.merge(accidents, weather, on='timestamp', how='left')
    merged = merged.drop(['timestamp', 'Bump', 'No_Exit', 'Give_Way', 'Side', 'Roundabout'], axis=1)

    flag_cols = ['Stop', 'Crossing', 'Junction', 'Railway', 'Amenity']
    float_cols = ['Wind_Chill(F)', 'Wind_Speed(mph)', 'Visibility(mi)']
    merged = merged.astype({**dict.fromkeys(flag_cols, int), **dict.fromkeys(float_cols, float)})

    # numeric_only=True: Weather_Condition is still text at this point and
    # pandas >= 2.0 raises a TypeError when asked for the mean of it.
    merged = merged.fillna(merged.mean(numeric_only=True))

    scores = merged['Weather_Condition'].map(_WEATHER_SCORES)
    merged['Weather_Condition'] = scores.fillna(scores.mean())
    return merged

g = extract_weather(train_full.copy())
t33 = extract_weather(test_df.copy())

# import label encoder
from sklearn.preprocessing import LabelEncoder

# Work on copies so the frames returned by extract_weather stay untouched.
data = g.copy()
t3 = t33.copy()

# Fit ONE encoder on the labels of both sets and reuse it for each of them.
# Calling fit_transform separately on the test set builds a second, unrelated
# mapping, so the same label would get different codes in train and test.
le = LabelEncoder()
train_labels = data['Weather_Condition'].astype(str)
test_labels = t3['Weather_Condition'].astype(str)
le.fit(pd.concat([train_labels, test_labels], ignore_index=True))
data['weather_encoded'] = le.transform(train_labels)
t3['weather_encoded'] = le.transform(test_labels)

data.info()
t3.info()

In [None]:
# Annotated correlation heatmap of the weather-enriched training frame.
f, ax = plt.subplots(figsize=(30, 30))
corrmatrix = data.corr()
sns.heatmap(corrmatrix, annot=True, square=True, vmax=.8, ax=ax);

In [None]:
# Attach the holiday feature to the weather-enriched frames (inner join on 'ID').
data_with_holidays = pd.merge(data, hd_df, on='ID')
dwh = pd.merge(t3, t1, on='ID')

# The first positional argument of DataFrame.info is `verbose`, so info(50)
# never set a column limit; pass it by keyword instead.
dwh.info(max_cols=50)

In [None]:
# Add the day/night and rush-hour flags to obtain the final train and test frames.
final_df = pd.merge(data_with_holidays, don, on='ID')
# info() prints by itself and returns None; print(info()) added a stray "None".
dwh.info()
final_test = pd.merge(dwh, t2, on='ID')

final_df.info()
final_test.info()

In [None]:
# Annotated correlation heatmap over every column of the final training frame.
f, ax = plt.subplots(figsize=(30, 30))
corrmatrix = final_df.corr()
sns.heatmap(corrmatrix, annot=True, square=True, vmax=.8, ax=ax)


In [None]:
# Number of missing values per column of the final training frame.
final_df.isna().sum()

In [None]:
# How many rows fall into each Severity class.
final_df.loc[:, 'Severity'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation, stratified on the Severity label.
y = final_df['Severity']
train_df, val_df = train_test_split(
    final_df,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Features are every column except the row identifier and the target.
X_train, y_train = train_df.drop(['ID', 'Severity'], axis=1), train_df['Severity']
X_val, y_val = val_df.drop(['ID', 'Severity'], axis=1), val_df['Severity']

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create an instance of the classifier
# Build a depth-2 random forest with a fixed seed and train it in one step
# (fit() returns the fitted estimator itself).
classifier = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted classifier on the held-out validation rows.
val_accuracy = classifier.score(X_val, y_val)
print("The accuracy of the classifier on the validation set is ", val_accuracy)

In [None]:
# Remove the identifier so only model features remain.  Rebinding the name with
# errors='ignore' (instead of inplace=True) keeps this cell safe to re-run: a
# second in-place drop raised a KeyError because 'ID' was already gone.
final_test = final_test.drop(columns=['ID'], errors='ignore')

In [None]:
# Predict a severity for every test row and write the submission file.
predict = classifier.predict(final_test)

# The test frame is `test_df`; `test_full` is never defined in this notebook
# and raised a NameError here.
# NOTE(review): assumes final_test still has one row per test_df row in the
# same order (it was built from test_df through joins on 'ID') — confirm.
output = pd.DataFrame(predict, columns=['Severity'], index=test_df['ID'])
output.to_csv('submission.csv')
output.head()