In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

In [None]:
%matplotlib inline

In [None]:
# Project root folder. Set the ROAD_SAFETY_DIR environment variable to run the
# notebook on another machine; the original local folder remains the default.
path = os.environ.get('ROAD_SAFETY_DIR', r'C:\Users\nukis\Documents\Projects\08. Road Safety')

In [None]:
# Load the prepared crash table.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
crash_pickle = os.path.join(path, '01. Data', 'Prepared data', 'crash_cleaned.pkl')
df_crash = pd.read_pickle(crash_pickle)

In [None]:
# Load the prepared unit/casualty table.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
uncas_pickle = os.path.join(path, '01. Data', 'Prepared data', 'unitcasualty_cleaned.pkl')
df_uncas = pd.read_pickle(uncas_pickle)

### 2. Merging Dataframes

In [None]:
# Left join: every crash row is kept and matched to unit/casualty rows by REPORT_ID
df = pd.merge(df_crash, df_uncas, how = 'left', on = 'REPORT_ID')

In [None]:
# Show every row, and as many columns as the merged frame has, when displaying

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', df.shape[1])

In [None]:
# (rows, columns) of the merged frame
df.shape

In [None]:
# Column names, dtypes and non-null counts of the merged frame
df.info()

### 3. Data Cleaning

In [None]:
# Normalise text columns: trim surrounding whitespace, title-case the text and
# turn placeholder strings into missing values.

cat = df.select_dtypes(include = ['object']).columns

# Placeholders are written in the form they take AFTER title-casing
# (e.g. 'XX' becomes 'Xx', 'unknown' becomes 'Unknown').
placeholders = ['Unknown', 'N/A', 'Xx', 'Xxx', 'Xxxx']

for col in cat:
    # Only genuine strings are transformed; NaN/None and any other non-string
    # value pass through unchanged. Previously one such value raised inside
    # the lambda and a bare `except` skipped the whole column, so its
    # placeholders were never converted to missing values.
    cleaned = df[col].map(lambda value: value.strip().title() if isinstance(value, str) else value)
    # np.nan instead of np.NaN: the NaN alias was removed in NumPy 2.0.
    df[col] = cleaned.mask(cleaned.isin(placeholders), np.nan)

In [None]:
# Number of missing values in each column

df.isna().sum()

In [None]:
# Share of missing values per column, in percent (rounded to 2 decimals),
# shown as a single-row table

null_counts = df.isnull().sum().to_list()
row_total = len(df)
pct_missing = [round(n_null / row_total * 100, 2) for n_null in null_counts]
pd.DataFrame(data = [pct_missing], columns = df.columns, index = ['Missing Values %'])

In [None]:
# Keep only the rows that have no missing value in any column, then show the new shape

df = df.dropna(axis = 0, how = 'any')
df.shape

In [None]:
# Re-count missing values per column after the drop

df.isna().sum()

In [None]:
# Check for duplicates
# keep = 'last' flags every copy of a repeated row except its final occurrence,
# so the sum is the number of rows that drop_duplicates would remove.

dups = df.duplicated(keep = 'last')
dups.sum() 

In [None]:
# Remove repeated rows (the first occurrence is kept) and confirm that none remain

df = df.drop_duplicates(keep = 'first')
dups = df.duplicated(keep = 'first')
dups.sum()

In [None]:
# Check for mixed-type data in dataframe: print the name of every column whose
# values are not all of the same Python type as the column's first value.
# No output means no mixed-type columns were found.

for col in df.columns.tolist():
  # Series.map(type) replaces DataFrame.applymap, which newer pandas deprecates.
  value_types = df[col].map(type)
  # An empty column has no first value to compare against and cannot be mixed.
  if value_types.empty:
    continue
  weird = value_types != value_types.iloc[0]
  if weird.any():
    print (col) # this column holds more than one value type

In [None]:
# (rows, columns) after dropping missing values and duplicate rows
df.shape

In [None]:
# Column dtypes and non-null counts after cleaning, checked before the dtype conversion
df.info()

In [None]:
# Cast the numeric columns to explicit dtypes in a single call

df = df.astype({
    'Age': 'int64',
    'DUI Involved': 'int64',
    'Drugs Involved': 'int64',
    'Lat': 'float64',
    'Lon': 'float64',
    'Veh Year': 'float64',
})

### 4. Data Pre-Processing

#### Hour Grouping
##### The time of day is reduced to its hour: only the first two characters of `Hour` are kept.

In [None]:
# Keep the first n characters of every 'Hour' value
n = 2
hourly = [stamp[:n] for stamp in df['Hour']]

In [None]:
# Store the extracted hour strings as an integer column, aligned row-for-row with df
df['Hourly'] = pd.Series(hourly, index = df.index).astype('int64')

In [None]:
# The merge key and the raw hour string are dropped from the frame
df = df.drop(['REPORT_ID', 'Hour'], axis = 1)

#### Day Grouping

In [None]:
# Label each row as 'Weekday' or 'Weekend'; rows whose 'Day' matches neither
# list are left without a group.
day_groups = {
    'Weekday': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
    'Weekend': ['Saturday', 'Sunday'],
}
for group_name, day_names in day_groups.items():
    df.loc[df['Day'].isin(day_names), 'Day Group'] = group_name

#### Month Grouping

In [None]:
# Assign each row its calendar quarter based on the month name; rows whose
# 'Month' matches none of the names are left without a group.
quarter_months = {
    'Q1': ['January', 'February', 'March'],
    'Q2': ['April', 'May', 'June'],
    'Q3': ['July', 'August', 'September'],
    # Fixed: October-December were previously labelled 'Q1'.
    'Q4': ['October', 'November', 'December'],
}
for quarter, month_names in quarter_months.items():
    df.loc[df['Month'].isin(month_names), 'Month Group'] = quarter

#### Target Column

In [None]:
# The severity column becomes the modelling target, so it is renamed to 'Target'

df = df.rename({'CSEF Severity' : 'Target'}, axis = 'columns')

In [None]:
# Drop the first three characters of every 'Target' value
target_trim = [severity[3:] for severity in df['Target']]

In [None]:
# Overwrite 'Target' with the trimmed values (assigned by position, one per row)
df['Target'] = target_trim

In [None]:
# Final check of columns and dtypes before exporting
df.info()

In [None]:
# Export the cleaned data as both a pickle and a CSV file

prepared_dir = os.path.join(path, '01. Data', 'Prepared data')
df.to_pickle(os.path.join(prepared_dir, 'road_safety_cleaned.pkl'))
df.to_csv(os.path.join(prepared_dir, 'road_safety_cleaned.csv'), sep = ',')

#### Accident Severity Analysis

In [None]:
labels = ['Property Damage Only', 'Minor Injury', 'Serious Injury', 'Fatality']
colors = ['#FEF9A7', '#FAC213', '#F77E21', '#D61C4E']
myexplode = (0.05, 0.05, 0.05, 0.2)  # the fourth wedge is pulled out further

# NOTE(review): value_counts() orders by frequency and the labels above are
# attached by position - this assumes exactly four severities whose counts
# fall in this order; confirm against the data.
severity_counts = df['Target'].value_counts()

fig, ax = plt.subplots()
ax.pie(
    severity_counts,
    labels = labels,
    colors = colors,
    explode = myexplode,
    autopct = '%1.1f%%',
    startangle = 90,
    wedgeprops = {'linewidth': 3.0, 'edgecolor': 'white'},
)
ax.set_title('Accident Severity', fontsize = 14)
plt.show()

In [None]:
# Helper for drawing a count plot of one column

def countplot(x):
    """Draw a count plot of column `x` taken from the notebook-level `df`.

    A new 8x6 figure is opened and the bars are ordered as returned by
    `df[x].value_counts()`, coloured with the 'mako_r' palette.
    Nothing is returned.
    """
    by_frequency = df[x].value_counts().index
    plt.figure(figsize = (8, 6))
    sns.countplot(data = df, x = x, order = by_frequency, palette = 'mako_r')

In [None]:
# Count of records for each 'Target' value
countplot('Target')

In [None]:
# Export the data as both a pickle and a CSV file
# NOTE(review): this writes the same two files as the export before the
# severity plots, and no cell in between modifies df - likely redundant.

export_dir = os.path.join(path, '01. Data', 'Prepared data')
df.to_pickle(os.path.join(export_dir, 'road_safety_cleaned.pkl'))
df.to_csv(os.path.join(export_dir, 'road_safety_cleaned.csv'), sep = ',')