In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os
import gc

In [None]:
%matplotlib inline

In [None]:
# Project root folder. Defaults to the original local directory, but can be
# overridden through the BIKE_SAFETY_PROJECT_DIR environment variable so the
# notebook can run on another machine without editing this cell.
path = os.environ.get('BIKE_SAFETY_PROJECT_DIR', r'C:\Users\nukis\Documents\Projects\08. Bike Safety')

In [None]:
# Load the raw 2021 units table from the project's original-data folder.
units_csv = os.path.join(path, '01. Data', 'Original data', '2021_DATA_SA_Units.csv')
df_un = pd.read_csv(units_csv, low_memory=False)

In [None]:
# Display settings: no cap on printed rows, and as many columns as df_un has.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', len(df_un.columns))

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old index directly instead
# of inserting it as an 'index' column and then deleting that column in place.
df_un = df_un.reset_index(drop=True)
df_un.head()

# Data Cleaning - Units

In [None]:
# Columns of the units table that are removed before any cleaning is done.
unused_unit_columns = [
    'Unit No',
    'No Of Cas',
    'Veh Reg State',
    'Lic State',
    'Licence Class',
    'Licence Type',
    'Towing',
    'Unit Movement',
    'Number Occupants',
    'Postcode',
    'Rollover',
    'Fire',
]
df_un = df_un.drop(columns=unused_unit_columns)

In [None]:
# Number of missing values in each column

df_un.isna().sum()

In [None]:
# Share of missing values per column, as a percentage rounded to two decimals,
# shown as a single-row table.

row_count = len(df_un)
missing_pct = [round(n_missing / row_count * 100, 2) for n_missing in df_un.isnull().sum().to_list()]
pd.Series(missing_pct, index=df_un.columns, name='Missing Values %').to_frame().T

In [None]:
# Check for duplicates

# One boolean per row: True when the row repeats an earlier row in every column.
dups = df_un.duplicated()
# Displayed value is the number of rows flagged as duplicates.
dups.sum()

In [None]:
# Drop duplicates

# Rebind df_un to a frame without the repeated rows.
df_un = df_un.drop_duplicates()
# Re-run the duplicate check on the result; the displayed sum is the number of
# rows that are still flagged.
dups = df_un.duplicated()
dups.sum()

In [None]:
# Collapse the detailed 'Unit Type' labels into four broad 'Unit Involved' groups.
# Each group name lists the exact 'Unit Type' values it absorbs.
unit_type_groups = {
    'Motor Vehicle': [
        'Motor Cars - Sedan',
        'Station Wagon',
        'Utility',
        'RIGID TRUCK LGE GE 4.5T',
        'Motor Cycle',
        'Motor Vehicle - Type Unknown',
        'Panel Van',
        'OMNIBUS',
        'SEMI TRAILER',
        'BDOUBLE - ROAD TRAIN',
        'Other Defined Special Vehicle',
        'Taxi Cab',
        'Light Truck LT 4.5T',
        'Forward Control Passenger Van',
        'Motor Cars - Tourer',
        'Small Wheel Vehicle User',
    ],
    'Bicycle/Scooter': [
        'Pedal Cycle',
        'Scooter',
        'Powered Scooter (E-Scooter)',
        'Motorised Wheelchair/Gopher',
        'Power Asst. Bicycle',
        'Wheelchair / Elec. Wheelchair',
    ],
    'Railway Vehicle': [
        'Tram',
        'Railway Vehicle',
    ],
    'Obstruction': [
        'Other Fixed Obstruction',
        'Tree',
        'Pedestrian on Road',
        'Stobie Pole',
        'Guard Rail',
        'Animal - Wild',
        'Pole - not Stobie',
        'Sign Post',
        'Other Inanimate Object',
        'Traffic Signal Pole',
        'Pedestrian on Footpath/Carpark',
        'Wire Rope Barrier',
        'Animal - Domestic - Not Ridden',
        'Bridge',
    ],
}

# Invert the grouping into a flat {unit type: group} lookup.
unit_type_to_group = {
    unit_type: group
    for group, unit_types in unit_type_groups.items()
    for unit_type in unit_types
}

# One vectorised lookup replaces the per-label masked assignments. Any
# 'Unit Type' that is not listed (including a missing value) becomes NaN.
# NOTE(review): assumes 'Unit Involved' is a new column at this point; an
# existing column of that name would be overwritten entirely.
df_un['Unit Involved'] = df_un['Unit Type'].map(unit_type_to_group)

In [None]:
# 'Unit Type' has been summarised into 'Unit Involved', so remove the detailed column.
df_un = df_un.drop(columns='Unit Type')
df_un.head()

In [None]:
# Check for mixed-type data in dataframe

# Print the name of every column whose values are not all of one Python type;
# nothing is printed when no column is mixed. A column holding text plus NaN
# is reported, because NaN is a float.
# Series.map replaces the deprecated DataFrame.applymap, and counting distinct
# types (rather than comparing against row 0) also works on an empty frame.
for col in df_un.columns.tolist():
    if df_un[col].map(type).nunique() > 1:
        print(col)

In [None]:
# Placeholder written into each column wherever the value is missing.
missing_value_fill = {
    'Veh Year': 'N/A',
    'Direction Of Travel': 'Unknown',
    'Sex': 'Unknown',
    'Age': 'Unknown',
}

# Assign the filled column back instead of calling fillna(inplace=True) on
# df_un[col]: the in-place form operates on a temporary Series and, under
# pandas Copy-on-Write, never updates df_un.
for column, placeholder in missing_value_fill.items():
    df_un[column] = df_un[column].fillna(placeholder)

In [None]:
# Rows whose Age is 'XXX' or 'X' are relabelled 'Unknown'.
age_is_placeholder = df_un['Age'].isin(['XXX', 'X'])
df_un.loc[age_is_placeholder, 'Age'] = 'Unknown'

In [None]:
# Drop the first character of every unit age, leaving 'Unknown' untouched.
# NOTE(review): presumably the raw ages are zero-padded strings (e.g. '025');
# if so, a three-digit age would lose its leading digit - confirm against the data.
strip_age = [
    age_value if age_value == 'Unknown' else age_value[1:]
    for age_value in df_un['Age']
]

In [None]:
# Overwrite the Age column with the stripped values collected in strip_age.
df_un['Age'] = strip_age

In [None]:
# Preview the first rows of the cleaned units table.
df_un.head()

# Data Cleaning - Casualty

In [None]:
# Load the raw 2021 casualty table from the project's original-data folder.
casualty_csv = os.path.join(path, '01. Data', 'Original data', '2021_DATA_SA_Casualty.csv')
df_cas = pd.read_csv(casualty_csv, low_memory=False)

In [None]:
# Preview the first rows of the raw casualty table.
df_cas.head()

In [None]:
# Keep only the report key and the casualty's sex and age. The explicit .copy()
# makes df_cas an independent frame, so assigning to its columns and renaming
# it afterwards does not raise pandas' SettingWithCopyWarning.
df_cas = df_cas[['REPORT_ID', 'Sex', 'AGE']].copy()
df_cas.head()

In [None]:
# Drop the first character of every casualty age, leaving 'Unknown' untouched.
# NOTE(review): unlike the units table, 'XXX' / 'X' values are not relabelled
# to 'Unknown' before this step - confirm whether AGE can contain them.
strip_age = [
    age_value if age_value == 'Unknown' else age_value[1:]
    for age_value in df_cas['AGE']
]

In [None]:
# Overwrite the AGE column with the stripped values collected in strip_age.
df_cas['AGE'] = strip_age

In [None]:
# Rename the casualty 'Sex' column to 'SEX' so it stays distinct from the
# units table's 'Sex' column once the two tables are merged.
df_cas = df_cas.rename(columns={'Sex': 'SEX'})

In [None]:
# Preview the first rows of the trimmed casualty table.
df_cas.head()

# Merging Units & Casualty

In [None]:
# Left-join the casualty columns onto the units table by REPORT_ID, keeping
# every unit row even when no casualty record matches.
# NOTE(review): a report with several units and several casualties produces one
# row per unit/casualty pair - confirm this row multiplication is intended.
df_un_cas = pd.merge(df_un, df_cas, how='left', on='REPORT_ID')

In [None]:
# Preview the first rows of the merged units/casualty table.
df_un_cas.head()

In [None]:
# Label missing sex/age values as 'Unknown' in both the unit columns (Age, Sex)
# and the casualty columns (AGE, SEX).
# The filled column is assigned back instead of using fillna(inplace=True) on
# df_un_cas[col]: the in-place form operates on a temporary Series and, under
# pandas Copy-on-Write, never updates df_un_cas.
for column in ['Age', 'Sex', 'AGE', 'SEX']:
    df_un_cas[column] = df_un_cas[column].fillna('Unknown')

In [None]:
# Print the column names, non-null counts and dtypes of the merged table.
df_un_cas.info()

In [None]:
# For each row pick the age to keep: the casualty AGE when the unit Age is
# 'Unknown' and the casualty AGE is known, otherwise the unit Age.
# Columns are addressed by name instead of by position (iloc 4 / 7), so the
# logic no longer depends on column order, and the selection is vectorised
# rather than looping over rows with scalar iloc lookups.
# NOTE(review): assumes positions 4 and 7 were the 'Age' and 'AGE' columns - confirm.
unit_age = df_un_cas['Age']
casualty_age = df_un_cas['AGE']
use_casualty_age = (unit_age == 'Unknown') & (casualty_age != 'Unknown')
age = unit_age.mask(use_casualty_age, casualty_age).tolist()

In [None]:
# Number of collected age values.
len(age)

In [None]:
# Overwrite the Age column with the values collected in the age list.
df_un_cas['Age'] = age

In [None]:
# Rows whose Age is 'Unknown'; the displayed shape is (row count, column count).
age_unknown_mask = df_un_cas['Age'] == 'Unknown'
unknown_age = df_un_cas.loc[age_unknown_mask]
unknown_age.shape

In [None]:
# Rows whose Sex is 'Unknown'; the displayed shape is (row count, column count).
sex_unknown_mask = df_un_cas['Sex'] == 'Unknown'
unknown_sex = df_un_cas.loc[sex_unknown_mask]
unknown_sex.shape

In [None]:
# For each row pick the sex to keep: the casualty SEX when the unit Sex is
# 'Unknown' and the casualty SEX is known, otherwise the unit Sex.
# Columns are addressed by name instead of by position (iloc 3 / 6), so the
# logic no longer depends on column order, and the selection is vectorised
# rather than looping over rows with scalar iloc lookups.
# NOTE(review): assumes positions 3 and 6 were the 'Sex' and 'SEX' columns - confirm.
unit_sex = df_un_cas['Sex']
casualty_sex = df_un_cas['SEX']
use_casualty_sex = (unit_sex == 'Unknown') & (casualty_sex != 'Unknown')
sex = unit_sex.mask(use_casualty_sex, casualty_sex).tolist()

In [None]:
# Number of collected sex values.
len(sex)

In [None]:
# Overwrite the Sex column with the values collected in the sex list.
df_un_cas['Sex'] = sex

In [None]:
# Frequency of each Sex value; dropna=False also counts missing values.
df_un_cas['Sex'].value_counts(dropna=False)

In [None]:
# Frequency of each Age value; dropna=False also counts missing values.
df_un_cas['Age'].value_counts(dropna=False)

In [None]:
# The casualty SEX / AGE columns have served as fallbacks for Sex / Age; remove them.
casualty_helper_columns = ['SEX', 'AGE']
df_un_cas = df_un_cas.drop(columns=casualty_helper_columns)
df_un_cas.head()

In [None]:
# Export the merged table to the prepared-data folder as both pickle and CSV.

prepared_dir = os.path.join(path, '01. Data', 'Prepared data')
df_un_cas.to_pickle(os.path.join(prepared_dir, 'unitcasualty_cleaned.pkl'))
df_un_cas.to_csv(os.path.join(prepared_dir, 'unitcasualty_cleaned.csv'), sep=',')

In [None]:
# Run a garbage-collection pass now that the export is done.
gc.collect()