In [2]:
import pandas as pd 
import sqlite3 as sqlite 
import warnings
# Suppress all warnings for the remainder of the session.
warnings.simplefilter('ignore')

# Connection to the SQLite database file that holds the collision tables;
# the bare name on the last line shows the connection object as cell output.
connection = sqlite.connect(database='motor_vehicle_collisions.db')
connection


<sqlite3.Connection at 0x19ea533b2e0>

## **Cleaning The Vehicle Types Entries**


In [3]:
## Generate a summary table of the unique vehicle types
# The `vehicle_type` table stores one vehicle label per slot (vehicle_1 ..
# vehicle_5).  The five slots are stacked into a single column with UNION ALL
# and counted once, so every occurrence of a label contributes to its total.
#
# Why UNION ALL: a plain UNION removes duplicate rows.  When the slots were
# pre-aggregated and then merged with UNION, two slots that had the same
# (vehicle_type, collisions) pair collapsed into one row and the summed total
# was too low.  Stacking the raw values with UNION ALL avoids that.
vehicle_types_query = """
    WITH all_vehicles AS (
        SELECT vehicle_1 AS vehicle_type FROM vehicle_type
        UNION ALL
        SELECT vehicle_2 AS vehicle_type FROM vehicle_type
        UNION ALL
        SELECT vehicle_3 AS vehicle_type FROM vehicle_type
        UNION ALL
        SELECT vehicle_4 AS vehicle_type FROM vehicle_type
        UNION ALL
        SELECT vehicle_5 AS vehicle_type FROM vehicle_type
    )
    SELECT
        vehicle_type,
        COUNT(*) AS vehicles
    FROM
        all_vehicles
    WHERE
        -- drops the 'None' placeholder; NULL slots fail the comparison as well
        vehicle_type != 'None'
    GROUP BY
        vehicle_type
    ORDER BY
        vehicles DESC;
"""

# GROUP BY already returns exactly one row per vehicle_type, so no extra
# de-duplication of the result frame is needed.
vehicle_types = pd.read_sql_query(vehicle_types_query, connection)
print(vehicle_types.vehicle_type.nunique())


963


In [4]:
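# Export the summary table to a CSV file for manual cleaning.
# NOTE(review): left disabled, presumably so that re-running the notebook does
# not overwrite an edited 'vehicle_types.csv' -- confirm before re-enabling.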
#vehicle_types.to_csv('vehicle_types.csv', index=False)


In [5]:
# Load the per-collision vehicle table from CSV, then take an independent
# (deep) copy so the cleaning steps never touch the frame as it was loaded.
vehicle_type = pd.read_csv('vehicle_type.csv')
clnd_vehicle_type = vehicle_type.copy(deep=True)


In [6]:
# Show the column names of the working copy.
clnd_vehicle_type.columns


Index(['collision_id', 'vehicle_1', 'vehicle_2', 'vehicle_3', 'vehicle_4',
       'vehicle_5'],
      dtype='object')

In [7]:
# Load the reference table of vehicle-type labels from CSV and keep a deep
# copy to work with.  Note: this rebinds the name `vehicle_types`.
vehicle_types = pd.read_csv('vehicle_types.csv')
ref_vehicle_types = vehicle_types.copy(deep=True)


In [8]:
# Preview the first rows of the reference table (columns `vehicle_type` and
# `Vehicle_Type`; presumably raw label -> cleaned label, see the mapping cell).
ref_vehicle_types.head()


Unnamed: 0,vehicle_type,Vehicle_Type
0,Sedan,Sedan
1,Station Wagon/Sport Utility Vehicle,Sport Utility Vehicle
2,Bike,Bike
3,Pick-up Truck,Pick-up Truck
4,Box Truck,Box Truck


In [9]:
# Build a lookup Series from the reference table: the index is the original
# label (`vehicle_type`), the value is its replacement (`Vehicle_Type`).
mapping = ref_vehicle_types.set_index('vehicle_type').loc[:, 'Vehicle_Type']

# Translate all five vehicle columns of the working copy through the lookup.
# Series.map leaves NaN wherever a value has no entry in the lookup index.
columns_to_replace = ['vehicle_1', 'vehicle_2', 'vehicle_3', 'vehicle_4', 'vehicle_5']
clnd_vehicle_type[columns_to_replace] = clnd_vehicle_type[columns_to_replace].apply(
    lambda raw_labels: raw_labels.map(mapping)
)


In [10]:
# Display the working copy after the vehicle columns were remapped.
clnd_vehicle_type


Unnamed: 0,collision_id,vehicle_1,vehicle_2,vehicle_3,vehicle_4,vehicle_5
0,4491064,Sedan,Sedan,,,
1,4491066,Sedan,Sport Utility Vehicle,,,
2,4491068,Sport Utility Vehicle,Sport Utility Vehicle,,,
3,4491069,Sport Utility Vehicle,,,,
4,4491076,E-Bike,,,,
...,...,...,...,...,...,...
200462,4715923,,,,,
200463,4717852,Sport Utility Vehicle,Sport Utility Vehicle,,,
200464,4718805,Sport Utility Vehicle,Sedan,Sedan,,
200465,4719291,Sedan,Sport Utility Vehicle,,,


In [11]:
# Write the cleaned entries back to the SQLite database.
# if_exists='replace' overwrites the existing `vehicle_type` table, and
# index=False keeps the DataFrame index out of the stored columns.
clnd_vehicle_type.to_sql(
    name='vehicle_type',
    con=connection,
    if_exists='replace',
    index=False,
)


200467