### Header

In [35]:
import pandas as pd
import numpy as np
from thefuzz import process, fuzz    

Set up the variables that will be passed as parameters to the `read_excel` function.

In [36]:

# Relative paths to the raw Excel workbooks (Valenzuela and Zamboanga data sets).
valenzuela_filename = "./raw_data/Valenzuela_Data.xlsx"
zamboanga_filename = "./raw_data/Zamboanga_Data.xlsx"

# Column labels applied (via `names=`) to the going-to-school selection,
# listed in the order the selected spreadsheet columns appear.
to_col_names = [
    'code', 'sex', 'age', 'grade', 'class_shift', 'respondent',
    'modes_of_transport', 'main_mode',
    '-cycle_helmet', '-cycle_number',
    'front_seat', 'seatbelt',
    'travel_time', 'travel_time>30',
    'companion_bool', 'companion',
    'baranggay',
    'incident_bool', 'incident_location', 'incident_specific_location',
    'incident_vehicle_bool', 'incident_vehicle',
    'close_call_bool', 'close_call_location', 'close_call_specific_location',
    'close_call_vehicle_bool', 'close_call_vehicle',
]

# Same sequence as `to_col_names`, with two extra after-school-location
# labels slotted in immediately before 'baranggay'.
from_col_names = (
    to_col_names[:to_col_names.index('baranggay')]
    + ['location_after_school_bool', 'location_after_school']
    + to_col_names[to_col_names.index('baranggay'):]
)

# Per-column dtypes handed to `read_excel(dtype=...)`.
# Yes/No questions use the nullable 'boolean' dtype; fixed-choice answers
# use 'category'. 'seatbelt' and 'close_call_vehicle_bool' are not listed here.
dtype_cols = {
    **dict.fromkeys(
        [
            '-cycle_helmet',
            'front_seat',
            'companion_bool',
            'incident_bool',
            'close_call_bool',
            'incident_vehicle_bool',
        ],
        'boolean',
    ),
    **dict.fromkeys(
        [
            'main_mode',
            '-cycle_number',
            'travel_time',
            'companion',
            'incident_location',
            'incident_vehicle',
            'close_call_location',
            'close_call_vehicle',
        ],
        'category',
    ),
}




# Excel column-letter ranges for `read_excel(usecols=...)`.
# Both selections share C:E and G:I and differ only in the final range:
#   to_school_cols   -> C:E, G:I, K:AE  (3 + 3 + 21 = 27 columns, same length as to_col_names)
#   from_school_cols -> C:E, G:I, AF:BB (3 + 3 + 23 = 29 columns, same length as from_col_names)
to_school_cols = "C:E,G:I,K:AE"
from_school_cols = "C:E,G:I,AF:BB"


### Import the Valenzuela going-to-school dataset

The boolean columns do not include 'seatbelt', since I want to observe the distinction between seatbelt and child-seat usage.

In [37]:
# Load only the going-to-school columns of the Valenzuela workbook, relabel
# them, and let pandas parse 'Yes'/'No' cells as booleans where a boolean
# dtype was requested.
valenzuela_to_school = pd.read_excel(
    io=valenzuela_filename,
    names=to_col_names,
    usecols=to_school_cols,
    dtype=dtype_cols,
    false_values=['No'],
    true_values=['Yes'],
)


### Cleaning the data

The columns that need to be cleaned are 'companion', 'baranggay', and 'close_call_vehicle_bool'.

#### Cleaning 'close_call_vehicle_bool'

In [38]:
# Convert the free-text answers of 'close_call_vehicle_bool' to the nullable
# 'boolean' dtype. "I cannot remember anymore" is counted as 'No' (False).
#
# A single whole-value `.replace` is used instead of the `.str` accessor so the
# cell stays re-runnable: `.str` raises AttributeError once the column has
# already been cast to 'boolean', whereas `.replace` is then simply a no-op.
d = {'Yes': True, 'No': False}
close_call_vehicle_answers = {
    **d,
    'I cannot remember anymore (Hindi na maalala)': False,
}
valenzuela_to_school['close_call_vehicle_bool'] = (
    valenzuela_to_school['close_call_vehicle_bool']
    .replace(close_call_vehicle_answers)
    .astype('boolean')
)


#### Cleaning 'baranggay'

In [46]:
# Canonical (lower-case) labels that raw 'baranggay' answers are fuzzy-matched
# against. Each label appears exactly once; a repeated entry only adds a
# redundant candidate to the matcher.
# NOTE(review): 'viente reales' may be a misspelling of "Veinte Reales" -
# confirm before changing, since cleaned values will carry this exact label.
baranggay_list = ['lingunan',
                  'lawa',
                  'lawang bato',
                  'marulas',
                  'viente reales',
                  'gen. t. de leon',
                  'ugong',
                  'meycauayan',
                  'karuhatan',
                  'pinagbayan'
                ]

def extract(string, choices, threshold=75):
    """Map a free-text value onto its closest entry in ``choices``.

    The value is lower-cased and fuzzy-matched against ``choices`` with
    ``thefuzz.process.extract``. If the best candidate scores at least
    ``threshold`` it is returned; otherwise the lower-cased input is returned
    so unmatched answers stay visible for manual review.

    Parameters
    ----------
    string : object
        Raw cell value. Non-string values (e.g. ``NaN``/``None`` from empty
        cells) are returned unchanged.
    choices : list of str
        Canonical labels to match against.
    threshold : int, default 75
        Minimum score (0-100) a candidate needs to be accepted.

    Returns
    -------
    object
        The matched label, the lower-cased input, or the untouched
        non-string input.
    """
    # Empty cells arrive as NaN/None, which have no `.lower()`; pass them
    # through instead of raising AttributeError inside `Series.apply`.
    if not isinstance(string, str):
        return string

    string = string.lower()
    # Candidates come back best-first, so only the top one can clear the cut-off.
    best_matches = process.extract(string, choices, limit=1)
    if best_matches and best_matches[0][1] >= threshold:
        return best_matches[0][0]
    return string

# Preview the distinct labels produced by the fuzzy clean-up.
# The result is only displayed; nothing is written back to the frame here.
valenzuela_to_school['baranggay'].map(
    lambda raw_name: extract(raw_name, baranggay_list)
).unique()


array(['bilog balangkas', 'caloocan', 'punturin', 'ugong', 'rincon',
       'gen. t. de leon', 'marulas', 'lingunan', 'lawa', 'canumay west',
       'parada', 'viente reales', 'coloong 1', 'bbb marula',
       'brgy 176 bagong silang', 'karuhatan', 'tampoy 2', 'meycauayan',
       'p.gregorio st.', 'lazaro canumay', 'assumption ville',
       'lawang bato', 'baluyot park quezon city',
       'valinzea malinta dolong  tangki', 'assumption',
       '471 dulalia street valenzuela city', '688 t santaigo',
       'ciudad grande ph2', 'pacheco village gml', 'pinagbayan',
       'valenzuela ', 'barangay sto.niño, biñan laguna',
       'assumption ville, dulalia', 'canumay'], dtype=object)