In [31]:
import pandas as pd
import numpy as np
import re
import string

# Major Crime Data
Source: https://data.torontopolice.on.ca/datasets/TorontoPS::major-crime-indicators-open-data/about

In [20]:
crime = pd.read_csv('data/major_crime.csv', index_col='OBJECTID')
crime.head()

Unnamed: 0_level_0,X,Y,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,REPORT_DOW,...,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-8809036.0,5431523.0,GO-20141260264,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,Wednesday,...,1430,100,Assault,Assault,143,West Rouge,131,Rouge (131),-79.132915,43.780413
2,-8814320.0,5435514.0,GO-20141260033,2014/01/01 05:00:00+00,2013/12/31 05:00:00+00,2014,January,1,1,Wednesday,...,1430,100,Assault,Assault,144,Morningside Heights,131,Rouge (131),-79.180387,43.806289
3,-8832825.0,5419631.0,GO-20141259834,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,Wednesday,...,1420,100,Assault With Weapon,Assault,55,Thorncliffe Park,55,Thorncliffe Park (55),-79.346615,43.703234
4,-8847292.0,5429042.0,GO-20141264084,2014/01/01 05:00:00+00,2013/12/31 05:00:00+00,2014,January,1,1,Wednesday,...,1420,110,Assault Bodily Harm,Assault,27,York University Heights,27,York University Heights (27),-79.476579,43.764317
5,6.32778e-09,5.664924e-09,GO-20141260577,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,Wednesday,...,1610,180,Robbery - Swarming,Robbery,NSA,NSA,NSA,NSA,0.0,0.0


In [21]:
crime.columns

Index(['X', 'Y', 'EVENT_UNIQUE_ID', 'REPORT_DATE', 'OCC_DATE', 'REPORT_YEAR',
       'REPORT_MONTH', 'REPORT_DAY', 'REPORT_DOY', 'REPORT_DOW', 'REPORT_HOUR',
       'OCC_YEAR', 'OCC_MONTH', 'OCC_DAY', 'OCC_DOY', 'OCC_DOW', 'OCC_HOUR',
       'DIVISION', 'LOCATION_TYPE', 'PREMISES_TYPE', 'UCR_CODE', 'UCR_EXT',
       'OFFENCE', 'MCI_CATEGORY', 'HOOD_158', 'NEIGHBOURHOOD_158', 'HOOD_140',
       'NEIGHBOURHOOD_140', 'LONG_WGS84', 'LAT_WGS84'],
      dtype='object')

In [22]:
# The OFFENCE column has very specific descriptions of the crimes, we are only interested in the larger category
# As for the report date columns, we are interested in when it happened - not when it was reported.
crime.drop(columns=['REPORT_DATE', 'REPORT_YEAR', 'REPORT_MONTH', 'REPORT_DAY', 'REPORT_DOY', 'REPORT_DOW', 'REPORT_HOUR', 'OFFENCE', 'HOOD_158', 'HOOD_140'], inplace=True)

In [23]:
# Create a new DataFrame with the unique neighbourhood158 values and their corresponding updated names
neighbourhood_mapping = crime.groupby('NEIGHBOURHOOD_158')['NEIGHBOURHOOD_140'].unique().reset_index()

new_neighbourhood_df = pd.DataFrame({
    'past_neighbourhood_158': neighbourhood_mapping['NEIGHBOURHOOD_158'],
    'neighbourhood': neighbourhood_mapping['NEIGHBOURHOOD_140'].apply(lambda x: x[0]) # assuming there's only one updated name
})

new_neighbourhood_df

Unnamed: 0,past_neighbourhood_158,neighbourhood
0,Agincourt North,Agincourt North (129)
1,Agincourt South-Malvern West,Agincourt South-Malvern West (128)
2,Alderwood,Alderwood (20)
3,Annex,Annex (95)
4,Avondale,Willowdale East (51)
...,...,...
154,Yonge-Doris,Willowdale East (51)
155,Yonge-Eglinton,Yonge-Eglinton (100)
156,Yonge-St.Clair,Yonge-St.Clair (97)
157,York University Heights,York University Heights (27)


In [24]:
# Remove the number and brackets at the end of the neighbourhood name
def clean_string(text):
    pattern = r'\s*\(\d+\)$'
    cleaned = re.sub(pattern, '', text)

    return cleaned

In [25]:
new_neighbourhood_df['neighbourhood'] = new_neighbourhood_df['neighbourhood'].apply(clean_string)

In [26]:
crime['NEIGHBOURHOOD_140'] = crime['NEIGHBOURHOOD_140'].apply(clean_string)

In [27]:
# Now we can drop the old neighbourhood_158 column
crime.drop(columns=['NEIGHBOURHOOD_158'], inplace=True)

In [28]:
crime.columns

Index(['X', 'Y', 'EVENT_UNIQUE_ID', 'OCC_DATE', 'OCC_YEAR', 'OCC_MONTH',
       'OCC_DAY', 'OCC_DOY', 'OCC_DOW', 'OCC_HOUR', 'DIVISION',
       'LOCATION_TYPE', 'PREMISES_TYPE', 'UCR_CODE', 'UCR_EXT', 'MCI_CATEGORY',
       'NEIGHBOURHOOD_140', 'LONG_WGS84', 'LAT_WGS84'],
      dtype='object')

In [29]:
# Converting the occurence data to a datetime variable for easy filtering
crime['OCC_DATE'] = pd.to_datetime(crime['OCC_DATE'], format='%Y/%m/%d %H:%M:%S+00')

In [30]:
# Filtering out crimes that happened before 01/01/2013 and after 31/12/2023
start_date = pd.to_datetime('2013-01-01')
end_date = pd.to_datetime('2023-12-31')

crime = crime[(crime['OCC_DATE'] >= start_date) & (crime['OCC_DATE'] <= end_date)]

In [31]:
crime = crime.sort_values(by='OCC_DATE')

In [33]:
# Create separate data frames for each year
crime_by_date = crime.groupby(crime['OCC_DATE'].dt.year)
yearly_crime_dfs = {year: group for year, group in crime_by_date}
# access each year by -> yearly_crime_dfs[x] where x is 2013, 2014, ..., 2023

In [34]:
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [18]:
monthly_summaries = {}
crime_types = ['Assault', 'Break and Enter', 'Theft Over', 'Robbery', 'Auto Theft']

# Iterate through each yearly DataFrame
for year, df_yearly in yearly_crime_dfs.items():
    df_yearly['MONTH'] = df_yearly['OCC_DATE'].dt.month
    
    # Group the yearly DataFrame by month and by crime category, then count the number of crimes
    monthly_summary_yearly = df_yearly.groupby(['MONTH', 'MCI_CATEGORY']).size().unstack(fill_value=0).reset_index()
    
    # Add a 'YEAR' column to the monthly summary DataFrame
    monthly_summary_yearly['YEAR'] = year
    monthly_summary_yearly = monthly_summary_yearly[['YEAR', 'MONTH'] + crime_types]
    
    monthly_summaries[year] = monthly_summary_yearly.to_dict(orient='records')

In [106]:
# crimes by neighbourhood
yearly_neighborhood_summaries = {}

for year, df_yearly in yearly_crime_dfs.items():
    neighborhood_summary = df_yearly['NEIGHBOURHOOD_140'].value_counts().reset_index()
    neighborhood_summary.columns = ['NEIGHBOURHOOD_140', 'num_crimes']
    
    yearly_neighborhood_summaries[year] = neighborhood_summary

In [108]:
# save as .csv
for year in years:
    yearly_crime_dfs[year].to_csv('data/crime/by_year/all_crimes' + str(year) + '.csv')
    pd.DataFrame(monthly_summaries[year]).to_csv('data/crime/by_year_month/crime_summary' + str(year) + '.csv')
    yearly_neighborhood_summaries[year].to_csv('data/crime/by_neighbourhood/' + str(year) + 'crimes_per_neighbourhood.csv')

In [35]:
crime.to_csv('data/crime/major_crimes.csv')
new_neighbourhood_df.to_csv('data/neighbourhoods.csv')

# Neighbourhood Profile Data
Source: https://open.toronto.ca/dataset/neighbourhood-profiles/

In [41]:
neighbourhoods = pd.read_csv('data/neighbourhoods_140.csv', index_col=0)
neighbourhoods.head(2)

Unnamed: 0_level_0,AREA_ID,AREA_ATTR_ID,PARENT_AREA_ID,AREA_SHORT_CODE,AREA_LONG_CODE,AREA_NAME,AREA_DESC,CLASSIFICATION,CLASSIFICATION_CODE,OBJECTID,geometry
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2501286,26022481,,30,30,Brookhaven-Amesbury (30),Brookhaven-Amesbury (30),Not an NIA or Emerging Neighbourhood,,17818369,"{'type': 'MultiPolygon', 'coordinates': [[[[-7..."
2,2501285,26022480,,106,106,Humewood-Cedarvale (106),Humewood-Cedarvale (106),Not an NIA or Emerging Neighbourhood,,17818385,"{'type': 'MultiPolygon', 'coordinates': [[[[-7..."


In [42]:
neighbourhoods.columns

Index(['AREA_ID', 'AREA_ATTR_ID', 'PARENT_AREA_ID', 'AREA_SHORT_CODE',
       'AREA_LONG_CODE', 'AREA_NAME', 'AREA_DESC', 'CLASSIFICATION',
       'CLASSIFICATION_CODE', 'OBJECTID', 'geometry'],
      dtype='object')

In [43]:
neighbourhoods.drop(columns=['PARENT_AREA_ID', 'AREA_SHORT_CODE', 'AREA_LONG_CODE', 'AREA_DESC', 'CLASSIFICATION_CODE'], inplace=True)

In [45]:
neighbourhoods['AREA_NAME'] = neighbourhoods['AREA_NAME'].apply(clean_string)

In [47]:
neighbourhoods.to_csv('data/neighbourhoods_140.csv')

# Neighbourhood Profile 2016

In [41]:
census2016 = pd.read_csv('data/raw/neighbourhood-profiles-2016.csv', index_col=0)

In [35]:
census2016.head()

Unnamed: 0_level_0,Category,Topic,Data Source,Characteristic,City of Toronto,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,Banbury-Don Mills,...,Willowdale West,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Neighbourhood Information,Neighbourhood Information,City of Toronto,Neighbourhood Number,,129,128,20,95,42,...,37,7,137,64,60,94,100,97,27,31
2,Neighbourhood Information,Neighbourhood Information,City of Toronto,TSNS2020 Designation,,No Designation,No Designation,No Designation,No Designation,No Designation,...,No Designation,No Designation,NIA,No Designation,No Designation,No Designation,No Designation,No Designation,NIA,Emerging Neighbourhood
3,Population,Population and dwellings,Census Profile 98-316-X2016001,"Population, 2016",2731571,29113,23757,12054,30526,27695,...,16936,22156,53485,12541,7865,14349,11817,12528,27593,14804
4,Population,Population and dwellings,Census Profile 98-316-X2016001,"Population, 2011",2615060,30279,21988,11904,29177,26918,...,15004,21343,53350,11703,7826,13986,10578,11652,27713,14687
5,Population,Population and dwellings,Census Profile 98-316-X2016001,Population Change 2011-2016,4.50%,-3.90%,8.00%,1.30%,4.60%,2.90%,...,12.90%,3.80%,0.30%,7.20%,0.50%,2.60%,11.70%,7.50%,-0.40%,0.80%


In [46]:
# drop the following columns as they do not exist in the census 2021 file
census2016.drop(columns=['Category', 'Topic', 'Data Source'], inplace=True)

In [47]:
# Now transpose the dataframe so the columns are rows and vice versa
census2016.set_index('Characteristic', inplace=True)
census2016 = census2016.transpose()
census2016.head()

Characteristic,Neighbourhood Number,TSNS2020 Designation,"Population, 2016","Population, 2011",Population Change 2011-2016,Total private dwellings,Private dwellings occupied by usual residents,Population density per square kilometre,Land area in square kilometres,Children (0-14 years),...,Intraprovincial migrants,Interprovincial migrants,External migrants,Total - Mobility status 5 years ago - 25% sample data,Non-movers,Movers,Non-migrants,Migrants,Interprovincial migrants.1,External migrants.1
City of Toronto,,,2731571,2615060,4.50%,1179057,1112929,4334,630.2,398135,...,49990,15780,59945,2556120,1516110,1040015,639060,400950,42985,216835
Agincourt North,129.0,No Designation,29113,30279,-3.90%,9371,9120,3929,7.41,3840,...,275,75,605,27490,18865,8610,5445,3170,135,2280
Agincourt South-Malvern West,128.0,No Designation,23757,21988,8.00%,8535,8136,3034,7.83,3075,...,320,90,490,22325,13565,8775,5610,3145,220,2170
Alderwood,20.0,No Designation,12054,11904,1.30%,4732,4616,2435,4.95,1760,...,220,40,70,11370,8235,3130,2200,925,70,245
Annex,95.0,No Designation,30526,29177,4.60%,18109,15934,10863,2.81,2360,...,900,385,835,27715,12980,14735,8340,6390,1310,2460


In [49]:
income = census2016.iloc[:,1037:1054]

In [54]:
education = census2016.iloc[:, 1702:1714]

In [62]:
# There are lots of trailing white spaces which will affect the selection of columns so that is being taken out
census2016.columns = [col.strip() for col in census2016.columns]
income = [col.strip() for col in income.columns]
education = [col.strip() for col in education.columns]

In [55]:
pop_by_age_group = [
    'Male: 0 to 04 years',
    'Male: 05 to 09 years',
    'Male: 10 to 14 years',
    'Male: 15 to 19 years',
    'Male: 20 to 24 years',
    'Male: 25 to 29 years',
    'Male: 30 to 34 years',
    'Male: 35 to 39 years',
    'Male: 40 to 44 years',
    'Male: 45 to 49 years',
    'Male: 50 to 54 years',
    'Male: 55 to 59 years',
    'Male: 60 to 64 years',
    'Male: 65 to 69 years',
    'Male: 70 to 74 years',
    'Male: 75 to 79 years',
    'Female: 10 to 14 years',
    'Male: 80 to 84 years',
    'Male: 85 to 89 years',
    'Male: 90 to 94 years',
    'Male: 95 to 99 years',
    'Male: 100 years and over',
    'Female: 0 to 04 years',
    'Female: 05 to 09 years',
    'Female: 15 to 19 years',
    'Female: 20 to 24 years',
    'Female: 25 to 29 years',
    'Female: 30 to 34 years',
    'Female: 35 to 39 years',
    'Female: 40 to 44 years',
    'Female: 45 to 49 years',
    'Female: 50 to 54 years',
    'Female: 55 to 59 years',
    'Female: 60 to 64 years',
    'Female: 65 to 69 years',
    'Female: 70 to 74 years',
    'Female: 75 to 79 years',
    'Female: 80 to 84 years',
    'Female: 85 to 89 years',
    'Female: 90 to 94 years',
    'Female: 95 to 99 years',
    'Female: 100 years and over'
]

characteristic_items = ['Single-detached house',
    'Apartment in a building that has five or more storeys',
    'Other attached dwelling',
    'Average household size',
    'Married or living common law',
    'Not married and not living common law',
    'Lone-parent census families in private households',
    'One-person households',
    'Two-or-more person non-census-family households',
    'Prevalence of low income based on the Low-income measure, after tax (LIM-AT) (%)',
    'Non-immigrants',
    'Immigrants',
    'Owner',
    'Renter',
    'One person or fewer per room',
    'More than 1 person per room',
    'Suitable',
    'Not suitable',
    'Total population aged 15 years and over by work activity during the reference year - 25% sample data',
    'Did not work',
    'Worked',
    'Car, truck, van - as a driver',
    'Car, truck, van - as a passenger',
    'Public transit',
    'Walked',
    'Bicycle',
    'Other method'
    ]

keep = ['Characteristic'] + characteristic_items

# this is to aggregate all ages
population = census2016.loc[:, census2016.columns.isin(pop_by_age_group)]
census2016 = census2016.loc[:, census2016.columns.isin(keep)]
census2016.head()

Unnamed: 0,Single-detached house,Apartment in a building that has five or more storeys,Other attached dwelling,Average household size,Married or living common law,Not married and not living common law,Lone-parent census families in private households,One-person households,Two-or-more person non-census-family households,"Prevalence of low income based on the Low-income measure, after tax (LIM-AT) (%)",...,Not suitable,Total population aged 15 years and over by work activity during the reference year - 25% sample data,Did not work,Worked,"Car, truck, van - as a driver","Car, truck, van - as a passenger",Public transit,Walked,Bicycle,Other method
City of Toronto,269675,493275,349880,2.42,1181360,1152075,152595,359960,68010,20.2,...,134825,2294790,791795,1502995,575255,57170,463000,107665,34355,13610
Agincourt North,3345,2120,3645,3.16,14200,11075,1650,1350,290,21.3,...,1315,25000,11060,13940,7155,930,3350,265,70,45
Agincourt South-Malvern West,2790,3145,2200,2.88,11205,9475,1360,1615,445,22.6,...,970,20400,8270,12140,6135,665,2985,280,35,65
Alderwood,2840,255,1525,2.6,5735,4550,670,1105,185,8.2,...,325,10265,3245,7015,4090,355,1285,195,65,65
Annex,645,8165,7125,1.8,12275,15890,895,7885,1430,18.3,...,800,26305,6925,19365,3290,290,6200,3200,1675,225


In [56]:
# Rename the 0 to 04 year age groups for simplicity in code
population.rename(columns={'Male: 0 to 04 years': 'Male: 00 to 04 years', 'Female: 0 to 04 years': 'Female: 00 to 04 years'}, inplace=True)

In [57]:
for age_group in range(0, 101, 5):
    # Define column names for male and female
    male_col_name = f"Male: {age_group:02d} to {age_group + 4:02d} years"
    female_col_name = f"Female: {age_group:02d} to {age_group + 4:02d} years"
    
    if male_col_name in population.columns and female_col_name in population.columns:
        population[f"Population for {age_group:02d} to {age_group + 4:02d} years"] = population[male_col_name] + population[female_col_name]
        
        population.drop([male_col_name, female_col_name], axis=1, inplace=True)

In [58]:
population['Population for 100 years and over'] = population['Female: 100 years and over'] + population['Male: 100 years and over']
population.drop(columns=['Female: 100 years and over', 'Male: 100 years and over'], inplace=True)

In [59]:
population.head()

Unnamed: 0,Population for 00 to 04 years,Population for 05 to 09 years,Population for 10 to 14 years,Population for 15 to 19 years,Population for 20 to 24 years,Population for 25 to 29 years,Population for 30 to 34 years,Population for 35 to 39 years,Population for 40 to 44 years,Population for 45 to 49 years,...,Population for 55 to 59 years,Population for 60 to 64 years,Population for 65 to 69 years,Population for 70 to 74 years,Population for 75 to 79 years,Population for 80 to 84 years,Population for 85 to 89 years,Population for 90 to 94 years,Population for 95 to 99 years,Population for 100 years and over
City of Toronto,6989566105,6935065680,6494562165,7424071285,9741597330,113905119040,108895115675,94070102240,8653595860,90860100065,...,8814594660,7227081600,6036070180,4232051285,3273043430,2567034965,1566525135,618513500,12803475,650125
Agincourt North,660620,695620,660595,840865,1015975,10151005,835935,680775,760910,8901060,...,10601165,9251070,925985,590690,490575,380485,210350,100160,2560,100
Agincourt South-Malvern West,575530,540495,460475,780690,1000895,1045975,820835,625715,610745,760885,...,850915,710795,630690,425450,350405,240350,155205,45100,2020,0
Alderwood,360320,270325,225255,285290,355310,355350,410430,455450,420445,440430,...,540485,390400,300325,205210,155180,105210,65130,3570,105,50
Annex,445480,365400,325355,465550,12151520,20802265,16101675,10551040,835915,850855,...,855915,755940,780950,640700,485565,335425,230345,80260,2590,255


In [60]:
# Merge together the two dataframes
census2016 = pd.concat([census2016, population], axis=1)
census2016 = pd.concat([census2016, income], axis=1)
census2016 = pd.concat([census2016, education], axis=1)

In [63]:
for col in census2016.columns:
    print(col)

Single-detached house
Apartment in a building that has five or more storeys
Other attached dwelling
Average household size
Married or living common law
Not married and not living common law
Lone-parent census families in private households
One-person households
Two-or-more person non-census-family households
Prevalence of low income based on the Low-income measure, after tax (LIM-AT) (%)
Non-immigrants
Immigrants
Owner
Renter
One person or fewer per room
More than 1 person per room
Suitable
Not suitable
Total population aged 15 years and over by work activity during the reference year - 25% sample data
Did not work
Worked
Car, truck, van - as a driver
Car, truck, van - as a passenger
Public transit
Walked
Bicycle
Other method
Population for 00 to 04 years
Population for 05 to 09 years
Population for 10 to 14 years
Population for 15 to 19 years
Population for 20 to 24 years
Population for 25 to 29 years
Population for 30 to 34 years
Population for 35 to 39 years
Population for 40 to 44 

In [64]:
census2016.index = census2016.index.str.lower()
census2016.index = census2016.index.map(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [66]:
for col in census2016.columns:
    print(col)

Single-detached house
Apartment in a building that has five or more storeys
Other attached dwelling
Average household size
Married or living common law
Not married and not living common law
Lone-parent census families in private households
One-person households
Two-or-more person non-census-family households
Prevalence of low income based on the Low-income measure, after tax (LIM-AT) (%)
Non-immigrants
Immigrants
Owner
Renter
One person or fewer per room
More than 1 person per room
Suitable
Not suitable
Total population aged 15 years and over by work activity during the reference year - 25% sample data
Did not work
Worked
Car, truck, van - as a driver
Car, truck, van - as a passenger
Public transit
Walked
Bicycle
Other method
Population for 00 to 04 years
Population for 05 to 09 years
Population for 10 to 14 years
Population for 15 to 19 years
Population for 20 to 24 years
Population for 25 to 29 years
Population for 30 to 34 years
Population for 35 to 39 years
Population for 40 to 44 

In [67]:
# shorten column names

census2016.rename(columns={'Single-detached house': 'single_detached_house', 
                           'Apartment in a building that has five or more storeys': 'apart_5_plus',
                           'Other attached dwelling': 'other_dwelling',
                           'Average household size' : 'avg_household_size',
                           'Married or living common law' : ' married',
                           'Not married and not living common law' : 'not_married',
                           'Lone-parent census families in private households' : 'single_parents',
                           'One-person households' : 'one_person_household',
                           'Two-or-more person non-census-family households' : 'two_plus_person_household',
                           'Prevalence of low income based on the Low-income measure, after tax (LIM-AT) (%)' : 'low_income_percent',
                           'Non-immigrants' : 'non_immigrants',
                           'One person or fewer per room' : 'own_room',
                           'More than 1 person per room' : 'sharing_room',
                           'Suitable' : 'suitable_housing',
                           'Not suitable' : 'not_suitable_housing',
                           'Total population aged 15 years and over by work activity during the reference year - 25% sample data' : 'total_work_activity',
                           'Did not work' : 'unemployed',
                           'Worked' : 'employed',
                            'Car, truck, van - as a driver' : 'commute_drives',
                            'Car, truck, van - as a passenger' : 'commute_passenger',
                            'Public transit' : 'commute_public_transport',
                            'Walked' : 'commute_walk',
                            'Bicycle' : 'commute_cycle',
                            'Other method' : 'commute_other',
                            'Total - Household total income groups in 2015 for private households - 100% data' : 'household_income',
                            'Under $5,000' : 'income_under_5000',
                            '$5,000 to $9,999' : 'income_between_5000_9000',
                            '$10,000 to $14,999' : 'income_between_10000_14999',
                            '$20,000 to $24,999' : 'income_between_20000_24999',
                            '$25,000 to $29,999' : 'income_between_25000_29999',
                            '$30,000 to $34,999' : 'income_between_30000_34999',
                            '$35,000 to $39,999' : 'income_between_35000_39999',
                            '$40,000 to $44,999' : 'income_between_40000_44999',
                            '$45,000 to $49,999' : 'income_between_45000_49999',
                            '$50,000 to $54,999' : 'income_between_50000_54999',
                            '$55,000 to $59,999' : 'income_between_55000_59999',
                            '$60,000 to $64,999' : 'income_between_60000_64999',
                            '$65,000 to $69,999' : 'income_between_65000_69999',
                            '$70,000 to $74,999' : 'income_between_70000_74999',
                            '$75,000 to $79,999' : 'income_between_75000_79999',
                            '$80,000 to $84,999' : 'income_between_80000_84999',
                            '$85,000 to $89,999' : 'income_between_85000_89999',
                            '$90,000 to $94,999' : 'income_between_90000_94999',
                            '$100,000 and over' : 'income_above_100k',
                            'Total - Highest certificate, diploma or degree for the population aged 15 years and over in private households - 25% sample data' : 'total_highest_education',
                            'No certificate, diploma or degree' : 'no_certificate_diploma_degree',
                            'Secondary (high) school diploma or equivalency certificate' : 'highschool_diploma',
                            'Trades certificate or diploma other than Certificate of Apprenticeship or Certificate of Qualification' : 'trades_certificate',
                            'Certificate of Apprenticeship or Certificate of Qualification' : 'apprenticeship_certificate',
                            'College, CEGEP or other non-university certificate or diploma' : 'non-university_certificate',
                            'University certificate or diploma below bachelor level' : 'university_certificate',
                            'University certificate, diploma or degree at bachelor level or above' : 'bachelors_or_above',
                            'Bachelor\'s degree' : 'bachelors',
                            'University certificate or diploma above bachelor level' : 'above_bachelors',
                            'Degree in medicine, dentistry, veterinary medicine or optometry' : 'degree_medicine_vet_etc',
                            'Earned doctorate' : 'doctorate'
                           }, inplace=True)

In [68]:
census2016.columns = census2016.columns.str.lower()
census2016.columns = census2016.columns.map(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [69]:
census2016.to_csv('data/census2016.csv')