### Step 3: Variable preparation

__Step goal__: Select and scale variables for clustering.

__Step overview__:
1. Load the data;
2. Select and scale variables;
3. Save the result.

In [1]:
# Enable IPython's autoreload extension so that edits made to the imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.census.canada.prepare_ca_data import scale_ca_variables, select_ca_variables
from src.data.census.usa.prepare_us_data import scale_us_variables, select_us_variables
import os

## 1. Canada

#### 1.1 Load the data

In [8]:
%%time
# Run configuration: the city to process and whether the extended variable
# set is requested from the selection step further down.
city = 'Montréal'
extended_variables = True

# Read the processed census table of the chosen city into a DataFrame.
census_csv = f'../../data/processed/census/canada/{city}.csv'
df = pd.read_csv(census_csv)

Wall time: 1.81 s


#### 1.2 Select and scale the variables of interest

In [9]:
# Scale first, select second: per the original note, the scaling function
# assumes that the dataset is complete, so anything can be selected only
# afterwards. The scaled frame gets its own name so that `df` keeps the data
# as loaded and re-running this cell does not feed already-scaled values back
# into the scaler (assumes the scaler returns a new frame rather than
# modifying its input -- TODO confirm).
df_scaled = scale_ca_variables(df)

# Select the variables of interest; with return_names=True the call also
# returns their names, which are saved next to the data in the next step.
variable_names, data = select_ca_variables(
    city,
    df_scaled,
    return_names=True,
    print_selected=True,
    include_occupation=False,
    include_commute=False,
    extended_variables=extended_variables,
)

# Coerce every selected column to a numeric dtype. Converting column by column
# needs one `pd.to_numeric` call per variable instead of one per row, and each
# column keeps its own inferred dtype.
data = data.apply(pd.to_numeric)
print(f'Number of variables selected : {data.shape[1]}')

['Population density per square kilometre',
 'Average age of the population',
 'Canadian citizens',
 'Not Canadian citizens',
 'Non-immigrants',
 'Immigrants',
 'First generation',
 'Second generation',
 'Third generation or more',
 'Aboriginal identity',
 'Non-Aboriginal identity',
 'Total visible minority population',
 'Number of persons in private households',
 'Married',
 'Not married and not living common law',
 'Total couple families',
 'Total lone-parent families by sex of parent',
 'English only',
 'French only',
 'English and French',
 'Neither English nor French',
 'Average total income in 2015 among recipients ($)',
 'Spending less than 30% of income on shelter costs',
 'Spending 30% or more of income on shelter costs',
 'Owner',
 'Renter',
 'Average value of dwellings ($)',
 'Average monthly shelter costs for rented dwellings ($)',
 'Average number of rooms per dwelling',
 'No certificate, diploma or degree',
 'Secondary (high) school diploma or equivalency certificate',
 '

#### 1.3 Save selected variables

In [10]:
# Save the variables
directory = "../../variables/"
if not os.path.exists(directory):
    # Create the folder first and report afterwards, so the message is only
    # printed once the directory really exists. exist_ok guards against the
    # folder appearing between the check and the call.
    os.makedirs(directory, exist_ok=True)
    print(f'Successfully created new directory {directory}')

# The reduced variable set is written with a "_short" suffix so that it does
# not overwrite the files of the extended set.
suffix = '' if extended_variables else '_short'
base_path = os.path.join(directory, f'{city.lower()}{suffix}')

# One CSV with the selected values, one with the matching variable names.
data.to_csv(f'{base_path}.csv', index=False)
pd.DataFrame(variable_names, columns=['variable_name']).to_csv(
    f'{base_path}_variable_names.csv', index=False
)

## 2. United States

#### 2.1 Load the data

In [17]:
%%time
# Run configuration: the city to process and whether the extended variable
# set is requested from the selection step further down.
city = 'san_francisco'
extended_variables = True

# Read the processed census table of the chosen city into a DataFrame.
census_csv = f'../../data/processed/census/united_states/{city}.csv'
df = pd.read_csv(census_csv)

Wall time: 23.9 ms


#### 2.2 Select and scale the variables of interest

In [18]:
# Scale first, select second: per the original note, the scaling function
# assumes that the dataset is complete, so anything can be selected only
# afterwards. The scaled frame gets its own name so that `df` keeps the data
# as loaded and re-running this cell does not feed already-scaled values back
# into the scaler (assumes the scaler returns a new frame rather than
# modifying its input -- TODO confirm).
df_scaled = scale_us_variables(df)

# Select the variables of interest from the scaled frame.
data = select_us_variables(
    df_scaled,
    print_selected=True,
    extended_variables=extended_variables,
)

# Coerce every selected column to a numeric dtype. Converting column by column
# needs one `pd.to_numeric` call per variable instead of one per row, and each
# column keeps its own inferred dtype.
data = data.apply(pd.to_numeric)
print(f'Number of variables selected : {data.shape[1]}')

Index(['population', 'age', 'white', 'minority', 'tot_households',
       'average_household_size', 'married_households', 'nonfamily_households',
       'median_household_income', 'owner', 'renter', 'median_gross_rent',
       'tot_gross_rent_as_percent_of_income', 'less_than_30_of_income',
       'more_than_30_of_income', 'median_gross_rent_as_percent_of_income',
       'median_n_rooms', 'less_than_high_school', 'high_school',
       'bachelors_degree', 'masters_degree', 'professional_school_degree',
       'doctorate_degree', 'employed', 'unemployed'],
      dtype='object')
Number of variables selected : 25


#### 2.3 Save selected variables

In [19]:
# Save the variables
directory = "../../variables/"
if not os.path.exists(directory):
    # Create the folder first and report afterwards, so the message is only
    # printed once the directory really exists. exist_ok guards against the
    # folder appearing between the check and the call.
    os.makedirs(directory, exist_ok=True)
    print(f'Successfully created new directory {directory}')

# The reduced variable set is written with a "_short" suffix so that it does
# not overwrite the file of the extended set.
suffix = '' if extended_variables else '_short'
data.to_csv(os.path.join(directory, f'{city.lower()}{suffix}.csv'), index=False)