In [2]:
import warnings

import pandas as pd
import matplotlib.pyplot as plt
from openpyxl import load_workbook
from sklearn import preprocessing

from data_preprocessing import read_table, merge_str

# Suppress all warnings for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to every warning).
warnings.filterwarnings('ignore')

# Binary Prediction of Gentrified Tracts in Melbourne

This notebook seeks to assemble the set of features required to inform a machine-learning model that classifies tracts, or SA1s (2016 definition), as gentrified or non-gentrified. The training labels were obtained following Freeman's 5-step definition of gentrified areas.

In [3]:
def normalize(df, ft_list):
    """Add a min-max scaled version of each listed column to ``df``.

    For every name in ``ft_list`` a fresh ``MinMaxScaler`` is fitted on that
    single column, and the scaled values are stored in a new column named
    ``<name>_normalized``. Each column is fitted on its own, so the scaling
    of one feature never depends on another.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the columns to scale. It is modified in place.
    ft_list : iterable of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``*_normalized`` columns added.
    """
    for column in ft_list:
        scaler = preprocessing.MinMaxScaler()
        # Double brackets keep the selection two-dimensional, which is the
        # shape fit_transform is given here.
        scaled_values = scaler.fit_transform(df[[column]])
        df[column + '_normalized'] = scaled_values

    return df

# 2006 - 2016

## Process Public Transport Features

### Tram 

In [8]:
# Read the table of closest tram/train stops per tract (2006 file)
closest_stops = read_table('Data/PT_Network_Features/tracts_closest_tram_train_stops_2006.csv')

# Overwrite each distance column with its inverse, which is what the decay
# function below multiplies by.
# NOTE(review): a distance of exactly 0 would produce inf here - confirm the
# input has none.
distance_cols = ['dist_closest_stop_Tram', 'dist_closest_stop_Train']
for distance_col in distance_cols:
    closest_stops[distance_col] = 1 / closest_stops[distance_col]

# Min-max scale the inverse distances; adds the '<column>_normalized' columns
closest_stops = normalize(closest_stops, distance_cols)

# Network properties of the tram stops
# NOTE(review): the file name says "all_years" and no year filter is applied
# before the merge - confirm STOP_ID is unique in this table.
tram_properties = read_table('Data/PT_Network_Features/tram_properties_all_years.xlsx')

# Attach to every tract the network properties of its closest tram stop
tract_tram_cols = ['SA1_MAINCO',
                   'SA1_7DIG16',
                   'closest_stop_Tram',
                   'dist_closest_stop_Tram_normalized']
tram_stop_cols = ['STOP_ID',
                  'Degree',
                  'Betweenness',
                  'Closeness']

tram_features = merge_str(closest_stops[tract_tram_cols],
                          tram_properties[tram_stop_cols],
                          'closest_stop_Tram',
                          'STOP_ID',
                          how='left')

# Decay function: weight each network property of the closest stop by the
# normalized inverse distance between the tract and that stop
tram_weight = tram_features['dist_closest_stop_Tram_normalized']
for feature in ['Degree', 'Betweenness', 'Closeness']:
    tram_features[feature] = pd.to_numeric(tram_features[feature])
    tram_features[feature + '_tram_decayed'] = tram_weight * tram_features[feature]

# Keep only the tract identifiers and the decayed tram features
tram_output_cols = ['SA1_MAINCO',
                    'SA1_7DIG16',
                    'Degree_tram_decayed',
                    'Betweenness_tram_decayed',
                    'Closeness_tram_decayed']
tram_features = tram_features[tram_output_cols]

### Train

In [9]:
# Network properties of the train stops (2006 file)
train_properties = read_table('Data/PT_Network_Features/train_properties_2006.xlsx')

# Attach to every tract the network properties of its closest train stop
tract_train_cols = ['SA1_MAINCO',
                    'SA1_7DIG16',
                    'closest_stop_Train',
                    'dist_closest_stop_Train_normalized']
train_stop_cols = ['STOP_ID',
                   'Degree',
                   'Betweenness',
                   'Closeness']

train_features = merge_str(closest_stops[tract_train_cols],
                           train_properties[train_stop_cols],
                           'closest_stop_Train',
                           'STOP_ID',
                           how='left')

# Decay function: weight each network property of the closest stop by the
# normalized inverse distance between the tract and that stop
train_weight = train_features['dist_closest_stop_Train_normalized']
for feature in ['Degree', 'Betweenness', 'Closeness']:
    train_features[feature] = pd.to_numeric(train_features[feature])
    train_features[feature + '_train_decayed'] = train_weight * train_features[feature]

# Keep only the tract identifiers and the decayed train features
train_output_cols = ['SA1_MAINCO',
                     'SA1_7DIG16',
                     'Degree_train_decayed',
                     'Betweenness_train_decayed',
                     'Closeness_train_decayed']
train_features = train_features[train_output_cols]

## Demographic Features

In [10]:
# Read the 2006 demographic feature table
demo_features = read_table('Data/Demographic_Features/all_features_inter_2006.csv')

# Convert every column to a numeric dtype; with errors='coerce', values that
# cannot be parsed become NaN instead of raising
for column_name in demo_features.columns:
    coerced = pd.to_numeric(demo_features[column_name], errors='coerce')
    demo_features[column_name] = coerced

## Labels

In [11]:
# Read the gentrification labels file for the 2006-2016 period
labels_path = 'Data/Gentrification_Labels/labels_2006_2016.csv'
labels = read_table(labels_path)