In [47]:
import warnings

import pandas as pd
import numpy as np
from openpyxl import load_workbook
from sklearn import preprocessing

from data_preprocessing import read_table, merge_str


In [22]:
# One seed value drives both NumPy's global RNG and the `random_state`
# constant defined for the rest of the notebook.
random_state = 0
np.random.seed(random_state)

# Dataset for a Binary Classification Task of Gentrified Tracts in Melbourne

This notebook was created to put together all the fields that are necessary to train a machine learning model into a single dataset. The training labels were obtained following Freeman's 5-step definition of gentrified areas.

# Table of Contents

1. [Data Assembly 2006-2016](#Assembly2006)


2. [Data Assembly 2011-2016](#Assembly2011)

   

In [3]:
def normalize(df, ft_list):
    """Add a min-max scaled version of each listed column to ``df``.

    For every column name in ``ft_list`` a new ``MinMaxScaler`` (default
    settings) is fitted on that single column, and the scaled values are
    stored under ``<column>_normalized``.

    Parameters
    ----------
    df : DataFrame holding the columns to scale. It is modified in place.
    ft_list : iterable of column names to scale.

    Returns
    -------
    The same ``df`` object, now carrying the extra ``*_normalized`` columns.
    """
    for column in ft_list:
        # Double brackets keep the selection 2-D, as the scaler requires.
        scaled_values = preprocessing.MinMaxScaler().fit_transform(df[[column]])
        df[column + '_normalized'] = scaled_values

    return df

In [4]:
def assembly_PT_ft(closest_stops_path, network_properties_file, network):
    """Build distance-decayed network features for one transport network.

    Steps:
      1. Read the table at ``closest_stops_path`` and replace the
         ``dist_closest_stop_<network>`` column with its reciprocal.
      2. Min-max normalise that reciprocal (see ``normalize``) to obtain a
         per-tract decay weight.
      3. Read the table at ``network_properties_file`` and left-join its
         ``Degree``/``Betweenness``/``Closeness`` columns onto the tracts,
         matching ``closest_stop_<network>`` against ``STOP_ID``.
      4. Multiply each property by the decay weight.

    Parameters
    ----------
    closest_stops_path : path handed to ``read_table``; the table must hold
        ``SA1_MAINCO``, ``SA1_7DIG16``, ``closest_stop_<network>`` and
        ``dist_closest_stop_<network>``.
    network_properties_file : path handed to ``read_table``; the table must
        hold ``STOP_ID``, ``Degree``, ``Betweenness`` and ``Closeness``.
    network : label used to build the column names (the notebook passes
        ``'Tram'`` and ``'Train'``).

    Returns
    -------
    DataFrame with ``SA1_MAINCO``, ``SA1_7DIG16`` and the three
    ``<property>_<network>_decayed`` columns.
    """
    dist_col = 'dist_closest_stop_' + network
    stop_col = 'closest_stop_' + network
    weight_col = dist_col + '_normalized'
    properties = ['Degree', 'Betweenness', 'Closeness']

    # Inverse distance acts as the decay function: closer stops weigh more.
    # NOTE(review): a distance of exactly 0 would give inf here -- confirm
    # the input distances are strictly positive.
    tracts = read_table(closest_stops_path)
    tracts[dist_col] = 1 / tracts[dist_col]

    # Adds the `<dist_col>_normalized` column used as the decay weight.
    tracts = normalize(tracts, [dist_col])

    # Attach the network properties of each tract's closest stop.
    stop_properties = read_table(network_properties_file)
    merged = merge_str(
        tracts[['SA1_MAINCO', 'SA1_7DIG16', stop_col, weight_col]],
        stop_properties[['STOP_ID'] + properties],
        stop_col,
        'STOP_ID',
        how='left',
    )

    # Weight every property of the closest stop by the tract's decay weight.
    decayed_cols = []
    for prop in properties:
        merged[prop] = pd.to_numeric(merged[prop])
        decayed_name = prop + '_' + network + '_decayed'
        merged[decayed_name] = merged[weight_col] * merged[prop]
        decayed_cols.append(decayed_name)

    return merged[['SA1_MAINCO', 'SA1_7DIG16'] + decayed_cols]

In [5]:
def features_join(tram_features, train_features, demo_features, labels):
    """Combine transport features, demographic features and labels per tract.

    The ``SA1_7DIG16`` column is dropped from both transport frames, then the
    frames are chained together with ``merge_str`` on ``SA1_MAINCO`` using
    ``'left'`` joins, in this order: tram, train, demographics, labels.

    Parameters
    ----------
    tram_features, train_features : frames containing ``SA1_MAINCO`` and
        ``SA1_7DIG16`` (as produced by ``assembly_PT_ft``).
    demo_features, labels : frames containing ``SA1_MAINCO``.

    Returns
    -------
    The result of the final ``merge_str`` call.
    """
    key = 'SA1_MAINCO'

    tram_part = tram_features.drop(columns=['SA1_7DIG16'])
    train_part = train_features.drop(columns=['SA1_7DIG16'])
    joined = merge_str(tram_part, train_part, key, key, 'left')

    # Demographics first, labels last -- the tram frame stays the left side.
    for right_frame in (demo_features, labels):
        joined = merge_str(joined, right_frame, key, key, 'left')

    return joined

## 2006 - 2016 Data Assembly <a name="Assembly2006"></a>

## Process Public Transport Features

### Tram 

In [6]:
# Tram network, 2006: decayed Degree/Betweenness/Closeness of each tract's
# closest tram stop.
tram_properties_file_2006 = 'Data/PT_Network_Features/tram_properties_all_years.xlsx'
closest_stops_path_2006 = 'Data/PT_Network_Features/tracts_closest_tram_train_stops_2006.csv'
tram_features_2006 = assembly_PT_ft(
    closest_stops_path=closest_stops_path_2006,
    network_properties_file=tram_properties_file_2006,
    network='Tram',
)

### Train

In [7]:
# Train network, 2006: same closest-stops table as the tram cell, combined
# with the 2006 train station properties.
train_properties_file_2006 = 'Data/PT_Network_Features/train_properties_2006.xlsx'
closest_stations_path_2006 = 'Data/PT_Network_Features/tracts_closest_tram_train_stops_2006.csv'
train_features_2006 = assembly_PT_ft(
    closest_stops_path=closest_stations_path_2006,
    network_properties_file=train_properties_file_2006,
    network='Train',
)

## Demographic Features

In [8]:
# Load both 2006 demographic tables and convert every column to a numeric
# dtype; values that cannot be parsed become NaN (errors='coerce').
# NOTE(review): every column is coerced, so if the SA1_MAINCO join key is
# among them it is converted too -- confirm merge_str still matches it.
demo_features_06_reduced = read_table(
    'Data/Demographic_Features/all_features_inter_2006_reduced.csv'
).apply(pd.to_numeric, errors='coerce')

demo_features_06_complete = read_table(
    'Data/Demographic_Features/all_demographic_features_2006.csv'
).apply(pd.to_numeric, errors='coerce')

## Labels

In [9]:
# Labels table for the 2006-2016 period; the join cell below uses only its
# SA1_MAINCO and Gentrified_06_16 columns.
labels_06 = read_table('Data/Gentrification_Labels/labels_2006_2016.csv')

## Dataset Join

In [10]:
# Two 2006-2016 datasets: identical transport features and labels, differing
# only in which demographic table (reduced vs. complete) is joined in.
reduced_06_16 = features_join(
    tram_features=tram_features_2006,
    train_features=train_features_2006,
    demo_features=demo_features_06_reduced,
    labels=labels_06[['SA1_MAINCO', 'Gentrified_06_16']],
)

complete_06_16 = features_join(
    tram_features=tram_features_2006,
    train_features=train_features_2006,
    demo_features=demo_features_06_complete,
    labels=labels_06[['SA1_MAINCO', 'Gentrified_06_16']],
)

In [11]:
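# Export of the 2006-2016 datasets is currently disabled; uncomment the two
# lines below to write the CSV files.
#reduced_06_16.to_csv('Data/dataset_reduced_06_16.csv')
#complete_06_16.to_csv('Data/dataset_complete_06_16.csv')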

## 2011-2016 Data Assembly <a name="Assembly2011"></a>

## Process Public Transport Features
### Tram

In [12]:
# Tram network for the 2011-2016 dataset.
# NOTE(review): the variable is suffixed `_2011` but the closest-stops file
# is the 2016 one -- confirm this is intended.
tram_properties_file_2011 = 'Data/PT_Network_Features/tram_properties_all_years.xlsx'
closest_stops_path_2011 = 'Data/PT_Network_Features/tracts_closest_tram_train_stops_2016.csv'
tram_features_2011 = assembly_PT_ft(
    closest_stops_path=closest_stops_path_2011,
    network_properties_file=tram_properties_file_2011,
    network='Tram',
)

In [13]:
# Train network for the 2011-2016 dataset.
# NOTE(review): the variables are suffixed `_2011` but both the closest-stops
# file and the train properties file are the 2016 ones -- confirm this is
# intended.
train_properties_file_2011 = 'Data/PT_Network_Features/train_properties_2016.xlsx'
closest_stations_path_2011 = 'Data/PT_Network_Features/tracts_closest_tram_train_stops_2016.csv'
train_features_2011 = assembly_PT_ft(
    closest_stops_path=closest_stations_path_2011,
    network_properties_file=train_properties_file_2011,
    network='Train',
)

## Demographic Features

In [14]:
# Load both 2011 demographic tables and convert every column to a numeric
# dtype; values that cannot be parsed become NaN (errors='coerce').
# NOTE(review): every column is coerced, so if the SA1_MAINCO join key is
# among them it is converted too -- confirm merge_str still matches it.
demo_features_11_reduced = read_table(
    'Data/Demographic_Features/all_features_inter_2011_reduced.csv'
).apply(pd.to_numeric, errors='coerce')

demo_features_11_complete = read_table(
    'Data/Demographic_Features/all_demo_features_2011.csv'
).apply(pd.to_numeric, errors='coerce')

## Labels

In [15]:
# Labels table for the 2011-2016 period; the join cell below uses only its
# SA1_MAINCO and Gentrified_11_16 columns.
labels_11 = read_table('Data/Gentrification_Labels/labels_2011_2016.csv')

## Dataset Join

In [16]:
# Two 2011-2016 datasets: identical transport features and labels, differing
# only in which demographic table (reduced vs. complete) is joined in.
reduced_11_16 = features_join(
    tram_features=tram_features_2011,
    train_features=train_features_2011,
    demo_features=demo_features_11_reduced,
    labels=labels_11[['SA1_MAINCO', 'Gentrified_11_16']],
)

complete_11_16 = features_join(
    tram_features=tram_features_2011,
    train_features=train_features_2011,
    demo_features=demo_features_11_complete,
    labels=labels_11[['SA1_MAINCO', 'Gentrified_11_16']],
)

In [17]:
# Write both 2011-2016 datasets to disk. `to_csv` is called with its default
# settings, so the DataFrame index is written as the first column.
for dataset, csv_path in (
    (reduced_11_16, 'Data/dataset_reduced_11_16.csv'),
    (complete_11_16, 'Data/dataset_complete_11_16.csv'),
):
    dataset.to_csv(csv_path)