In [1]:
import pandas
import numpy

In [6]:
# Definitions

def read_and_arrange_dataframe(path):
    """Load a CSV indexed by its ``shid`` column and drop fully-empty columns.

    Args:
        path: Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        A DataFrame indexed by ``shid`` with every column that contains only
        missing values removed.
    """
    return pandas.read_csv(path, index_col='shid').dropna(axis='columns', how='all')

def correlate(input_dataframe, min_periods=100):
    """Build a Pearson correlation matrix with its columns in reverse order.

    Args:
        input_dataframe: DataFrame whose columns are correlated pairwise.
        min_periods: Minimum number of paired observations needed for a
            coefficient; pairs with fewer are left as NaN by ``DataFrame.corr``.

    Returns:
        The correlation matrix with all-NaN rows and all-NaN columns removed,
        and the remaining columns listed in reverse of their original order
        (row order is unchanged).
    """
    pairwise = input_dataframe.corr(method='pearson', min_periods=min_periods)
    # Remove variables that never reached min_periods against any other variable.
    trimmed = pairwise.dropna(axis='index', how='all').dropna(axis='columns', how='all')
    flipped_columns = trimmed.columns.tolist()[::-1]
    return trimmed[flipped_columns]

def blank_diagonal(input_dataframe):
    """Blank the mirrored duplicates of a symmetric matrix, in place.

    Cells are visited row by row, left to right. The first cell seen for each
    unordered (row label, column label) pair keeps its value; every later cell
    carrying the same pair of labels is overwritten with NaN. A cell whose row
    and column labels are equal forms a pair of its own, so it is kept.

    Args:
        input_dataframe: DataFrame to modify in place. Labels must be
            hashable. Values are assumed to be floating point (as produced by
            ``DataFrame.corr``) so that NaN can be stored.

    Returns:
        None. ``input_dataframe`` itself is mutated.
    """
    found_pairs = set()
    for y_index in input_dataframe.index:
        for x_index in input_dataframe.columns:
            # frozenset makes (x, y) and (y, x) compare equal.
            key = frozenset((x_index, y_index))
            if key in found_pairs:
                # .at writes straight into the frame; chained indexing
                # (df[col][row] = ...) may write to a temporary copy instead.
                input_dataframe.at[y_index, x_index] = numpy.nan
            else:
                found_pairs.add(key)

In [None]:
# Omaha analysis
# Load the pivoted characteristics and correlate them, requiring at least
# 150 paired observations per coefficient.
omaha_path = 'omaha-council-bluffs-msa/'
omaha_input_dataframe = read_and_arrange_dataframe(
    omaha_path + 'pivoted-characteristics.csv'
)
omaha_correlated_dataframe = correlate(omaha_input_dataframe, min_periods=150)
omaha_correlated_dataframe

In [None]:
# Persist Omaha results
# Writes the Omaha correlation matrix to 'correlations.csv' under omaha_path.
# NOTE(review): unlike the NYC and San Francisco cells, blank_diagonal() is not
# applied before saving, so this file keeps both mirrored halves of the
# matrix — confirm that the difference is intentional.
omaha_correlated_dataframe.to_csv(omaha_path + 'correlations.csv')

In [None]:
# NYC analysis
# Load the pivoted characteristics and correlate them, requiring at least
# 1100 paired observations per coefficient.
nyc_path = 'new-york-five-boroughs/'
nyc_input_dataframe = read_and_arrange_dataframe(
    nyc_path + 'pivoted-characteristics.csv'
)
nyc_correlated_dataframe = correlate(nyc_input_dataframe, min_periods=1100)
nyc_correlated_dataframe

In [None]:
# Persist NYC results
# blank_diagonal() modifies nyc_correlated_dataframe in place, so after this
# cell runs the variable no longer matches the frame displayed by the NYC
# analysis cell. The blanked matrix is what gets written to disk.
blank_diagonal(nyc_correlated_dataframe)
nyc_correlated_dataframe.to_csv(nyc_path + 'correlations.csv')

In [4]:
# San Francisco Analysis
# Load the pivoted characteristics and correlate them, requiring at least
# 100 paired observations per coefficient.
sf_path = 'san-francisco/'
sf_input_dataframe = read_and_arrange_dataframe(
    sf_path + 'pivoted-characteristics.csv'
)
sf_correlated_dataframe = correlate(sf_input_dataframe, min_periods=100)
sf_correlated_dataframe

Unnamed: 0,3 - pop-total-total_pop,2 - houseunit-total-total_housing_units,1 - households__total_households,54 - lai-ht-transportation-costs,53 - lai__median_commute_distance,52 - pop-total-commute_mean_travel_time,51 - broadband-total-number_of_providers,50 - broadband-techtype-dsl,49 - broadband-techtype-cable,46 - broadband-techtype-fiber,...,10 - civilian-employment-health-diagnosis,9 - civilian-employment-art-entertain-sports,8 - civilian-employment-edu-library,7 - civilian-employment-legal,6 - civilian-employment-social-service,5 - civilian-employment-social-science,4 - civilian-employment-architecture-engineer,3 - civilian-employment-computer-math,2 - civilian-employment-finance,1 - civilian-employment-management
1 - civilian-employment-management,-0.135719,0.010022,0.008645,-0.204496,-0.354846,-0.101846,0.050618,0.099410,0.018674,0.043885,...,0.330677,0.483380,0.308076,0.636271,-0.034615,0.338448,0.235495,0.589684,0.645523,1.000000
2 - civilian-employment-finance,-0.007562,0.096914,0.100167,-0.279033,-0.363125,-0.051849,-0.021271,0.117628,-0.053017,-0.034643,...,0.110539,0.381925,0.204838,0.575854,-0.089704,0.160291,0.081648,0.447963,1.000000,0.645523
3 - civilian-employment-computer-math,-0.003992,0.097140,0.102917,-0.347663,-0.462184,-0.124807,0.217285,0.160534,0.151759,0.164744,...,0.104798,0.538644,0.316452,0.500156,0.025929,0.326122,0.064609,1.000000,0.447963,0.589684
4 - civilian-employment-architecture-engineer,-0.125435,-0.091602,-0.089351,-0.000966,-0.140175,-0.513608,-0.048145,0.006342,-0.037423,-0.039926,...,0.845816,0.135812,0.064287,0.056200,-0.049279,0.565025,1.000000,0.064609,0.081648,0.235495
5 - civilian-employment-social-science,-0.210522,-0.168556,-0.162053,0.008322,-0.114044,-0.286475,-0.004309,-0.080849,0.128878,-0.041946,...,0.638344,0.324888,0.206699,0.251297,0.058667,1.000000,0.565025,0.326122,0.160291,0.338448
6 - civilian-employment-social-service,-0.013505,-0.015884,-0.015993,-0.003287,-0.052578,0.000857,0.066128,-0.088039,0.101418,0.005025,...,-0.038115,0.066832,0.118885,0.001396,1.000000,0.058667,-0.049279,0.025929,-0.089704,-0.034615
7 - civilian-employment-legal,-0.054079,0.050567,0.052124,-0.220395,-0.333289,-0.015152,0.072002,0.047222,0.051644,0.054027,...,0.193124,0.472284,0.277707,1.000000,0.001396,0.251297,0.056200,0.500156,0.575854,0.636271
8 - civilian-employment-edu-library,-0.080279,-0.052130,-0.052575,0.047462,-0.007416,0.170318,-0.193444,-0.067543,-0.066930,-0.173183,...,0.140471,0.273368,1.000000,0.277707,0.118885,0.206699,0.064287,0.316452,0.204838,0.308076
9 - civilian-employment-art-entertain-sports,-0.000099,0.054780,0.067182,-0.281886,-0.436118,-0.023024,0.021226,0.092096,0.063566,-0.058877,...,0.160945,1.000000,0.273368,0.472284,0.066832,0.324888,0.135812,0.538644,0.381925,0.483380
10 - civilian-employment-health-diagnosis,-0.236651,-0.175352,-0.172906,0.102407,-0.017033,-0.409265,-0.117719,-0.063635,-0.073254,-0.081635,...,1.000000,0.160945,0.140471,0.193124,-0.038115,0.638344,0.845816,0.104798,0.110539,0.330677


In [7]:
# Persist San Francisco results
# blank_diagonal() modifies sf_correlated_dataframe in place, so after this
# cell runs the variable no longer matches the frame displayed by the San
# Francisco analysis cell. The blanked matrix is what gets written to disk.
blank_diagonal(sf_correlated_dataframe)
sf_correlated_dataframe.to_csv(sf_path + 'correlations.csv')