In [None]:
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from collections import OrderedDict

# Data Wrangling

All data on cities and counties were sourced from https://www.census.gov/quickfacts.

Because we will be analyzing the impact, if any, of variously ranked colleges on the respective cities and counties they are located in, we want the data to be organized based on ranking, allowing us to easily isolate the analysis.

The U.S. Census Bureau provides us tables with the following data.

In [None]:
cities_dataset_path = './datasets/cities/'
counties_dataset_path = './datasets/counties/'


def _load_ranked_tables(dataset_path, area_kind):
    """Read every CSV in `dataset_path` and group the tables by filename suffix.

    A file whose name ends with '<tier>-<area_kind>.csv' (tier is 'top', 'mid'
    or 'bot') goes into that tier's dict. Any other file whose name contains
    '-' goes into the 'other' dict (misc areas for further analysis); files
    without '-' are ignored.

    Keys are the first two '-'-separated parts of the filename, re-joined with
    '-', so tables can be looked up without the suffix.

    Parameters
    ----------
    dataset_path : str
        Directory holding the CSV files.
    area_kind : str
        'city' or 'county'; the last word of the filename suffix.

    Returns
    -------
    tuple of dict
        (top, mid, bot, other), each mapping truncated name -> DataFrame.
    """
    tiers = ('top', 'mid', 'bot')
    tables = {group: dict() for group in tiers + ('other',)}
    # os.listdir returns names in arbitrary order; sort so the resulting dicts
    # (and everything derived from their iteration order) are reproducible.
    for file_name in sorted(os.listdir(dataset_path)):
        group = next(
            (tier for tier in tiers
             if file_name.endswith(tier + '-' + area_kind + '.csv')),
            None,
        )
        if group is None and '-' in file_name:
            group = 'other'
        if group is None:
            continue
        # filenames truncated to <place>-<state> format for simplicity
        truncated_file_name = '-'.join(file_name.split('-')[:2])
        tables[group][truncated_file_name] = pd.read_csv(
            os.path.join(dataset_path, file_name)
        )
    return tables['top'], tables['mid'], tables['bot'], tables['other']


# separate dictionaries for each category of colleges that the cities are home to
top_cities, mid_cities, bot_cities, other_cities = _load_ranked_tables(
    cities_dataset_path, 'city'
)

# same process for counties
top_counties, mid_counties, bot_counties, other_counties = _load_ranked_tables(
    counties_dataset_path, 'county'
)

# testing
print('============== TOP TEST ===============')
print(top_cities['stanford-ca'])
print('\n============== MID TEST ===============')
print(mid_cities['whitewater-wi'])
print('\n============== BOT TEST ===============')
print(bot_counties['westmoreland-pa'])

# Data Cleaning

The first part of the data we wanted to clean was simply the name of the column that holds all the statistic values. The name was originally just the name of the county/city. We believe that changing it to "Fact Value" (given that the column with the name of the statistic is "Fact") would be clearer.

In [None]:
# Every table dictionary, cities first and then counties; the cleaning cells
# below iterate over this list so each step is applied to all tables.
all_csv = [top_cities, mid_cities, bot_cities, other_cities, top_counties, mid_counties, bot_counties, other_counties]
for tables in all_csv:
    for table in tables.values():
        # Rename only the third column (originally named after the place).
        # index=str is deliberately not passed: it would also convert every
        # row label to a string, which nothing downstream needs.
        table.rename(columns = {table.columns[2] : 'Fact Value'}, inplace = True)

assert top_cities['cambridge-ma'].columns[2] == 'Fact Value'

Currently, the column that should hold the fact value is of type object (str). We want this to be converted to floats so that we do not need to constantly type cast when performing our analysis.

To do this, we first need to delete the rows whose fact value indicates that the statistic is not usable, such as NA for "not available". Further descriptions of the invalid value strings are given in a following section.

In [None]:
assert len(bot_cities['redding-ca'].loc[bot_cities['redding-ca']['Fact Value'] == 'X']) != 0
# Placeholder codes used in the Fact Value column instead of a number.
invalid_values = ['X', 'NA', 'D', '-', 'FN', 'F', 'S', 'Z']
for tables in all_csv:
    for table in tables.values():
        # Build one boolean mask and drop all flagged rows at once, rather than
        # dropping rows one by one from the frame while iterating over it.
        is_invalid = table['Fact Value'].isin(invalid_values)
        # Drop in place so the dicts in all_csv keep pointing at the same frames.
        table.drop(table.index[is_invalid], inplace = True)

assert len(bot_cities['redding-ca'].loc[bot_cities['redding-ca']['Fact Value'] == 'X']) == 0

Now, we can go through the Fact Value column, strip the percent sign, dollar sign and double-quote characters, remove the commas, and retype each value from str to float64. We print out the pre- and post-cleaning types of the Fact Value column to verify our code.

In [None]:
print('Before cleaning:')
print(mid_counties['walworth-wi']['Fact Value'].dtypes)

for tables in all_csv:
    for table in tables.values():
        cleaned = (
            table['Fact Value']
            .str.strip('%"$')       # leading/trailing unit and quote characters
            .str.replace(',', '')   # commas inside the number
            .astype('float64')
        )
        # Assign the whole column rather than writing through .loc[:, col]:
        # on pandas >= 2.0 the .loc form sets values in place and keeps the
        # column's object dtype, so the conversion would silently not stick.
        table['Fact Value'] = cleaned

print('\nAfter cleaning:')
print(mid_counties['walworth-wi']['Fact Value'].dtypes)

As seen in the initial output of our data wrangling, there are a couple of entirely or largely unneeded *columns* in the data where all the values are either NaN or irrelevant. 

We will be dropping or modifying the following columns to clean the data:

###### Fact Note:

This indicates:

**(a)** Includes persons reporting only one race

**(b)** Hispanics may be of any race, so also are included in applicable race categories

**(c)** Economic Census - Puerto Rico data are not comparable to U.S. Economic Census Data

As none of these factors influence the demographic data of the areas we are analyzing, we can remove this data.

###### Value Note 

This indicates: 

**(-)** Either no or too few sample observations were available to compute an estimate, or a ratio of medians cannot be calculated because one or both of the median estimates falls in the lowest or upper interval of an open ended distribution.	

**(D)**	Suppressed to avoid disclosure of confidential information	

**(F)**	Fewer than 25 firms		

**(FN)** Footnote on this item in place of data	

**(NA)** Not available		

**(S)**	Suppressed; does not meet publication standards

**(X)**	Not applicable

**(Z)**	Value greater than zero but less than half unit of measure shown

Value notes are indicated in the Fact Value column so we do not need the Value Note column at all.

In [None]:
assert top_cities['stanford-ca'].shape == (67, 4)

for tables in all_csv:
    for table in tables.values():
        # Columns at positions 1 and 3 are removed; presumably these are the
        # Fact Note and Value Note columns described in the text above.
        note_columns = [table.columns[1], table.columns[3]]
        table.drop(columns = note_columns, inplace = True)

assert top_cities['stanford-ca'].shape == (67, 2)

We also observe that there are many *rows* that have a NaN value.

In [None]:
# Report, per table, how many NaN entries each column contains.
for tables in all_csv:
    for name, table in tables.items():
        nan_counts = table.isna().sum()
        print('NaN values in each column for ' + name + ':', nan_counts, sep = '\n')

These NaN rows are not useful, so we will drop these rows entirely.

In [None]:
# Remove every row containing a NaN, then re-print the per-column NaN counts
# so the output shows what is left.
for tables in all_csv:
    for name, table in tables.items():
        table.dropna(inplace = True)
        remaining_nans = table.isna().sum()
        print('NaN values in each column for ' + name + ':', remaining_nans, sep = '\n')

Because our analysis will only need statistics about demographics, we can drop all rows with irrelevant statistics. We decided to do this last so that, in case we do need to use other statistics, the data for that row will already be cleaned up to this point.

In [None]:
# The only facts kept for the analysis, in the order used by later cells.
demographic_rows = [
    'White alone, percent',
    'Black or African American alone, percent',
    'American Indian and Alaska Native alone, percent',
    'Asian alone, percent',
    'Native Hawaiian and Other Pacific Islander alone, percent',
    'Two or More Races, percent',
    'Hispanic or Latino, percent',
    'White alone, not Hispanic or Latino, percent',
]

for tables in all_csv:
    for name in list(tables):
        table = tables[name]
        is_demographic = table['Fact'].isin(demographic_rows)
        # reset index to start from 0 since rows before may have been dropped
        tables[name] = table.loc[is_demographic].reset_index(drop = True)

# Every table must now hold 7 or 8 demographic rows and exactly two columns.
for tables in all_csv:
    for table in tables.values():
        assert table.shape in ((7, 2), (8, 2))

We are now left with dataframes that are of shape either (7, 2) or (8, 2), have no NaN values, have values that are ready to analyze and work with (float type) and only contain relevant demographic statistics.

#### Here is the final cleaned data:

In [None]:
# Print each cleaned table under a "<name> | shape: (rows, cols)" header,
# followed by a blank separator.
for tables in all_csv:
    for name, table in tables.items():
        print('{} | shape: {}'.format(name, table.shape))
        print(table)
        print('\n')

#### Let's convert the data from dictionaries to dataframes. 

In [None]:
#reorder cities dictionaries to correspond with rankings
# Each dict is rebuilt as an OrderedDict holding only the listed keys, in the
# listed order; the original name is then rebound to the ordered version.

order_top_cities = OrderedDict(
    (city, top_cities[city])
    for city in ('cambridge-ma', 'stanford-ca', 'newhaven-ct')
)
top_cities = order_top_cities

order_mid_cities = OrderedDict(
    (city, mid_cities[city])
    for city in ('westerville-oh', 'whitewater-wi', 'jacksonville-il')
)
mid_cities = order_mid_cities

order_bot_cities = OrderedDict(
    (city, bot_cities[city])
    for city in ('greensburg-pa', 'sanbernardino-ca', 'redding-ca')
)
bot_cities = order_bot_cities

In [None]:
#convert all city csv to dataframe
columns = demographic_rows

city_csv = [top_cities, mid_cities, bot_cities, other_cities]

# One entry per city: its eight demographic values in demographic_rows order.
city_values = OrderedDict()
for tables in city_csv:
    for name, table in tables.items():
        # Align by fact name rather than by row position. The tables hold 7 or
        # 8 rows; copying a 7-row table positionally shifts every value after
        # the missing fact into the wrong column and leaves the last one NaN.
        by_fact = table.set_index('Fact')['Fact Value'].astype('float64')
        # if a value is not present, we set it equal to zero
        city_values[name] = by_fact.reindex(demographic_rows).fillna(0).values

# Rows are cities; columns are positions 0-7 matching demographic_rows.
df_cities = pd.DataFrame(city_values).transpose()
df_cities

In [None]:
#reorder county dictionaries to correspond with rankings
# Each dict is rebuilt as an OrderedDict holding only the listed keys, in the
# listed order; the original name is then rebound to the ordered version.

order_top_counties = OrderedDict(
    (county, top_counties[county])
    for county in ('middlesex-ma', 'santaclara-ca', 'newhaven-ct')
)
top_counties = order_top_counties

order_mid_counties = OrderedDict(
    (county, mid_counties[county])
    for county in ('delaware-oh', 'walworth-wi', 'morgan-il')
)
mid_counties = order_mid_counties

order_bot_counties = OrderedDict(
    (county, bot_counties[county])
    for county in ('westmoreland-pa', 'sanbernardino-ca', 'shasta-ca')
)
bot_counties = order_bot_counties

In [None]:
#convert all county csv to dataframe
county_csv = [top_counties, mid_counties, bot_counties, other_counties]

# One entry per county: its eight demographic values in demographic_rows order.
county_values = OrderedDict()
for tables in county_csv:
    for name, table in tables.items():
        # Align by fact name instead of assuming the missing row is always at
        # position 4; a fact absent from the table becomes 0. This also avoids
        # using a flag variable named `bool`, which shadowed the builtin for
        # the rest of the notebook.
        by_fact = table.set_index('Fact')['Fact Value'].astype('float64')
        county_values[name] = by_fact.reindex(demographic_rows).fillna(0).values

# Rows are counties; columns are positions 0-7 matching demographic_rows.
df_counties = pd.DataFrame(county_values).transpose()
df_counties

In [None]:
# Divide every value by 100 and round to two decimals (the facts are all
# "... percent" rows, so this presumably turns percentages into fractions).
for indexr, city in df_cities.iterrows():
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for indexc, newvalue in city.items():
        # Only values whose text form contains '.' are rescaled; NaN prints as
        # 'nan' and is therefore left untouched.
        if "." in str(newvalue):
            df_cities.loc[indexr,indexc] = round(float(newvalue) / 100, 2)

# Short labels for the eight positional columns, in demographic_rows order.
df_cities.columns = ["White all", "African American", "Native American", "Asian", "Pacific Islander", "Multiracial", "Hispanic", "White"]
# Remainder after subtracting the listed groups from 1; "Hispanic" is not part
# of the subtraction.
df_cities['Unknown'] = (
    1
    - df_cities["White"]
    - df_cities["African American"]
    - df_cities["Native American"]
    - df_cities["Asian"]
    - df_cities["Pacific Islander"]
    - df_cities["Multiracial"]
)
df_cities = df_cities.drop("White all", axis=1)

# Final column order.
cols = ['African American', 'Asian', 'Hispanic', 'Multiracial',
       'Native American', 'Pacific Islander', 'Unknown', 'White']
df_cities = df_cities[cols]
df_cities

In [None]:
# Divide every value by 100 and round to two decimals (the facts are all
# "... percent" rows, so this presumably turns percentages into fractions).
for indexr, county in df_counties.iterrows():
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for indexc, newvalue in county.items():
        # Only values whose text form contains '.' are rescaled; NaN prints as
        # 'nan' and is therefore left untouched.
        if "." in str(newvalue):
            df_counties.loc[indexr,indexc] = round(float(newvalue) / 100, 2)

# Short labels for the eight positional columns, in demographic_rows order.
df_counties.columns = ["White all", "African American", "Native American", "Asian", "Pacific Islander", "Multiracial", "Hispanic", "White"]
# Remainder after subtracting the listed groups from 1; "Hispanic" is not part
# of the subtraction.
df_counties['Unknown'] = (
    1
    - df_counties["White"]
    - df_counties["African American"]
    - df_counties["Native American"]
    - df_counties["Asian"]
    - df_counties["Pacific Islander"]
    - df_counties["Multiracial"]
)
df_counties = df_counties.drop("White all", axis=1)
# Final column order.
cols = ['African American', 'Asian', 'Hispanic', 'Multiracial',
       'Native American', 'Pacific Islander', 'Unknown', 'White']
df_counties = df_counties[cols]
df_counties