In [41]:
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# City Datasets

In [64]:
cities_dataset_path = './datasets/cities/'
counties_dataset_path = './datasets/counties/'


def load_area_datasets(dataset_path, area_kind):
    """Read the CSV files in ``dataset_path`` and group them by college tier.

    A file is assigned to a tier by its filename suffix:
    ``top-<area_kind>.csv``, ``mid-<area_kind>.csv`` or ``bot-<area_kind>.csv``.
    Any other hyphenated ``.csv`` file is kept in a separate "other" group
    (misc areas for further analysis).

    Parameters
    ----------
    dataset_path : str
        Directory that holds the CSV files.
    area_kind : str
        Area label used in the filename suffix, e.g. ``'city'`` or ``'county'``.

    Returns
    -------
    tuple of dict
        ``(top, mid, bot, other)``. Each dict maps the first two
        hyphen-separated parts of the filename (place-state) to the
        DataFrame read from that file.
    """
    tiers = {'top': dict(), 'mid': dict(), 'bot': dict()}
    other = dict()

    # os.listdir returns entries in arbitrary order; sort so the dictionaries
    # (and the printed keys) come out the same on every run.
    for file_name in sorted(os.listdir(dataset_path)):
        # Skip anything that is not a hyphenated CSV (e.g. stray system files)
        # instead of handing it to pd.read_csv.
        if not file_name.endswith('.csv') or '-' not in file_name:
            continue

        # filenames truncated to place-state format for simplicity
        truncated_file_name = '-'.join(file_name.split('-')[:2])
        frame = pd.read_csv(os.path.join(dataset_path, file_name))

        for tier, bucket in tiers.items():
            if file_name.endswith(tier + '-' + area_kind + '.csv'):
                bucket[truncated_file_name] = frame
                break
        else:
            # no tier suffix matched
            other[truncated_file_name] = frame

    return tiers['top'], tiers['mid'], tiers['bot'], other


# create separate dictionaries for each category of colleges that the cities are home to
top_cities, mid_cities, bot_cities, other_cities = load_area_datasets(cities_dataset_path, 'city')
print(top_cities.keys())
print(mid_cities.keys())
print(bot_cities.keys())
print(other_cities.keys())


# similar process for counties
top_counties, mid_counties, bot_counties, other_counties = load_area_datasets(counties_dataset_path, 'county')

# testing
print('============== TOP TEST ===============')
print(top_cities['stanford-ca'])
print('\n============== MID TEST ===============')
print(mid_cities['whitewater-wi'])
print('\n============== BOT TEST ===============')
print(bot_counties['westmoreland-pa'])

dict_keys(['cambridge-ma', 'newhaven-ct', 'stanford-ca'])
dict_keys(['jacksonville-il', 'westerville-oh', 'whitewater-wi'])
dict_keys(['greensburg-pa', 'redding-ca', 'sanbernardino-ca'])
dict_keys(['berkeley-ca'])
                                                 Fact  \
0        Population estimates, July 1, 2018,  (V2018)   
1   Population estimates base, April 1, 2010,  (V2...   
2   Population, percent change - April 1, 2010 (es...   
3                   Population, Census, April 1, 2010   
4                      Persons under 5 years, percent   
5                     Persons under 18 years, percent   
6                  Persons 65 years and over, percent   
7                             Female persons, percent   
8                                White alone, percent   
9            Black or African American alone, percent   
10   American Indian and Alaska Native alone, percent   
11                               Asian alone, percent   
12  Native Hawaiian and Other Pacific Islande

# Data Cleaning

The first part of the data we wanted to clean was simply the name of the column that holds all the statistic values. The name was originally just the name of the county/city. We believe that changing it to "Fact Value" (given that the column with the name of the statistic is "Fact") would be clearer.

In [71]:
# Collect every tier dictionary so the same clean-up runs over all of them.
all_csv = [
    top_cities, mid_cities, bot_cities, other_cities,
    top_counties, mid_counties, bot_counties, other_counties,
]

# Give the third column of every table one shared label.
for area_group in all_csv:
    for frame in area_group.values():
        value_column = frame.columns[2]
        # index=str also converts the row labels to strings; inplace=True
        # mutates the frame stored in the dictionary.
        frame.rename(index=str, columns={value_column: 'Fact Value'}, inplace=True)

# Spot-check one table to confirm the rename took effect.
assert top_cities['cambridge-ma'].columns[2] == 'Fact Value'

Currently, the column that should hold the fact value is of type object. We want this to be converted to ints so that we do not need to constantly type cast.

We also don't need all of these rows, since we are only interested in the demographic data.

In [72]:
# Show the column dtypes of one table before any conversion.
print(top_cities['stanford-ca'].dtypes)

# NOTE(review): the body of this nested loop is missing from the cell as
# captured here, so the cell cannot run as written. Restore the intended
# per-table step before relying on this cell.
for d in all_csv:
    for k, v in d.items():

Fact                                        object
Fact Note                                   object
Fact Value                                  object
Value Note for Stanford CDP, California    float64
dtype: object


As seen in the above output, there are a couple of entirely or largely unneeded columns in the data where all the values are either NaN or irrelevant. 

We will be dropping or modifying the following columns to clean the data:

Fact Note:

This indicates one of the following:
(a) Includes persons reporting only one race
(b) Hispanics may be of any race, so also are included in applicable race categories
(c) Economic Census - Puerto Rico data are not comparable to U.S. Economic Census Data.

As none of these factors influence the demographic data of the areas we are analyzing, we can remove this data.

Value Note: 

This indicates one of the following:
(-) Either no or too few sample observations were available to compute an estimate, or a ratio of medians cannot be calculated because one or both of the median estimates falls in the lowest or upper interval of an open ended distribution.		
(D)	Suppressed to avoid disclosure of confidential information		
(F)	Fewer than 25 firms		
(FN) Footnote on this item in place of data		
(NA) Not available		
(S)	Suppressed; does not meet publication standards		
(X)	Not applicable		
(Z)	Value greater than zero but less than half unit of measure shown

Value notes are indicated in the fact value column so we do not need the value note column at all.