# Calling libraries

In [1]:
import pandas as pd
import numpy as np
import re

# Functions

In [2]:
# This function takes latitude and longitude from two points and gives distance in miles
# 0.0175 is pi/180 (1 degree in radian)
# 3959 is earth radius in miles

def euclid(x1, y1, x2, y2):
    deltax = (x1 - x2) * 0.0175 * np.cos(y1 * 0.0175) * 3959
    deltay = (y1 - y2) * 0.0175 * 3959
    dist= np.sqrt(deltax**2 + deltay**2)
    return dist

In [5]:
# This function takes a string and process it as follow:
# 1. Takes out all the non-alphanumeric characters
# 2. Converts the spaces -of any length- into underscores
# 3. Lowercase alphabets

def unitext(text_in):
    text_01 = re.sub('[^a-zA-Z0-9\s]','',text_in) #Takes out all the non-alphanumeric characters
    text_02 = re.sub('\s+','_',text_01) # Converts the spaces -of any length- into underscores
    text_03 = str.lower(text_02)
    text_04 = re.sub('\_$','',text_03) # Trims ending space characters
    text_05 = re.sub('^\_','',text_04) # Trims begining space characters
    return text_05    

In [6]:
# DO NOT DELETE THIS UNTIL THE DATA READING FINALIZED - USE FOR TESTING
x = unitext('  $100,000     tU Uo $199,999  ............................................................')
x

'100000_tu_uo_199999'

# Reading raw spreadsheets

In [21]:
# CSVs
tab04_df = pd.read_csv('data/table_4.csv', encoding = "utf-8", names=list('abcdefghijklmnopqrst'))
tab05_df = pd.read_csv('data/table_5.csv')
tab12_df = pd.read_csv('data/tabula-12.csv')
tab13_df = pd.read_csv('data/tabula-13.csv')
tab071_df = pd.read_csv('data/table_71.csv')
tab072_df = pd.read_csv('data/table_72.csv')

In [459]:
# XLSX
zip_us_df = pd.read_excel('data/zip_code_database.xlsx')
hq_df = pd.read_excel('data/hqs.xlsx', header=None)
sold_df = pd.read_excel('data/18.xlsx')

In [509]:
hq_df = pd.read_excel('data/hqs.xlsx', header=None)

# tab04_df

Population and Households in counties

In [248]:
# Remove blank rows

#Replace NaNs with 0s
tab04_df.fillna(0, inplace=True)

# open a list for blank rows
blank_rows=[]

# this loop makes a list of blank rows
for i in np.arange(tab04_df.shape[0]):
    if tab04_df.iloc[i,0] == 0:
        blank_rows.append(i)
    else:
        tab04_df.iloc[i,0] = unitext(str(tab04_df.iloc[i,0]))
    
# delete blank rows
tab04_df.drop(blank_rows, inplace=True)

In [276]:
# Split the dataframe on original PDF page break
tab04_df_1 = tab04_df[77:]
tab04_df_2 = tab04_df[:77]

In [278]:
# Renaming columns

new_names=[]
for i in np.arange(tab04_df_1.shape[1]):
    if i < 8:
        snippet = 'Population_'
    else:
        snippet = 'Housing_'
    
    new_names.append(snippet+str(tab04_df_1.iloc[1,i])) 

tab04_df_1.columns=new_names

In [279]:
# Deleting extra columns
bad_cols=[]
for i in np.arange(tab04_df_1.shape[1]):
    if '_0' in tab04_df_1.columns.values[i]:
        bad_cols.append(i)

tab04_df_1b = tab04_df_1.drop(tab04_df_1.columns[bad_cols], axis=1)
tab04_df_1b.columns.values[0] = 'County'

In [280]:
# Deleting extra rows
bad_rows=[]
for i in np.arange(tab04_df_1b.shape[0]):
    if '_county' in tab04_df_1b.iloc[i,0]:
        continue
    else:
        bad_rows.append(i)
tab04_df_1b.drop(tab04_df_1b.index[bad_rows], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [281]:
tab04_df_1b['County'] = tab04_df_1b['County'].str.replace('_county','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [285]:
# Renaming columns

new_names=[]
for i in np.arange(tab04_df_2.shape[1]):
    if i < 7:
        snippet = 'Population_'
    else:
        snippet = 'Housing_'
    
    new_names.append(snippet+str(tab04_df_2.iloc[1,i])) 

tab04_df_2.columns=new_names

In [286]:
# Deleting extra columns
bad_cols=[]
for i in np.arange(tab04_df_2.shape[1]):
    if '_0' in tab04_df_2.columns.values[i]:
        bad_cols.append(i)

tab04_df_2b = tab04_df_2.drop(tab04_df_2.columns[bad_cols], axis=1)
tab04_df_2b.columns.values[0] = 'County'

In [287]:
# Deleting extra rows
bad_rows=[]
for i in np.arange(tab04_df_2b.shape[0]):
    if '_county' in tab04_df_2b.iloc[i,0]:
        continue
    else:
        bad_rows.append(i)
tab04_df_2b.drop(tab04_df_2b.index[bad_rows], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [288]:
tab04_df_2b['County'] = tab04_df_2b['County'].str.replace('_county','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [293]:
pop_hous_df = pd.concat([tab04_df_2b,tab04_df_1b])

pop_hous_df.to_csv('data/pop_hous_df.csv')

# tab05_df
change in population density

In [189]:
clean_tab05_df = tab05_df.drop(['Unnamed: 1', 'Housing'], axis=1)

In [190]:
tab05_df.head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Housing,in square,Population,unit,2000 to,1990 to,1980 to,2000 to.1,1990 to.1,1980 to.1
0,,Population,units,miles,density,density,2010,2000,1990,2010,2000,1990
1,Tennessee,6346105,2812133,"41,234 .90",153 .9,68 .2,11 .5,16 .7,6 .2,15 .3,20 .4,15 .9
2,Anderson County,75129,34717,337 .16,222 .8,103 .0,5 .3,4 .5,1 .3,7 .0,10 .7,13 .4


In [191]:
clean_tab05_df.columns = ['County', 'Land/sq.mi', 'Popul_Dens', 'Hous_Dens', 'Pop%00-10', 'Pop%90-00', 'Hous%80-90', 'Hous%00-10', 'Hous%90-00', 'Hous%80-90']

In [192]:
clean_tab05_df.drop([0,1,71,72], axis=0, inplace=True)

In [122]:
clean_tab05_df.drop([0,1,71,72], axis=0, inplace=True)

In [193]:
for i in np.arange(clean_tab05_df.shape[0]):
    clean_tab05_df.iloc[i,0] = unitext(clean_tab05_df.iloc[i,0])
    clean_tab05_df.iloc[i,0] = clean_tab05_df.iloc[i,0].replace('_county','')

In [131]:
x=–1

SyntaxError: invalid character in identifier (<ipython-input-131-1a5ac4908297>, line 1)

In [194]:
clean_tab05_df = clean_tab05_df.replace('–','0')

In [196]:
for i in np.arange(clean_tab05_df.shape[0]):
    for j in np.arange(1, clean_tab05_df.shape[1]):
        clean_tab05_df.iloc[i,j] = clean_tab05_df.iloc[i,j].replace('–','-')
        clean_tab05_df.iloc[i,j] = clean_tab05_df.iloc[i,j].replace(' ','')
        clean_tab05_df.iloc[i,j] = clean_tab05_df.iloc[i,j].replace(',','')
        clean_tab05_df.iloc[i,j] = float(clean_tab05_df.iloc[i,j])
        

In [153]:
clean_tab05_df.replace('–','-', inplace=True)
clean_tab05_df.replace(' ','', inplace=True)

In [197]:
clean_tab05_df

Unnamed: 0,County,Land/sq.mi,Popul_Dens,Hous_Dens,Pop%00-10,Pop%90-00,Hous%80-90,Hous%00-10,Hous%90-00,Hous%80-90.1
2,anderson,337.16,222.8,103,5.3,4.5,1.3,7,10.7,13.4
3,bedford,473.64,95.1,38.8,19.9,23.6,8.9,22.5,18.6,16.9
4,benton,394.14,41.8,22.8,-0.3,13.9,-2.5,4.4,20.9,8.9
5,bledsoe,406.42,31.7,14.1,4.1,27.9,2,11.2,36.4,10.7
6,blount,558.71,220.2,98.9,16.2,23.1,10.5,17.4,28.8,18.5
7,bradley,328.76,301,125.9,12.5,19.3,9.1,12.4,24.6,19.7
8,campbell,480.19,84.8,41.6,2.2,13.6,0.4,7.8,25,11.8
9,cannon,265.64,52,22.7,7.6,22.5,2.3,11.4,24.1,9.1
10,carroll,599.25,47.6,22,-3.2,7.1,-2.7,1,10.8,4.2
11,carter,341.2,168.3,81.3,1.2,10.2,2.6,7,19,12.8


In [198]:
clean_tab05_df.to_csv('data/density_pct.csv')

# tab071_df
% of rural population (split on page break)

In [73]:
# Removing extra columns
clean_tab071_df = tab071_df.drop(['Unnamed: 1', 'Unnamed: 2', 'Urban', 'Unnamed: 4',
       'Unnamed: 5', 'Rur ercenal', 'Unnamed: 7', 'Unnamed: 8'], axis = 1)

# Removing extra rows
clean_tab071_df.drop([0,1,2,3], axis = 0, inplace=True)

# Setting header
clean_tab071_df.columns = ['County', '% of Rural Population']

# Cleaning the data

# Remove junk characters, word 'county' and convert string to numbers

for i in np.arange(clean_tab071_df.shape[0]):
    clean_tab071_df.iloc[i,0] = unitext(clean_tab071_df.iloc[i,0]) 
    clean_tab071_df.iloc[i,0] = clean_tab071_df.iloc[i,0].replace('_county','')
    clean_tab071_df.iloc[i,1] = float(clean_tab071_df.iloc[i,1].replace(' ',''))
    

# tab072_df
% of rural population (split on page break)

In [87]:
# Same procedure as tab071_df above

clean_tab072_df = tab072_df.drop(['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'], axis = 1)

clean_tab072_df.drop([0,1], axis = 0, inplace=True)

clean_tab072_df.columns = ['County', '% of Rural Population']

for i in np.arange(clean_tab072_df.shape[0]):
    clean_tab072_df.iloc[i,0] = unitext(clean_tab072_df.iloc[i,0]) 
    clean_tab072_df.iloc[i,0] = clean_tab072_df.iloc[i,0].replace('_county','')
    clean_tab072_df.iloc[i,1] = float(clean_tab072_df.iloc[i,1].replace(' ','.'))


In [97]:
pct_rural = pd.concat([clean_tab071_df, clean_tab072_df])

pct_rural.to_csv('data/pct_rural_df.csv')

# zip_tn_df

In [428]:
# Filtering TN zipcodes
zip_tn_df = zip_us_df[zip_us_df['state'] == 'TN']

In [429]:
# Removing extra columns
zip_tn_df.drop(['type', 'decommissioned', 'state', 'timezone', 'area_codes','world_region', 'country', 'irs_estimated_population_2014'], axis= 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [430]:
# Aviod using 'zip' since it's a python reserved word
zip_tn_df.rename(columns={"zip": "zipcode"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [431]:
# There is a bad data in county column
zip_tn_df.isnull().sum()[zip_tn_df.isnull().sum() != 0]

acceptable_cities      672
unacceptable_cities    617
county                   1
dtype: int64

In [432]:
# Marking the NaN county
zip_tn_df['county'].fillna('?', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [435]:
# A search revealed the missing county is Obion
zip_tn_df.loc[16779,'county'] = 'obion'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [436]:
# Standardizing city and county names
for i in np.arange(zip_tn_df.shape[0]):
    zip_tn_df.iloc[i,1] = unitext(zip_tn_df.iloc[i,1])
    zip_tn_df.iloc[i,4] = unitext(zip_tn_df.iloc[i,4])
    zip_tn_df.iloc[i,4] = zip_tn_df.iloc[i,4].replace('_county','')
    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [440]:
zip_tn_df.to_csv('data/zip_tn.csv')

In [438]:
zip_tn_df[zip_tn_df['zipcode'] == 38227]

Unnamed: 0,zipcode,primary_city,acceptable_cities,unacceptable_cities,county,latitude,longitude
16779,38227,elbridge,,,obion,36.26,-89.32


# Industries

re index the column

add lat & long

### Create a list of Counties

calculate distance from each industry

In [487]:
zip_tn_df.head(3)

Unnamed: 0,zipcode,primary_city,acceptable_cities,unacceptable_cities,county,latitude,longitude
16164,37010,adams,,,montgomery,36.58,-87.06
16165,37011,antioch,,,davidson,36.04,-86.64
16166,37012,alexandria,,,dekalb,36.07,-86.03


In [513]:
hq_df.head(5)

Unnamed: 0,Company,City,Size,Lat,Long
0,FedEx,Memphis,357000,,
1,HCA,Nashville,210500,,
2,Community Health Systems,Franklin,108000,,
3,Cracker Barrel,Lebanon,73000,,
4,AutoZone,Memphis,66780,,


In [511]:
hq_df['lat'] = ''
hq_df['long'] = ''
hq_df.columns = ['idx', 'Company', 'City', 'Size', 'Lat', 'Long']
hq_df.drop(['idx'], axis=1, inplace=True)

In [514]:
bad_rows = list(hq_df[hq_df['City'].isnull()].index)

In [515]:
bad_rows

[19, 45, 80]

In [None]:
hq

In [480]:
hq_df['City'] = unitext(hq_df['City'])

TypeError: expected string or bytes-like object

# Housing Prices

remove extra rows

rename columns

standardize counties

In [442]:
sold_df

Unnamed: 0,Unnamed: 1.1,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,,2007,2008,2009,2010,2011,2012,2013,2014,2015,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
1,Column1,Column7,Column8,Column9,Column10,Column11,Column2,Column3,Column32,Column33,...,Column18,Column19,Column20,Column21,Column22,Column13,Column14,Column15,Column142,Column17
2,Tennessee,13248,12821,8345,7250,6591,8185,9615,8961,12295,...,196449,199950,182500,186500,195280,214000,239000,256200,264500,274900
3,,,,,,Number of Homes Sold,,,,,...,,,,,Median Sales Price,,,,,
4,Anderson,4,42,21,27,19,3,25,22,26,...,135950,176950,204000,279154,149900,171020,135000,172900,174900,209250
5,Bedford,116,100,56,29,17,12,11,36,51,...,140950,136250,134950,128900,124000,111950,109900,122200,138200,147000
6,Benton,2,7,3,3,1,1,0,3,0,...,249750,163500,108000,127000,300000,120000,0,191000,0,110000
7,Bledsoe,4,0,1,0,0,1,0,1,0,...,140500,0,104000,0,0,195000,0,129800,0,0
8,Blount,59,181,107,122,53,114,185,166,194,...,198500,193500,159900,169900,174000,180400,209900,214950,219900,222600
9,Bradley,169,173,86,85,86,78,102,89,126,...,176000,181500,172500,156000,150650,152000,169250,192900,189900,196108
