# Calling libraries

In [2]:
import pandas as pd
import numpy as np
import re

# Functions

In [3]:
# This function takes latitude and longitude from two points and gives distance in miles
# 0.0175 is pi/180 (1 degree in radian)
# 3959 is earth radius in miles

def euclid(y1, x1, y2, x2):
    deltax = (x1 - x2) * 0.0175 * np.cos(y1 * 0.0175) * 3959
    deltay = (y1 - y2) * 0.0175 * 3959
    dist= np.sqrt(deltax**2 + deltay**2)
    return dist

In [4]:
# This function takes a string and process it as follow:
# 1. Takes out all the non-alphanumeric characters
# 2. Converts the spaces -of any length- into underscores
# 3. Lowercase alphabets
# Note: NANs converted into zero-length string to avoid error

def unitext(text_in):
    if pd.isnull(text_in) == False:
        text_in == text_in
    else:
        text_in = ' '
    
    text_01 = re.sub('[^a-zA-Z0-9\s]','',text_in) #Takes out all the non-alphanumeric characters
    text_02 = re.sub('\s+','_',text_01) # Converts the spaces -of any length- into underscores
    text_03 = str.lower(text_02)
    text_04 = re.sub('\_$','',text_03) # Trims ending space characters
    text_05 = re.sub('^\_','',text_04) # Trims begining space characters
    return text_05    

In [585]:
# DO NOT DELETE THIS UNTIL THE DATA READING FINALIZED - USE FOR TESTING


#                                           Clark     Memph.    Chatt      Knox
# Memphis     35.076616, -90.082608         168.35.    -         258.96    
# Clarksville 36.521391, -87.385077                              145.88    195.66
# Chattanooga 35.003217, -85.257926                                        94.55
# Knoxville   35.915572, -83.916165

euclid(35.915572, -83.916165, 35.003217, -85.257926)

98.233907077521295

# Reading raw spreadsheets

In [5]:
# CSVs
tab04_df = pd.read_csv('data/table_4.csv', encoding = "utf-8", names=list('abcdefghijklmnopqrst'))
tab05_df = pd.read_csv('data/table_5.csv')
tab12_df = pd.read_csv('data/tabula-12.csv', header=None)
tab13_df = pd.read_csv('data/tabula-13.csv', header=None)
tab071_df = pd.read_csv('data/table_71.csv')
tab072_df = pd.read_csv('data/table_72.csv')

In [6]:
# XLSX
zip_us_df = pd.read_excel('data/zip_code_database.xlsx')
hq_df = pd.read_excel('data/hqs.xlsx', header=None)
sold_df = pd.read_excel('data/18.xlsx')

# tab04_df

Population and Households in counties

In [248]:
# Remove blank rows

#Replace NaNs with 0s
tab04_df.fillna(0, inplace=True)

# open a list for blank rows
blank_rows=[]

# this loop makes a list of blank rows
for i in np.arange(tab04_df.shape[0]):
    if tab04_df.iloc[i,0] == 0:
        blank_rows.append(i)
    else:
        tab04_df.iloc[i,0] = unitext(str(tab04_df.iloc[i,0]))
    
# delete blank rows
tab04_df.drop(blank_rows, inplace=True)

In [276]:
# Split the dataframe on original PDF page break
tab04_df_1 = tab04_df[77:]
tab04_df_2 = tab04_df[:77]

In [278]:
# Renaming columns

new_names=[]
for i in np.arange(tab04_df_1.shape[1]):
    if i < 8:
        snippet = 'Population_'
    else:
        snippet = 'Housing_'
    
    new_names.append(snippet+str(tab04_df_1.iloc[1,i])) 

tab04_df_1.columns=new_names

In [279]:
# Deleting extra columns
bad_cols=[]
for i in np.arange(tab04_df_1.shape[1]):
    if '_0' in tab04_df_1.columns.values[i]:
        bad_cols.append(i)

tab04_df_1b = tab04_df_1.drop(tab04_df_1.columns[bad_cols], axis=1)
tab04_df_1b.columns.values[0] = 'County'

In [280]:
# Deleting extra rows
bad_rows=[]
for i in np.arange(tab04_df_1b.shape[0]):
    if '_county' in tab04_df_1b.iloc[i,0]:
        continue
    else:
        bad_rows.append(i)
tab04_df_1b.drop(tab04_df_1b.index[bad_rows], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [281]:
tab04_df_1b['County'] = tab04_df_1b['County'].str.replace('_county','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [285]:
# Renaming columns

new_names=[]
for i in np.arange(tab04_df_2.shape[1]):
    if i < 7:
        snippet = 'Population_'
    else:
        snippet = 'Housing_'
    
    new_names.append(snippet+str(tab04_df_2.iloc[1,i])) 

tab04_df_2.columns=new_names

In [286]:
# Deleting extra columns
bad_cols=[]
for i in np.arange(tab04_df_2.shape[1]):
    if '_0' in tab04_df_2.columns.values[i]:
        bad_cols.append(i)

tab04_df_2b = tab04_df_2.drop(tab04_df_2.columns[bad_cols], axis=1)
tab04_df_2b.columns.values[0] = 'County'

In [287]:
# Deleting extra rows
bad_rows=[]
for i in np.arange(tab04_df_2b.shape[0]):
    if '_county' in tab04_df_2b.iloc[i,0]:
        continue
    else:
        bad_rows.append(i)
tab04_df_2b.drop(tab04_df_2b.index[bad_rows], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [288]:
tab04_df_2b['County'] = tab04_df_2b['County'].str.replace('_county','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [293]:
pop_hous_df = pd.concat([tab04_df_2b,tab04_df_1b])

pop_hous_df.to_csv('data/pop_hous_df.csv')

# tab05_df
change in population density

In [189]:
clean_tab05_df = tab05_df.drop(['Unnamed: 1', 'Housing'], axis=1)

In [191]:
clean_tab05_df.columns = ['County', 'Land/sq.mi', 'Popul_Dens', 'Hous_Dens', 'Pop%00-10', 'Pop%90-00', 'Hous%80-90', 'Hous%00-10', 'Hous%90-00', 'Hous%80-90']

In [192]:
clean_tab05_df.drop([0,1,71,72], axis=0, inplace=True)

In [122]:
clean_tab05_df.drop([0,1,71,72], axis=0, inplace=True)

In [193]:
for i in np.arange(clean_tab05_df.shape[0]):
    clean_tab05_df.iloc[i,0] = unitext(clean_tab05_df.iloc[i,0])
    clean_tab05_df.iloc[i,0] = clean_tab05_df.iloc[i,0].replace('_county','')

In [194]:
clean_tab05_df = clean_tab05_df.replace('–','0')

In [196]:
for i in np.arange(clean_tab05_df.shape[0]):
    for j in np.arange(1, clean_tab05_df.shape[1]):
        clean_tab05_df.iloc[i,j] = clean_tab05_df.iloc[i,j].replace('–','-')
        clean_tab05_df.iloc[i,j] = clean_tab05_df.iloc[i,j].replace(' ','')
        clean_tab05_df.iloc[i,j] = clean_tab05_df.iloc[i,j].replace(',','')
        clean_tab05_df.iloc[i,j] = float(clean_tab05_df.iloc[i,j])
        

In [153]:
clean_tab05_df.replace('–','-', inplace=True)
clean_tab05_df.replace(' ','', inplace=True)

In [198]:
clean_tab05_df.to_csv('data/density_pct.csv')

# tab071_df
% of rural population (split on page break)

In [73]:
# Removing extra columns
clean_tab071_df = tab071_df.drop(['Unnamed: 1', 'Unnamed: 2', 'Urban', 'Unnamed: 4',
       'Unnamed: 5', 'Rur ercenal', 'Unnamed: 7', 'Unnamed: 8'], axis = 1)

# Removing extra rows
clean_tab071_df.drop([0,1,2,3], axis = 0, inplace=True)

# Setting header
clean_tab071_df.columns = ['County', '% of Rural Population']

# Cleaning the data

# Remove junk characters, word 'county' and convert string to numbers

for i in np.arange(clean_tab071_df.shape[0]):
    clean_tab071_df.iloc[i,0] = unitext(clean_tab071_df.iloc[i,0]) 
    clean_tab071_df.iloc[i,0] = clean_tab071_df.iloc[i,0].replace('_county','')
    clean_tab071_df.iloc[i,1] = float(clean_tab071_df.iloc[i,1].replace(' ',''))
    

# tab072_df
% of rural population (split on page break)

In [87]:
# Same procedure as tab071_df above

clean_tab072_df = tab072_df.drop(['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'], axis = 1)

clean_tab072_df.drop([0,1], axis = 0, inplace=True)

clean_tab072_df.columns = ['County', '% of Rural Population']

for i in np.arange(clean_tab072_df.shape[0]):
    clean_tab072_df.iloc[i,0] = unitext(clean_tab072_df.iloc[i,0]) 
    clean_tab072_df.iloc[i,0] = clean_tab072_df.iloc[i,0].replace('_county','')
    clean_tab072_df.iloc[i,1] = float(clean_tab072_df.iloc[i,1].replace(' ','.'))


In [97]:
pct_rural = pd.concat([clean_tab071_df, clean_tab072_df])

pct_rural.to_csv('data/pct_rural_df.csv')

# zip_tn_df

In [428]:
# Filtering TN zipcodes
zip_tn_df = zip_us_df[zip_us_df['state'] == 'TN']

In [429]:
# Removing extra columns
zip_tn_df.drop(['type', 'decommissioned', 'state', 'timezone', 'area_codes','world_region', 'country', 'irs_estimated_population_2014'], axis= 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [430]:
# Aviod using 'zip' since it's a python reserved word
zip_tn_df.rename(columns={"zip": "zipcode"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [431]:
# There is a bad data in county column
zip_tn_df.isnull().sum()[zip_tn_df.isnull().sum() != 0]

acceptable_cities      672
unacceptable_cities    617
county                   1
dtype: int64

In [432]:
# Marking the NaN county
zip_tn_df['county'].fillna('?', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [435]:
# A search revealed the missing county is Obion
zip_tn_df.loc[16779,'county'] = 'obion'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [436]:
# Standardizing city and county names
for i in np.arange(zip_tn_df.shape[0]):
    zip_tn_df.iloc[i,1] = unitext(zip_tn_df.iloc[i,1])
    zip_tn_df.iloc[i,4] = unitext(zip_tn_df.iloc[i,4])
    zip_tn_df.iloc[i,4] = zip_tn_df.iloc[i,4].replace('_county','')
    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [440]:
zip_tn_df.to_csv('data/zip_tn.csv')

# hq_df
### industries

add lat & long

### Create a list of Counties with average distance from HQs

calculate distance from each industry

In [511]:
# Cleaning the headers
hq_df['lat'] = ''
hq_df['long'] = ''
hq_df.columns = ['idx', 'Company', 'City', 'Size', 'Lat', 'Long']
hq_df.drop(['idx'], axis=1, inplace=True)

In [514]:
# Some rows lack data. Removed
bad_rows = list(hq_df[hq_df['City'].isnull()].index)

hq_df.drop(bad_rows, axis=0, inplace=True)

In [517]:
hq_df.drop(bad_rows, axis=0, inplace=True)

In [529]:
for i in np.arange(hq_df.shape[0]):
    hq_df.iloc[i,1] = unitext(hq_df.iloc[i,1])

In [538]:
hq_coor = pd.merge(hq_df, zip_tn_df, how='left', left_on='City', right_on='primary_city')

In [539]:
hq_coor.drop_duplicates(subset='Company', keep='first', inplace=True)

In [540]:
hq_coor.columns

Index(['Company', 'City', 'Size', 'Lat', 'Long', 'zipcode', 'primary_city',
       'acceptable_cities', 'unacceptable_cities', 'county', 'latitude',
       'longitude'],
      dtype='object')

In [541]:
hq_coor.drop(['City', 'Size', 'Lat', 'Long', 'zipcode', 'primary_city','acceptable_cities', 'unacceptable_cities', 'county'], axis=1, inplace=True)

In [553]:
bad_rows = list(hq_coor[hq_coor['latitude'].isnull() | hq_coor['longitude'].isnull()].index)
hq_coor.drop(bad_rows, axis=0, inplace=True)

In [554]:
hq_coor.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 0 to 2754
Data columns (total 3 columns):
Company      96 non-null object
latitude     96 non-null float64
longitude    96 non-null float64
dtypes: float64(2), object(1)
memory usage: 3.0+ KB


In [555]:
county_coor = zip_tn_df.drop_duplicates(subset='county', keep='first')

In [558]:
county_coor.drop(['zipcode', 'primary_city', 'acceptable_cities', 'unacceptable_cities'], axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [561]:
county_coor.sort_values(by='county', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [563]:
county_coor['Avg_Dist'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [589]:
for i in np.arange(county_coor.shape[0]):
    dist_sum = 0
    for j in np.arange(hq_coor.shape[0]):
        dist_sum = dist_sum + euclid(county_coor.iloc[i,1], county_coor.iloc[i,2], hq_coor.iloc[i,1], hq_coor.iloc[i,2])
    county_coor.iloc[i,3] = dist_sum

    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


# sold_df
#### number of houses and median price over years

remove extra rows

rename columns

standardize counties

In [622]:
# Renaming columns

new_names=[]
for i in np.arange(sold_df.shape[1]):
    if i < 12:
        snippet = 'Houses_sold_'
    else:
        snippet = 'Median_Price_'
    
    new_names.append(snippet+str(sold_df.iloc[0,i])) 


sold_df.columns=new_names

# removing extra rows
sold_df.drop([0,1,2,3,99,100,101],axis=0, inplace=True)

# More cleaning and renaming on columns
sold_df.drop(['Houses_sold_nan'], axis=1, inplace=True)

sold_df.rename(columns={'Houses_sold_ ': 'County'}, inplace=True)

for i in np.arange(sold_df.shape[0]):
    sold_df.iloc[i,0] = unitext(sold_df.iloc[i,0])

In [634]:
sold_df.to_csv('data/home_sold.csv')

# tab12_df
### area and number of lands subdivided into price and area within counties

Tables successfully transformed. Next Steps:
remove NaN rows columns
rename columns
make sure all values are numeric

In [652]:
tab12_df = pd.read_csv('data/tabula-12.csv', header=None)

In [653]:
# Removing rows that only contain NaN
nan_rows=[]
for i in np.arange(tab12_df.shape[0]):
    check_sum = tab12_df.iloc[i].isnull().sum()
    if check_sum == 14: # All NaN
        nan_rows.append(i)
tab12_df.drop(nan_rows,axis=0, inplace=True)        

tab12_df.reset_index(inplace=True)
tab12_df.drop(['index'], axis=1, inplace=True)

In [654]:
# Standardize row headers
for i in np.arange(tab12_df.shape[0]):
    tab12_df.iloc[i,0] = unitext(tab12_df.iloc[i,0])
    

In [655]:
# Looking for the word 'item' - top left corner for transpose

corners = []
start_over = []

for i in np.arange(tab12_df.shape[0]):
    if unitext(tab12_df.iloc[i,0]) == 'item':
        corners.append(i)
        if unitext(tab12_df.iloc[i,1]) == 'tennessee': # Start_Over reveals that there are two tables in one dataframes
            start_over.append(i)
            

corners.append(tab12_df.shape[0])

# Start_Over reveals that there are two tables in one dataframes
# These two tables have to be handled separately

corners1=corners[:corners.index(start_over[1])+1]
corners2=corners[corners.index(start_over[1]):]

In [656]:
# First table is blank1_df

blank1_df = pd.DataFrame()

for i in np.arange(len(corners1)-2):
    temp = tab12_df[corners1[i] : corners1[i+1]]
    temp.reset_index(inplace=True)
    temp.drop(['index'], axis=1, inplace=True)
    temp_tran = temp.transpose()
    blank1_df = pd.concat([blank1_df,temp_tran])

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [657]:
# Second table is blank2_df

blank2_df = pd.DataFrame()

for i in np.arange(len(corners2)-2):
    temp = tab12_df[corners2[i] : corners2[i + 1]]
    temp.reset_index(inplace=True)
    temp.drop(['index'], axis=1, inplace=True)
    temp_tran = temp.transpose()
    blank2_df = pd.concat([blank2_df,temp_tran])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [658]:
# BLANK1_DF 

# Renaming the columns
blank1_df.columns = blank1_df.iloc[0]

In [659]:
blank1_df.reset_index(inplace=True)

In [660]:
blank1_df.drop(['index'], axis=1, inplace=True)

In [661]:
# Removing rows that only contain NaN
nan_rows=[]
for i in np.arange(blank1_df.shape[0]):
    if unitext(str(blank1_df.iloc[i,0])) == 'nan' or unitext(str(blank1_df.iloc[i,0])) == 'item':
        nan_rows.append(i)
blank1_df.drop(nan_rows,axis=0, inplace=True)        


In [662]:
blank1_df.drop([1], axis=0, inplace=True)

In [663]:
# Standardize row headers
for i in np.arange(blank1_df.shape[0]):
    blank1_df.iloc[i,0] = unitext(blank1_df.iloc[i,0])


In [664]:
bad_cols = blank1_df.isnull().sum()[blank1_df.isnull().sum() != 0]

In [665]:
old_names = list(blank1_df.columns)

In [666]:
head_cols = list(bad_cols.index)

In [667]:
new_names = []
for i in np.arange(17,26):
    ren_col = str(blank1_df.columns[16] + "_" + blank1_df.columns[i])
    new_names.append(ren_col)

old_names[17:26] = new_names

In [668]:
head07 = []
headac = []
for i in np.arange(len(old_names)):
    if old_names[i] == '2007':
        head07.append(i)
    elif old_names[i] == 'acres':
        headac.append(i)
    else:
        continue        

In [669]:
for i in head07:
    leftapp = old_names[i-1]
    leftapp = leftapp[:-4]
    old_names[i] = leftapp + old_names[i]   

In [670]:
for i in headac:
    leftapp = old_names[i-1]
    leftapp = leftapp[:-5]
    old_names[i] = leftapp + old_names[i]   

In [671]:
blank1_df.columns = old_names

In [672]:
new_names = []
for i in np.arange(29,53,2):
    ren_col = str(blank1_df.columns[28] + "_" + blank1_df.columns[i])
    new_names.append(ren_col)
    ren_col2 = str(blank1_df.columns[28] + "_" + blank1_df.columns[i+1])
    new_names.append(ren_col2)

old_names[29:53] = new_names

In [673]:
new_names = []
for i in np.arange(54,78,2):
    ren_col = str(blank1_df.columns[53] + "_" + blank1_df.columns[i])
    new_names.append(ren_col)
    ren_col2 = str(blank1_df.columns[53] + "_" + blank1_df.columns[i+1])
    new_names.append(ren_col2)

old_names[54:78] = new_names

# STOP

In [674]:
for i in [79,83,89]:
    old_names[i+2] = old_names[i][:-10] + old_names[i+2]
    old_names[i+3] = old_names[i+1][:-10] + old_names[i+3]
    

In [675]:
blank1_df.columns = old_names

In [676]:
old_names

['item',
 'farms_and_land_in_farms',
 'farms_number_2012',
 'farms_number_2007',
 'land_in_farms_acres_2012',
 'land_in_farms_acres_2007',
 'average_size_of_farm_acres_2012',
 'average_size_of_farm_acres_2007',
 'estimated_market_value_of_land_and_buildings_farms_2012',
 'estimated_market_value_of_land_and_buildings_farms_2007',
 '1000_2012',
 '1000_2007',
 'average_per_farm_dollars_2012',
 'average_per_farm_dollars_2007',
 'average_per_acre_dollars_2012',
 'average_per_acre_dollars_2007',
 '2012_farms_by_value_group',
 '2012_farms_by_value_group_1_to_49999',
 '2012_farms_by_value_group_50000_to_99999',
 '2012_farms_by_value_group_100000_to_199999',
 '2012_farms_by_value_group_200000_to_499999',
 '2012_farms_by_value_group_500000_to_999999',
 '2012_farms_by_value_group_1000000_to_1999999',
 '2012_farms_by_value_group_2000000_to_4999999',
 '2012_farms_by_value_group_5000000_to_9999999',
 '2012_farms_by_value_group_10000000_or_more',
 'approximate_land_area_acres_2012',
 'proportion_in_f

In [677]:
blank1_df.drop(list(bad_cols.index), axis=1, inplace=True)

In [680]:
for i in np.arange(blank1_df.shape[0]):
    for j in np.arange(1, blank1_df.shape[1]):
        blank1_df.iloc[i,j] = blank1_df.iloc[i,j].replace(',','')


In [683]:
pd.set_option("display.max_rows",100)
pd.set_option("display.max_columns",100)
blank1_df

Unnamed: 0,item,farms_number_2012,farms_number_2007,land_in_farms_acres_2012,land_in_farms_acres_2007,average_size_of_farm_acres_2012,average_size_of_farm_acres_2007,estimated_market_value_of_land_and_buildings_farms_2012,estimated_market_value_of_land_and_buildings_farms_2007,1000_2012,1000_2007,average_per_farm_dollars_2012,average_per_farm_dollars_2007,average_per_acre_dollars_2012,average_per_acre_dollars_2007,2012_farms_by_value_group_1_to_49999,2012_farms_by_value_group_50000_to_99999,2012_farms_by_value_group_100000_to_199999,2012_farms_by_value_group_200000_to_499999,2012_farms_by_value_group_500000_to_999999,2012_farms_by_value_group_1000000_to_1999999,2012_farms_by_value_group_2000000_to_4999999,2012_farms_by_value_group_5000000_to_9999999,2012_farms_by_value_group_10000000_or_more,approximate_land_area_acres_2012,proportion_in_farms_percent_2012,2012_size_of_farm_1_to_9_acres_farms,2012_size_of_farm_1_to_9_acres_acres,2012_size_of_farm_10_to_49_acres_farms,2012_size_of_farm_10_to_49_acres_acres,2012_size_of_farm_50_to_69_acres_farms,2012_size_of_farm_50_to_69_acres_acres,2012_size_of_farm_70_to_99_acres_farms,2012_size_of_farm_70_to_99_acres_acres,2012_size_of_farm_100_to_139_acres_farms,2012_size_of_farm_100_to_139_acres_acres,2012_size_of_farm_140_to_179_acres_farms,2012_size_of_farm_140_to_179_acres_acres,2012_size_of_farm_180_to_219_acres_farms,2012_size_of_farm_180_to_219_acres_acres,2012_size_of_farm_220_to_259_acres_farms,2012_size_of_farm_220_to_259_acres_acres,2012_size_of_farm_260_to_499_acres_farms,2012_size_of_farm_260_to_499_acres_acres,2012_size_of_farm_500_to_999_acres_farms,2012_size_of_farm_500_to_999_acres_acres,2012_size_of_farm_1000_to_1999_acres_farms,2012_size_of_farm_1000_to_1999_acres_acres,2012_size_of_farm_2000_acres_or_more_farms,2012_size_of_farm_2000_acres_or_more_acres,2007_size_of_farm_1_to_9_acres_farms,2007_size_of_farm_1_to_9_acres_acres,2007_size_of_farm_10_to_49_acres_farms,2007_size_of_farm_10_to_49_acres_acres,2007_size_of_farm_50_to_69_acres_farms,2007_size_of_farm_50_to_69_acres_acres,2007_size_of_farm_70_to_99_acres_farms,2007_size_of_farm_70_to_99_acres_acres,2007_size_of_farm_100_to_139_acres_farms,2007_size_of_farm_100_to_139_acres_acres,2007_size_of_farm_140_to_179_acres_farms,2007_size_of_farm_140_to_179_acres_acres,2007_size_of_farm_180_to_219_acres_farms,2007_size_of_farm_180_to_219_acres_acres,2007_size_of_farm_220_to_259_acres_farms,2007_size_of_farm_220_to_259_acres_acres,2007_size_of_farm_260_to_499_acres_farms,2007_size_of_farm_260_to_499_acres_acres,2007_size_of_farm_500_to_999_acres_farms,2007_size_of_farm_500_to_999_acres_acres,2007_size_of_farm_1000_to_1999_acres_farms,2007_size_of_farm_1000_to_1999_acres_acres,2007_size_of_farm_2000_acres_or_more_farms,2007_size_of_farm_2000_acres_or_more_acres,total_cropland_farms_2012,total_cropland_farms_2007,total_cropland_acres_2012,total_cropland_acres_2007,harvested_cropland_farms_2012,harvested_cropland_farms_2007,harvested_cropland_acres_2012,harvested_cropland_acres_2007,improvements_see_text_farms_2012,improvements_see_text_farms_2007,improvements_see_text_acres_2012,improvements_see_text_acres_2007
2,anderson,441,538,35845,40135,81,75,441,538,209899,214397,475960,398508,5856,5342,21,28,101,181,63,32,13.0,2.0,,215784,16.6,35,,196,5676.0,62,3598,49,4082,34,3993,25,3765.0,7,1398.0,10.0,2333.0,17,5889.0,5.0,3426.0,1.0,,,,61,,234,6272.0,74,4290,43,3550,54,6549.0,24,3787.0,10.0,1983.0,8.0,1853.0,28.0,9902.0,2,,,,,,308,419,10163,17025,293,350,8933,9295,23.0,143,771.0,6703.0
3,bedford,1411,1554,232381,231206,165,149,1411,1554,887879,855072,629255,550239,3821,3698,57,68,241,580,234,144,75.0,10.0,2.0,303128,76.7,53,278.0,458,12417.0,157,9092,153,12877,154,17752,89,14038.0,86,17092.0,45.0,10564.0,109,37054.0,72.0,46209.0,29.0,37713.0,6.0,17295.0,109,608.0,566,15004.0,160,9320,146,12197,126,14880.0,85,13539.0,68.0,13406.0,66.0,15768.0,138.0,48458.0,55,36057.0,29.0,37575.0,6.0,14394.0,1028,1144,88563,106886,951,944,70961,63198,118.0,406,12989.0,34975.0
4,benton,463,500,87902,72522,190,145,463,500,197008,162086,425503,324173,2241,2235,27,65,88,174,75,22,8.0,4.0,,252255,34.8,11,57.0,104,2830.0,57,3333,58,4801,51,5987,41,6557.0,31,6121.0,13.0,3066.0,66,22661.0,22.0,15846.0,4.0,4259.0,5.0,12384.0,27,128.0,134,3594.0,64,3753,53,4311,75,8703.0,30,4627.0,23.0,4574.0,20.0,,56.0,20494.0,11,7492.0,6.0,7507.0,1.0,,359,394,38546,37596,254,259,25166,16651,57.0,125,5132.0,11337.0
5,bledsoe,579,580,102255,92043,177,159,579,580,358502,293147,619174,505425,3506,3185,36,43,108,223,89,51,22.0,3.0,4.0,260112,39.3,21,95.0,144,4071.0,71,4004,63,5176,83,9545,55,8553.0,31,6184.0,26.0,6275.0,54,19053.0,15.0,10126.0,10.0,12258.0,6.0,16915.0,23,135.0,145,4256.0,41,2357,75,6312,88,10224.0,59,9395.0,28.0,5571.0,32.0,7663.0,61.0,21518.0,21,13589.0,5.0,,2.0,,461,494,42551,43041,409,414,32935,24986,65.0,166,7418.0,14606.0
6,blount,980,1154,100717,98403,103,85,980,1154,686026,547117,700027,474105,6811,5560,45,31,111,376,271,94,33.0,16.0,3.0,357569,28.2,73,,437,11618.0,92,5328,107,8765,90,10370,44,6848.0,30,5951.0,24.0,5548.0,57,20334.0,17.0,11619.0,7.0,9225.0,2.0,,101,549.0,528,13548.0,138,8123,117,9592,85,9866.0,63,9881.0,23.0,4565.0,32.0,7578.0,45.0,15643.0,15,9635.0,7.0,9423.0,,,735,904,42753,51547,691,759,34544,33535,66.0,319,5618.0,14705.0
7,bradley,807,959,86585,95602,107,100,807,959,470919,459245,583543,478879,5439,4804,26,63,119,339,139,91,26.0,1.0,3.0,210408,41.2,42,214.0,367,9874.0,81,4756,70,5659,77,8832,45,6814.0,36,7120.0,20.0,4697.0,35,12442.0,26.0,15944.0,8.0,10233.0,,,86,,444,11234.0,127,7268,71,5759,64,7407.0,39,6061.0,19.0,3613.0,17.0,4059.0,61.0,20662.0,21,13542.0,8.0,10105.0,2.0,,531,686,28570,40542,494,536,24121,24925,48.0,259,1931.0,12699.0
8,campbell,370,404,33487,34174,91,85,370,404,125121,118697,338164,293806,3736,3473,29,49,78,144,50,14,6.0,,,307316,10.9,15,68.0,140,3998.0,49,2915,51,4192,47,5394,24,3655.0,22,4397.0,5.0,1195.0,12,4007.0,5.0,3666.0,,,,,18,73.0,159,4284.0,70,4137,51,4173,40,4664.0,30,4596.0,11.0,2134.0,3.0,717.0,13.0,4337.0,9,5059.0,,,,,306,322,11486,14872,282,272,9536,8831,28.0,117,1057.0,4764.0
15,cannon,717,880,96262,116720,134,133,717,880,298089,377306,415744,428757,3097,3233,42,73,161,307,87,27,16.0,3.0,1.0,170006,56.6,29,123.0,234,6078.0,75,4303,86,7259,86,9812,54,8371.0,43,8496.0,20.0,4670.0,64,22007.0,20.0,13764.0,3.0,4445.0,3.0,6934.0,60,,269,7003.0,86,4940,101,8391,107,12490.0,61,9436.0,47.0,9444.0,42.0,9837.0,68.0,24646.0,32,19191.0,6.0,8532.0,1.0,,495,601,38470,48624,447,482,33938,31440,54.0,202,2867.0,14587.0
16,carroll,732,971,177931,179703,243,185,732,971,435896,456647,595487,470286,2450,2541,50,101,248,232,41,20,17.0,12.0,11.0,383519,46.4,16,110.0,213,6475.0,87,4991,83,6873,92,10619,66,10447.0,41,8033.0,35.0,8176.0,41,13391.0,26.0,16191.0,14.0,17673.0,18.0,74952.0,36,170.0,344,9920.0,101,5809,122,9974,93,10966.0,62,9690.0,59.0,11707.0,26.0,6149.0,70.0,23328.0,30,19157.0,11.0,15183.0,17.0,57650.0,572,796,121429,121978,403,520,104241,91842,67.0,233,4118.0,12413.0
17,carter,493,516,40266,39374,82,76,493,516,210899,159116,427786,308364,5238,4041,27,43,106,187,88,32,8.0,2.0,,218370,18.4,40,205.0,215,5633.0,64,3673,41,3449,52,6168,31,4835.0,17,3423.0,9.0,2143.0,16,5308.0,8.0,5429.0,,,,,46,,259,6986.0,57,3276,52,4321,28,3265.0,29,4584.0,15.0,2997.0,8.0,1888.0,12.0,4494.0,9,6150.0,1.0,,,,356,401,12381,18252,343,347,10408,11283,32.0,133,984.0,5556.0


In [682]:
for i in np.arange(1, blank1_df.shape[1]):
    temp_col = list(blank1_df[blank1_df.columns[i]])
    blank1_df[blank1_df.columns[i]] = pd.to_numeric(temp_col, errors='coerece')
    
    

In [56]:
# BLANK2_DF 

# Renaming the columns
blank2_df.columns = blank2_df.iloc[0]

In [57]:
blank2_df.reset_index(inplace=True)

In [58]:
blank2_df.drop(['index'], axis=1, inplace=True)

In [59]:
# Removing rows that only contain NaN
nan_rows=[]
for i in np.arange(blank2_df.shape[0]):
    if unitext(str(blank2_df.iloc[i,0])) == 'nan' or unitext(str(blank2_df.iloc[i,0])) == 'item':
        nan_rows.append(i)
blank2_df.drop(nan_rows,axis=0, inplace=True)        


In [60]:
blank2_df.drop([1], axis=0, inplace=True)

In [61]:
# Standardize row headers
for i in np.arange(blank2_df.shape[0]):
    blank2_df.iloc[i,0] = unitext(blank2_df.iloc[i,0])


In [62]:
bad_cols = blank2_df.isnull().sum()[blank2_df.isnull().sum() != 0]

In [63]:
old_names = list(blank2_df.columns)

In [64]:
head07 = []
headac = []
for i in np.arange(len(old_names)):
    if old_names[i] == '2007':
        head07.append(i)
    elif old_names[i] == 'acres_2012':
        headac.append(i)
    else:
        continue        

In [65]:
for i in headac:
    leftapp = old_names[i-2]
    leftapp = leftapp[:-10]
    old_names[i] = leftapp + old_names[i]   

In [66]:
for i in head07:
    leftapp = old_names[i-1]
    leftapp = leftapp[:-4]
    old_names[i] = leftapp + old_names[i]   

In [67]:
blank2_df.columns = old_names

In [68]:
blank2_df.drop(list(bad_cols.index), axis=1, inplace=True)

In [69]:
blank2_df

Unnamed: 0,item,other_cropland_farms_2012,other_cropland_farms_2007,other_cropland_acres_2012,other_cropland_acres_2007,not_pastured_or_grazed_farms_2012,not_pastured_or_grazed_farms_2007,not_pastured_or_grazed_acres_2012,not_pastured_or_grazed_acres_2007,cropland_on_which_all_crops_failed_farms_2012,cropland_on_which_all_crops_failed_farms_2007,cropland_on_which_all_crops_failed_acres_2012,cropland_on_which_all_crops_failed_acres_2007,cropland_in_cultivated_summer_fallow_farms_2012,cropland_in_cultivated_summer_fallow_farms_2007,cropland_in_cultivated_summer_fallow_acres_2012,cropland_in_cultivated_summer_fallow_acres_2007,total_woodland_farms_2012,total_woodland_farms_2007,total_woodland_acres_2012,total_woodland_acres_2007,woodland_pastured_farms_2012,woodland_pastured_farms_2007,woodland_pastured_acres_2012,woodland_pastured_acres_2007,woodland_not_pastured_farms_2012,woodland_not_pastured_farms_2007,woodland_not_pastured_acres_2012,woodland_not_pastured_acres_2007,cropland_and_woodland_pastured_see_text_farms_2012,cropland_and_woodland_pastured_see_text_farms_2007,cropland_and_woodland_pastured_see_text_acres_2012,cropland_and_woodland_pastured_see_text_acres_2007,facilities_ponds_roads_wasteland_etc_farms_2012,facilities_ponds_roads_wasteland_etc_farms_2007,facilities_ponds_roads_wasteland_etc_acres_2012,facilities_ponds_roads_wasteland_etc_acres_2007,pastureland_all_types_farms_2012,pastureland_all_types_farms_2007,pastureland_all_types_acres_2012,pastureland_all_types_acres_2007,reserve_enhancement_programs_farms_2012,reserve_enhancement_programs_farms_2007,reserve_enhancement_programs_acres_2012,reserve_enhancement_programs_acres_2007,land_enrolled_in_crop_insurance_programs_farms_2012,land_enrolled_in_crop_insurance_programs_farms_2007,land_enrolled_in_crop_insurance_programs_acres_2012,land_enrolled_in_crop_insurance_programs_acres_2007
2,anderson,35,64,459,1027,26,53,368,795,10,18,88,232,3,-,3,-,314,326,10658,10799,153,161,3636,2891,216,241,7022,7908,347,327,13512,10515,275,251,1512,1796,365,434,17919,20109,2,1,(D),(D),2,-,(D),-
3,bedford,125,169,4613,8713,110,106,3324,4150,21,74,1289,4563,-,-,-,-,852,836,41335,36152,576,586,26459,20856,397,391,14876,15296,1086,999,95015,80446,921,765,7468,7722,1178,1302,134463,136277,21,29,655,1541,50,32,18499,9883
4,benton,137,165,8248,9608,127,142,7381,8738,12,27,816,870,3,-,51,-,334,298,23867,17270,148,119,4487,3164,256,238,19380,14106,279,260,22644,14737,290,209,2845,2919,321,350,32263,29238,96,124,5763,7216,20,4,12205,2855
5,bledsoe,72,82,2198,3449,65,63,1825,2693,14,26,373,756,-,-,-,-,345,319,20898,17045,178,205,4637,5456,244,189,16261,11589,436,368,36056,28944,350,257,2750,3013,477,485,48111,49006,6,20,217,453,15,6,2419,1105
7,blount,102,136,2591,3307,84,89,1786,2074,22,51,(D),1227,2,4,(D),6,567,596,18733,15338,328,380,6504,6929,331,341,12229,8409,720,767,35526,27704,650,512,3705,3814,769,979,47648,49338,8,-,328,-,16,10,4492,2577
8,bradley,87,100,2518,2918,73,75,2215,2283,17,33,(D),635,1,-,(D),-,495,455,18490,19111,230,246,5732,7528,330,287,12758,11583,613,610,35805,32462,510,435,3720,3487,637,776,43468,52689,11,9,495,292,16,6,2451,1145
9,campbell,58,44,893,1277,51,26,726,410,10,24,(D),867,2,-,(D),-,236,254,6973,6483,151,169,2274,2568,156,153,4699,3915,294,289,13992,12095,220,137,1036,724,310,352,17323,19427,1,1,(D),(D),4,1,478,(D)
15,cannon,63,81,1665,2597,50,59,1381,1655,17,26,284,942,-,-,-,-,473,531,28266,30212,272,344,13616,19024,280,271,14650,11188,487,604,26589,34738,420,346,2937,3146,547,749,43072,68349,5,11,137,513,26,29,14902,11683
17,carroll,240,286,13070,17723,229,262,12633,14144,21,42,(D),(D),1,1,(D),(D),493,534,33288,32398,188,211,5822,6152,382,419,27466,26246,380,413,18122,17142,478,498,5092,8185,432,600,28062,35707,206,314,11211,17143,52,70,67428,56907
18,carter,55,74,989,1413,46,57,809,848,14,30,180,565,-,-,-,-,337,327,10857,9312,220,212,5023,4836,187,174,5834,4476,393,332,15848,10858,259,180,1180,952,411,419,21855,21250,1,-,(D),-,12,9,831,302
