In [1]:
import pandas as pd
import numpy as np
import geocoder
import requests
import time
from geopy.geocoders import Nominatim
import json 
from sqlalchemy.sql import text
import boto3

from File_Utilities import FileTools
import DB_Utilities
# Shared instance of the project's DBTools class (from DB_Utilities); in the cells
# shown here it is only referenced by commented-out truncate_and_insert_df calls.
DBTools = DB_Utilities.DBTools()  # instantiate the class
# Class-level setting on FileTools — presumably the base folder that
# get_full_file_path() and the save_df_* helpers resolve file names against (TODO confirm).
FileTools.MYDIR = "./data/"

##

### Working on Economic Indicators

* Input file: `CAINC4__ALL_AREAS_1969_2020.csv`
* Source link: https://virginia.box.com/s/v70g9niz9att0381tapsnmtffksq7ahw

In [None]:
# Load the raw CAINC4 extract. The encoding is passed explicitly because the
# file has encoding issues when read with the default.
cainc4_csv_path = FileTools.get_full_file_path('CAINC4__ALL_AREAS_1969_2020.csv')
cainc4_raw_df = pd.read_csv(cainc4_csv_path, encoding="ISO-8859-1", low_memory=False)

# Tidy the description text: trim surrounding whitespace and remove "/" characters.
tidy_description = cainc4_raw_df['Description'].str.strip().str.replace("/","")

# 'Employer contributions for government social insurance' appears twice under
# different LineCodes. The other descriptions sit in the 30s and up, so the
# LineCode 62 rows are the ones dropped.
keep_row = cainc4_raw_df['LineCode'] != 62

_CAINC4__ALL_AREAS_1969_2020_df = cainc4_raw_df.assign(Description=tidy_description)[keep_row]
# Optional DB load (disabled):
# DBTools.truncate_and_insert_df(_CAINC4__ALL_AREAS_1969_2020_df, "CAINC4__ALL_AREAS_1969_2020_df")

### Pivot the table so that each row is one area (`GeoFIPS`) for one year and each `Description` becomes a column

In [None]:
# Build one wide table per year (rows = GeoFIPS, columns = Description values,
# cells = summed value for that year, 0 where a combination is missing), tag
# each with its year, and stack them into a single frame.
#
# The per-year frames are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies the accumulated frame on every pass.
yearly_pivots = []
for i in range(1969, 2021):
    year = str(i)  # year columns in the source frame are labelled with strings
    print(f'Working on year : {year}')

    year_slice = _CAINC4__ALL_AREAS_1969_2020_df[[year, 'GeoFIPS', 'Description']]
    year_pivot = pd.pivot_table(
        year_slice,
        values=year,
        index=['GeoFIPS'],
        columns=['Description'],
        aggfunc="sum",  # string alias; passing np.sum is deprecated in recent pandas
        fill_value=0,
    )

    year_pivot['year'] = year
    # Turn GeoFIPS back into a regular column so it survives the stacking below.
    yearly_pivots.append(year_pivot.reset_index())

# The row index is intentionally left untouched (it restarts at 0 for every
# year), so the saved outputs keep the same layout as before.
all_data_ei = pd.concat(yearly_pivots)

# Strip stray double-quote characters from the FIPS codes.
all_data_ei.GeoFIPS = all_data_ei.GeoFIPS.str.replace('"','')

FileTools.save_df_as_parquet(all_data_ei, 'CAINC4_ALL_AREAS_1969_2020_PIVOT.gzip')
FileTools.save_df_as_csv(all_data_ei, 'CAINC4_ALL_AREAS_1969_2020_PIVOT.csv')
# Optional DB load (disabled):
# DBTools.truncate_and_insert_df(all_data_ei, "CAINC4__ALL_AREAS_1969_2020_PIVOT")

In [None]:
# Preview the first rows of the stacked per-year pivot table.
all_data_ei.head()

In [None]:
# Round-trip check: read the pivot CSV back from disk and preview it.
# (Alternative loader, disabled: FileTools.load_df_from_csv('CAINC4_ALL_AREAS_1969_2020_PIVOT.csv'))
#
# GeoFIPS is read as text. With default type inference pandas would parse
# all-digit codes as integers and drop their leading zeros.
# NOTE(review): encoding is kept as ISO-8859-1 — confirm it matches what
# FileTools.save_df_as_csv writes.
test = pd.read_csv(
    FileTools.get_full_file_path('CAINC4_ALL_AREAS_1969_2020_PIVOT.csv'),
    low_memory=False,
    encoding="ISO-8859-1",
    dtype={'GeoFIPS': str},
)
test.head()