In [1]:
# Names listed for export; each one is assigned in a later cell of this notebook
# (data_t when the CSVs are read, st_df / ind_df / gdp_df when the tables are built).
__all__ = ['data_t','st_df','ind_df','gdp_df']

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas import read_csv
from pathlib import Path

In [3]:
# NOTE: despite its name, `cwd` is the PARENT of the current working directory;
# the data folder is resolved relative to that parent.
cwd = Path.cwd().parent

data_folder = cwd / 'data'
n1 = 'SAGDP2S__ALL_AREAS_1963_1997.csv'
n2 = 'SAGDP2N__ALL_AREAS_1997_2017.csv'
n3 = 'Tornados.csv'

sagdp2s = data_folder / n1
sagdp2n = data_folder / n2
tornados = data_folder / n3

# Fail fast before any CSV is read, and name every input that is missing
# instead of raising a bare AssertionError for the whole group.
missing_inputs = [str(p) for p in (sagdp2n, sagdp2s, tornados) if not p.exists()]
assert not missing_inputs, f"Missing data file(s): {missing_inputs}"

In [4]:
# Read the three input CSVs in one pass: the two SAGDP2 tables, then the tornado table.
data_63, data_97, data_t = (
    read_csv(csv_path) for csv_path in (sagdp2s, sagdp2n, tornados)
)

In [5]:
# --------------- State #
# Two-letter code -> display name. Insertion order here is the row order of st_df.
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming',
}

# Column-oriented view of the mapping: codes in one list, names in the other,
# both in the dict's insertion order so they stay aligned row by row.
states_data = {
    'StateID': list(states.keys()),
    'StateName': list(states.values()),
}

st_df = pd.DataFrame.from_dict(states_data)

In [6]:
# ------------------ Industry #
# Take each distinct (IndustryId, Description) pair straight from the rows of
# data_97, so an ID is always paired with the description that sits beside it
# (rather than pairing two independently de-duplicated columns by position).
industry_pairs = data_97[['IndustryId', 'Description']].drop_duplicates()

# The pairs must form a one-to-one mapping. Comparing only the two unique
# counts would let a many-to-many mix-up with equal counts slip through.
assert industry_pairs['IndustryId'].is_unique, "an IndustryId maps to several descriptions"
assert industry_pairs['Description'].is_unique, "a Description is shared by several IndustryIds"

industry_data = {
    'IndustryID': industry_pairs['IndustryId'].tolist(),
    'IndustryName': industry_pairs['Description'].tolist(),
}

ind_df = pd.DataFrame.from_dict(industry_data)

In [7]:
# ------------------ Gdp #
# Column-oriented accumulator for the GDP table; every column starts as its own empty list.
gdp_data = {column: [] for column in ('GdpID', 'GDP', 'Yr', 'StateID', 'IndustryID')}

# Year labels for each source table, plus the positional index of the first
# year column in that table's rows.
# NOTE(review): range() excludes its end, so 1997 is absent from the first list
# and 2017 from the second, while the file names read ..._1963_1997 and
# ..._1997_2017 -- confirm the last year column of each file is meant to be dropped.
sagdp2s_years = [*range(1963, 1997)]
sagdp2s_years_starting_index = 9
sagdp2n_years = [*range(1997, 2017)]
sagdp2n_years_starting_index = 8

# Every state name from st_df, in table order; used to select rows by GeoName.
states_list = st_df['StateName'].tolist()

def populate_gdp_data(gdp_dict, dataset, states_list, years, first_year_index_in_header, iId=1, debug=False):
    """Append one GDP record per (state, year) from ``dataset`` to ``gdp_dict``.

    For each name in ``states_list`` the single row of ``dataset`` whose
    ``GeoName`` equals that name and whose ``IndustryId`` equals ``iId`` is
    located; names with no such row are skipped. The row's values from
    position ``first_year_index_in_header`` onward are paired with ``years``
    (pairing stops at the shorter of the two) and appended to the lists in
    ``gdp_dict``.

    Parameters
    ----------
    gdp_dict : dict of lists
        Accumulator with keys 'GdpID', 'GDP', 'Yr', 'StateID', 'IndustryID'.
        Mutated in place.
    dataset : pandas.DataFrame
        Must have 'GeoName' and 'IndustryId' columns.
    states_list : iterable of str
        State names to look up in ``dataset['GeoName']``.
    years : iterable
        Year labels, in the same order as the row's year values.
    first_year_index_in_header : int
        Positional index of the first year value within a row.
    iId : optional
        Industry id to select (default 1).
    debug : bool, optional
        When True, the function only prints what it finds and appends
        nothing (dry run).

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If more than one row matches a (state, industry) pair.

    Notes
    -----
    State codes are looked up in the module-level ``st_df`` frame
    ('StateName' -> 'StateID'), so that frame must exist before calling.
    Each value is passed through ``float``; presumably every selected cell is
    numeric -- a non-numeric cell would make ``float`` raise.
    """
    # Continue numbering after the highest ID already stored. Starting from
    # len(...) instead would re-issue the last ID on every call after the first.
    next_id = max(gdp_dict['GdpID'], default=0) + 1

    for state in states_list:
        matches = dataset[(dataset['GeoName'] == state) & (dataset['IndustryId'] == iId)].values.tolist()

        if debug:
            print(state)
            print(matches)

        if not matches:
            # This state does not appear in the dataset for this industry.
            continue

        assert len(matches) == 1
        row = matches[0]

        sId = st_df[st_df['StateName'] == state].StateID.values.tolist()[0]
        yearly_values = row[first_year_index_in_header:]

        if debug:
            print(sId)
            # Report the width of the slice that is actually consumed below.
            print(len(yearly_values))
            for year, value in zip(years, yearly_values):
                print(f"YEAR: {year}, VALUE: {value}")
            # Dry run: nothing is appended in debug mode.
            continue

        for year, value in zip(years, yearly_values):
            gdp_dict['GdpID'].append(next_id)
            gdp_dict['GDP'].append(float(value))
            gdp_dict['Yr'].append(year)
            gdp_dict['StateID'].append(sId)
            gdp_dict['IndustryID'].append(iId)
            next_id += 1

# Fill gdp_data from data_63 first, then from data_97, each with its own
# year labels and first-year column position.
for source_table, source_years, first_year_col in (
    (data_63, sagdp2s_years, sagdp2s_years_starting_index),
    (data_97, sagdp2n_years, sagdp2n_years_starting_index),
):
    populate_gdp_data(gdp_data, source_table, states_list, source_years, first_year_col)

gdp_df = pd.DataFrame.from_dict(gdp_data)

# Sanity check: within year 2016 no StateID appears more than once.
state_ids_2016 = gdp_df[gdp_df['Yr'] == 2016].StateID
assert len(state_ids_2016) == len(state_ids_2016.unique())