In [1]:
# basic libraries
import os
import random

import numpy as np
import pandas as pd

# sklearn functions
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import ElasticNetCV, LinearRegression

In [2]:
# read in the data
# The CSV location can be overridden by setting the STATE_GDP_CSV environment
# variable before starting the kernel; when it is unset, the original
# hard-coded local path is used, so existing setups keep working.
states = pd.read_csv(
    os.environ.get(
        "STATE_GDP_CSV",
        "C:/Users/MTV5033/Desktop/python_workspace/state_gdp.csv"
    )
)

# industry line codes of interest: 3 through 12 and 34 through 83, inclusive
# (a list, in ascending order)
codes = [*range(3, 13), *range(34, 84)]

# reshape the raw table into long (panel) form in one pass:
#   1. drop the two descriptive text columns
#   2. stack every remaining non-id column into ('quarter', 'gdp') pairs
#   3. shorten the identifier column names
states = (
    states
    .drop(columns = ['GeoName', 'Description'])
    .melt(
        id_vars = ['GeoFips', 'LineCode'],
        var_name = 'quarter',
        value_name = 'gdp'
    )
    .rename(columns = {'GeoFips':'geo', 'LineCode':'industry'})
)

# keep only the rows whose industry code appears in `codes`
states = states.loc[states['industry'].isin(codes)]

# cast the industry code to integer (gdp is left as read)
states['industry'] = states['industry'].astype('int')

# swap numeric industry codes for short names; any code without an entry in
# this table becomes NaN (that is how Series.map treats unmapped values)
states['industry'] = states['industry'].map({
    3:  'agriculture',
    6:  'mining',
    10: 'utilities',
    11: 'construction',
    12: 'manufacturing',
    34: 'wholesale',
    35: 'retail',
    36: 'transport',
    45: 'information',
    51: 'finance',
    56: 'realestate',
    60: 'profscitech',
    64: 'management',
    65: 'adminwaste',
    69: 'education',
    70: 'healthcare',
    76: 'entertainment',
    79: 'accomodation',
    82: 'other',
    83: 'government',
})

# missing gdp observations are set to zero
states['gdp'] = states['gdp'].fillna(0)

# order rows by geography and then by quarter label, renumbering the index
states = (
    states
    .sort_values(by = ['geo', 'quarter'])
    .reset_index(drop = True)
)

# turn gdp levels into shares: each row is divided by the summed gdp of its
# (geography, quarter) cell, so the rows of one cell add up to 1
# (a cell whose gdp sums to zero yields NaN from the 0 / 0 division)
states['gdp'] = (
    states['gdp']
    / states.groupby(['geo', 'quarter'])['gdp'].transform('sum')
)

# fix time columns
# Both patterns are regular expressions, so regex = True is passed explicitly.
# Older pandas releases treated the pattern as a regex by default (with a
# FutureWarning); from pandas 2.0 the default is a literal match, which would
# leave the labels unchanged and make the integer cast raise.
#   year    : the first run of four digits; everything after it is discarded
#   quarter : the digit following 'Q'; everything before it is discarded
# NOTE(review): assumes the melted labels look like '2005:Q1' -- confirm
# against the CSV header.
# 'year' has to be computed first because the second statement overwrites
# the 'quarter' labels it is parsed from.
states['year'] = states.quarter.str.replace(r"([0-9]{4}).*", "\\1", regex = True).astype('int')
states['quarter'] = states.quarter.str.replace(r".*Q([0-9])", "\\1", regex = True).astype('int')

# go from long to wide: select the key columns plus industry and gdp, spread
# the industries into one column each (filled with gdp), and turn the
# (geo, year, quarter) index back into ordinary columns
states = (
    states[['geo', 'year', 'quarter', 'industry', 'gdp']]
    .pivot(
        index = ['geo', 'year', 'quarter'],
        columns = 'industry',
        values = 'gdp'
    )
    .reset_index()
)

# the pivot labels the column axis 'industry'; clear that label
states.columns.name = None

  states['year'] = states.quarter.str.replace(r"([0-9]{4}).*", "\\1").astype('int')
  states['quarter'] = states.quarter.str.replace(r".*Q([0-9])", "\\1").astype('int')


In [36]:
# model and pipeline assignment
# l1_ratio = 1 makes the elastic-net penalty pure L1 (i.e. a lasso), with the
# regularisation strength chosen by ElasticNetCV's built-in cross-validation
model = ElasticNetCV(l1_ratio = 1)

# Columns of X: every column except the one passed as y ('government') and
# the one passed as d ('retail'), kept in the frame's own column order.
# Building this list from a set difference would give an arbitrary order that
# can change between kernel sessions, because string hashing is randomised
# per process, so the design matrix would not be reproducible.
# NOTE(review): this also keeps 'geo', 'year' and 'quarter' in X as plain
# numeric columns -- confirm that is intended.
control_cols = [
    col for col in states.columns
    if col not in ('government', 'retail')
]

# NOTE(review): `dml` is not defined in the cells visible here; it must be
# defined by an earlier-executed cell before this one runs.
check = dml(
    X = np.array(states[control_cols]),
    y = np.array(states['government']).reshape(-1, 1),
    d = np.array(states['retail']).reshape(-1, 1),
    ymod = model,
    splits = 10,
)

# presumably each 'coef_se' entry is a (coefficient, standard error) pair --
# verify against the definition of `dml`
print(check['dml1']['coef_se'])
print(check['dml2']['coef_se'])

[-0.88315091  0.12270727]
[-0.88559182  0.05820758]
