# Data Preparation

In [47]:
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

# Folder "project": every relative path used below is resolved against it.
# Set the PROJECT_DIR environment variable to run the notebook from another
# location; when it is unset, the original location is used.
path = os.environ.get(
    "PROJECT_DIR",
    "C:\\Users\\pedro\\OneDrive\\NYU\\CSS\\II. Data Skills\\project",
)
os.chdir(path)

## Importing data

In [48]:
def _data_path(*parts):
    """Return the path of a file below the project's ``data`` folder.

    The path is assembled with ``os.path.join`` so the separator matches the
    host operating system instead of being hard-coded.
    """
    return os.path.join("data", *parts)


# inflation
cpi = pd.read_parquet(_data_path("inflation", "data_bls_cpi.parquet"))
gas = pd.read_parquet(_data_path("inflation", "data_fred_gasprice.parquet"))

# Manheim arrives as CSV; reshape it into the (series_id, date, value) layout.
manheim = pd.read_csv(_data_path("inflation", "manheim.csv"))
# Prefix a day so each label parses as a full date.
# NOTE(review): the label format is not visible here; consider passing an
# explicit format= to pd.to_datetime once it is confirmed.
manheim["date"] = pd.to_datetime("01-" + manheim["date"])
manheim = manheim.rename(columns={"manheim": "value"})
manheim["series_id"] = "Manheim"
manheim = manheim[["series_id", "date", "value"]]

cs = pd.read_parquet(_data_path("housing", "data_fred_caseshiller.parquet"))

# employment
ces = pd.read_parquet(_data_path("employment", "data_bls_ces.parquet"))
cps = pd.read_parquet(_data_path("employment", "data_bls_cps.parquet"))

wage = pd.read_parquet(_data_path("employment", "data_atlanta_fed_wage_tracker.parquet"))

# activity
pce = pd.read_parquet(_data_path("activity", "data_bea_pce.parquet"))
income = pd.read_parquet(_data_path("activity", "data_bea_income.parquet"))

# business surveys
# ism data:
ism = pd.read_parquet(_data_path("business_surveys", "data_ism.parquet"))

# NFIB (small companies survey)
nfib = pd.read_parquet(_data_path("business_surveys", "data_nfib.parquet"))

# Regional feds PMIs:
philly = pd.read_parquet(_data_path("business_surveys", "data_philly_fed_mpmi.parquet"))
empire = pd.read_parquet(_data_path("business_surveys", "data_ny_fed_mpmi.parquet"))
richmond = pd.read_parquet(_data_path("business_surveys", "data_richmond_fed_mpmi.parquet"))

# Consumer Confidence:
michigan = pd.read_parquet(
    _data_path("consumer_confidence", "data_michigan_consumer_confidence.parquet")
)
conf_board = pd.read_parquet(_data_path("consumer_confidence", "data_conf_board.parquet"))

# financial
ffunds = pd.read_parquet(_data_path("monetary_market", "data_fred_fed_funds.parquet"))
breakeven = pd.read_parquet(_data_path("monetary_market", "data_fred_breakeven_rates.parquet"))

In [49]:
# Total CPI: keep only series CUSR0000SA0 and relabel it "CPI".
is_total_cpi = cpi["series_id"].eq("CUSR0000SA0")
total_cpi = cpi.loc[is_total_cpi].replace("CUSR0000SA0", "CPI")

# 100 * first difference of the natural log. Rows with missing values are
# dropped, which includes the first row (it has no predecessor to diff against).
total_cpi = total_cpi.assign(
    value=np.log(total_cpi["value"]).diff().mul(100)
).dropna()
total_cpi.head()

Unnamed: 0,series_id,date,value
1,CPI,1947-02-01,0.649654
2,CPI,1947-03-01,1.742364
3,CPI,1947-04-01,0.0
4,CPI,1947-05-01,-0.227531
5,CPI,1947-06-01,0.590508


In [50]:
# Keep the GASREGW series, average it within each month (month-start labels),
# and put it back into the (series_id, date, value) layout.
gas = (
    gas.loc[gas["series_id"].eq("GASREGW"), ["date", "value"]]
    .set_index("date")
    .resample("MS")
    .mean()
    .reset_index()
    .assign(series_id="Gas")
    .loc[:, ["series_id", "date", "value"]]
)

# 100 * first difference of the natural log; rows with missing values are dropped.
gas = gas.assign(value=np.log(gas["value"]).diff().mul(100)).dropna()

gas.head()

Unnamed: 0,series_id,date,value
1,Gas,1990-09-01,3.231299
2,Gas,1990-10-01,5.970771
3,Gas,1990-11-01,-0.857341
4,Gas,1990-12-01,1.275815
5,Gas,1991-01-01,-12.790117


In [51]:
# 100 * first difference of the natural log of the Manheim values;
# rows with missing values (including the first) are dropped.
manheim = manheim.assign(
    value=np.log(manheim["value"]).diff().mul(100)
).dropna()
manheim.tail()

Unnamed: 0,series_id,date,value
330,Manheim,2022-07-01,-0.136519
331,Manheim,2022-08-01,-3.711001
332,Manheim,2022-09-01,-3.412972
333,Manheim,2022-10-01,-2.225061
334,Manheim,2022-11-01,-0.300451


In [53]:
# Preview the last rows of the Case-Shiller frame as loaded.
cs.tail()

Unnamed: 0,series_id,date,value,series_name,unit,seasonal
568,CSUSHPISA,2022-05-01,305.312,CS Home Price Index,Index,S
569,CSUSHPISA,2022-06-01,306.042,CS Home Price Index,Index,S
570,CSUSHPISA,2022-07-01,304.364,CS Home Price Index,Index,S
571,CSUSHPISA,2022-08-01,301.601,CS Home Price Index,Index,S
572,CSUSHPISA,2022-09-01,299.324,CS Home Price Index,Index,S


In [54]:
# Keep the CSUSHPISA series, relabel it, and trim to the three shared columns.
cs = (
    cs.loc[cs["series_id"].eq("CSUSHPISA")]
    .replace("CSUSHPISA", "Case Shiller")
    .loc[:, ["series_id", "date", "value"]]
)

# 100 * first difference of the natural log; rows with missing values are dropped.
cs = cs.assign(value=np.log(cs["value"]).diff().mul(100)).dropna()
cs.head()

Unnamed: 0,series_id,date,value
145,Case Shiller,1987-02-01,0.715029
146,Case Shiller,1987-03-01,0.48313
147,Case Shiller,1987-04-01,0.611397
148,Case Shiller,1987-05-01,0.661078
149,Case Shiller,1987-06-01,0.771829


In [55]:
# Payroll: series CES0000000001, converted to 100 * log first difference.
payroll = ces.loc[ces["series_id"].eq("CES0000000001")].replace(
    "CES0000000001", "Payroll"
)
payroll = payroll.assign(
    value=np.log(payroll["value"]).diff().mul(100)
).dropna()

# Unemployment rate: series LNS14000000, relabelled and otherwise left as is.
urate = cps.loc[cps["series_id"].eq("LNS14000000")].replace(
    "LNS14000000", "Unemployment Rate"
)

# Wage tracker: keep three of the available series, values untouched.
wage_series = ["Unweighted Overall", "Job Stayer", "Job Switcher"]
wage_atlanta = wage.loc[wage["series_id"].isin(wage_series)]

In [56]:
# Preview the last rows of the selected wage tracker series.
wage_atlanta.tail()

Unnamed: 0,series_id,date,value
2767,Job Switcher,2022-06-01,7.9
2768,Job Switcher,2022-07-01,8.5
2769,Job Switcher,2022-08-01,8.4
2770,Job Switcher,2022-09-01,7.9
2771,Job Switcher,2022-10-01,7.6


In [57]:
# Real PCE: series T20806_DPCERX, trimmed to the three shared columns.
pce = (
    pce.loc[pce["series_id"].eq("T20806_DPCERX")]
    .replace("T20806_DPCERX", "Real PCE")
    .loc[:, ["series_id", "date", "value"]]
)
# 100 * first difference of the natural log; rows with missing values are dropped.
pce = pce.assign(value=np.log(pce["value"]).diff().mul(100)).dropna()

# Real income: series T20600_A067RX, same treatment as real PCE.
income = (
    income.loc[income["series_id"].eq("T20600_A067RX")]
    .replace("T20600_A067RX", "Real Income")
    .loc[:, ["series_id", "date", "value"]]
)
income = income.assign(value=np.log(income["value"]).diff().mul(100)).dropna()

In [58]:
# Preview the last rows of the transformed real income series.
income.tail()

Unnamed: 0,series_id,date,value
28163,Real Income,2022-06-01,-0.421908
28164,Real Income,2022-07-01,0.515882
28165,Real Income,2022-08-01,0.066467
28166,Real Income,2022-09-01,-0.024092
28167,Real Income,2022-10-01,0.373846


In [59]:
long_columns = ["series_id", "date", "value"]

# ISM: keep the two price series under their existing names.
ism_series = ["ISM Manufacturing Prices", "ISM Nonmanufacturing Prices"]
ism = ism.loc[ism["series_id"].isin(ism_series), long_columns]

# Date realignment for ISM is currently disabled (kept for reference):
#ism["date"] = ism["date"] + pd.Timedelta(days=1)
#ism["date"] = ism["date"] - pd.DateOffset(months=1)

# Philly Fed: prices paid / prices received, current and future.
philly_labels = {
    "ppcdfsa": "Philly Prices Paid (current)",
    "ppfdfsa": "Philly Prices Paid (future)",
    "prcdfsa": "Philly Prices Received (current)",
    "prfdfsa": "Philly Prices Received (future)",
}
philly = (
    philly.loc[philly["series_id"].isin(list(philly_labels))]
    .replace(philly_labels)
    .loc[:, long_columns]
    .dropna()
)

# Empire (NY Fed): same four measures.
empire_labels = {
    "ppcdisa": "Empire Prices Paid (current)",
    "ppfdisa": "Empire Prices Paid (future)",
    "prcdisa": "Empire Prices Received (current)",
    "prfdisa": "Empire Prices Received (future)",
}
empire = (
    empire.loc[empire["series_id"].isin(list(empire_labels))]
    .replace(empire_labels)
    .loc[:, long_columns]
    .dropna()
)

# Richmond Fed: same four measures.
richmond_labels = {
    "sa_mfg_pct_chg_prcs_pd_c": "Richmond Prices Paid (current)",
    "sa_mfg_pct_chg_prcs_pd_e": "Richmond Prices Paid (future)",
    "sa_mfg_pct_chg_prcs_recd_c": "Richmond Prices Received (current)",
    "sa_mfg_pct_chg_prcs_recd_e": "Richmond Prices Received (future)",
}
richmond = (
    richmond.loc[richmond["series_id"].isin(list(richmond_labels))]
    .replace(richmond_labels)
    .loc[:, long_columns]
    .dropna()
)

In [60]:
# Preview the last rows of the relabelled Philly Fed price series.
philly.tail()

Unnamed: 0,series_id,date,value
85145,Philly Prices Received (future),2022-07-01,34.6
85146,Philly Prices Received (future),2022-08-01,45.0
85147,Philly Prices Received (future),2022-09-01,39.4
85148,Philly Prices Received (future),2022-10-01,23.4
85149,Philly Prices Received (future),2022-11-01,15.7


In [61]:
# NFIB: keep three series and give them short display names.
nfib_labels = {
    "NFIB_NFIB Price Change_SA_Diffusion": "NFIB Price Change",
    "NFIB_NFIB Compensation_SA_Diffusion": "NFIB Compensation",
    "NFIB_Inflation_NSA_Percent": "NFIB Inflation Problem",
}
nfib = (
    nfib.loc[nfib["series_id"].isin(list(nfib_labels))]
    .replace(nfib_labels)
    .loc[:, ["series_id", "date", "value"]]
)

# Move each date forward one day, then back one month.
# NOTE(review): presumably the source dates are month-end stamps, which this
# maps to the first day of the same month; confirm against the raw file.
nfib = nfib.assign(
    date=nfib["date"] + pd.Timedelta(days=1) - pd.DateOffset(months=1)
)

nfib = nfib.dropna()
nfib.tail()

Unnamed: 0,series_id,date,value
12399,NFIB Inflation Problem,2022-07-01,37.0
12400,NFIB Inflation Problem,2022-08-01,29.0
12401,NFIB Inflation Problem,2022-09-01,30.0
12402,NFIB Inflation Problem,2022-10-01,33.0
12403,NFIB Inflation Problem,2022-11-01,32.0


In [62]:
# Michigan survey series kept for the analysis:
#   px1_mean_all: do you think that prices in general will go up, or go down (12 months)
#   px5_mean_all: do you think that prices in general will go up, or go down (5y)
#   durrn_np_all: house durable goods inflation
#   vehrn_np_all: automobiles
michigan_labels = {
    "px1_mean_all": "Michigan Inflation Expectation 12m",
    "px5_mean_all": "Michigan Inflation Expectation 5y",
    "durrn_np_all": "Michigan House Durable Goods",
    "vehrn_np_all": "Michigan Automobiles",
}

michigan = (
    michigan.loc[michigan["series_id"].isin(list(michigan_labels))]
    .replace(michigan_labels)
    .loc[:, ["series_id", "date", "value"]]
    .dropna()
)
michigan.head()



Unnamed: 0,series_id,date,value
117822,Michigan Inflation Expectation 12m,1978-01-01,6.1
117823,Michigan Inflation Expectation 12m,1978-02-01,8.5
117824,Michigan Inflation Expectation 12m,1978-03-01,7.5
117825,Michigan Inflation Expectation 12m,1978-04-01,8.0
117826,Michigan Inflation Expectation 12m,1978-05-01,8.9


In [63]:
# Keep the RIFSPFFNB series, average it within each month (month-start labels),
# and put it back into the (series_id, date, value) layout.
ffunds = (
    ffunds.loc[ffunds["series_id"].eq("RIFSPFFNB"), ["date", "value"]]
    .set_index("date")
    .resample("MS")
    .mean()
    .reset_index()
    .assign(series_id="Fed Funds")
    .loc[:, ["series_id", "date", "value"]]
)
ffunds.head()

Unnamed: 0,series_id,date,value
0,Fed Funds,1954-07-01,0.751429
1,Fed Funds,1954-08-01,1.207727
2,Fed Funds,1954-09-01,1.0395
3,Fed Funds,1954-10-01,0.8455
4,Fed Funds,1954-11-01,0.828421


## Merging

In [64]:
# Stack every prepared series into one long frame.
# NOTE(review): conf_board and breakeven are loaded earlier but are not part
# of this list; confirm that leaving them out is intentional.
frames = [
    total_cpi, gas, manheim, cs,
    payroll, urate, wage_atlanta,
    pce, income,
    ism, philly, empire, richmond,
    ffunds, nfib, michigan,
]
data = pd.concat(frames)

data.head()

Unnamed: 0,series_id,date,value
1,CPI,1947-02-01,0.649654
2,CPI,1947-03-01,1.742364
3,CPI,1947-04-01,0.0
4,CPI,1947-05-01,-0.227531
5,CPI,1947-06-01,0.590508


In [65]:
# Long -> wide: one row per date, one column per series_id. pivot_table uses
# its default aggregation (mean) if a (date, series_id) pair occurs more than once.
data = (
    data.pivot_table(index="date", columns="series_id", values="value")
    .reset_index()
    .rename_axis(None, axis=1)  # drop the leftover "series_id" columns label
)
data.loc[data["date"] >= "2002"].head()

Unnamed: 0,date,CPI,Case Shiller,Empire Prices Paid (current),Empire Prices Paid (future),Empire Prices Received (current),Empire Prices Received (future),Fed Funds,Gas,ISM Manufacturing Prices,...,Philly Prices Received (current),Philly Prices Received (future),Real Income,Real PCE,Richmond Prices Paid (current),Richmond Prices Paid (future),Richmond Prices Received (current),Richmond Prices Received (future),Unemployment Rate,Unweighted Overall
755,2002-01-01,0.168967,0.589901,-5.2,3.1,-15.5,-14.4,1.750952,1.937824,43.9,...,-6.6,15.9,2.046882,,0.72,1.12,0.36,1.0,5.7,4.9
756,2002-02-01,0.168682,0.596625,-3.0,10.0,-10.0,-13.0,1.7455,0.585324,41.5,...,-5.2,15.9,0.147371,0.395746,0.51,1.18,0.31,1.03,5.7,4.7
757,2002-03-01,0.280505,0.711957,5.5,19.3,-8.3,-2.8,1.72619,11.481067,51.9,...,-0.4,17.5,0.067403,-0.020113,0.48,1.13,0.11,0.74,5.7,4.6
758,2002-04-01,0.447178,0.774667,12.0,23.0,-10.0,-2.0,1.758636,11.178371,60.3,...,0.3,23.2,0.339551,0.532456,0.29,1.48,0.16,1.09,5.9,4.3
759,2002-05-01,0.111483,0.92705,17.3,14.5,-4.5,-0.9,1.759091,-0.376513,63.0,...,3.5,22.3,0.405762,-0.387268,0.74,1.64,0.28,1.14,5.8,4.4


In [66]:
# Summarise the wide table: column dtypes and non-null counts per series.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 34 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   date                                1007 non-null   datetime64[ns]
 1   CPI                                 909 non-null    float64       
 2   Case Shiller                        428 non-null    float64       
 3   Empire Prices Paid (current)        257 non-null    float64       
 4   Empire Prices Paid (future)         257 non-null    float64       
 5   Empire Prices Received (current)    257 non-null    float64       
 6   Empire Prices Received (future)     257 non-null    float64       
 7   Fed Funds                           822 non-null    float64       
 8   Gas                                 387 non-null    float64       
 9   ISM Manufacturing Prices            899 non-null    float64       
 10  ISM Nonmanufacturing Pri

In [67]:
# Save the merged wide table as Parquet. The relative file name resolves
# against the current working directory.
data.to_parquet("project_data.parquet")