In [None]:
# Packages

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.base import clone
import matplotlib.pyplot as plt
import itertools
import time
import imodels
from imodels import HSTreeRegressor
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read the CSV and discard the row labelled 0, i.e. the first row after the
# header (presumably the FRED-MD transformation-code row -- confirm against the file).
data = 'FRED-MD.csv'
df = pd.read_csv(data).drop(index=0)

In [None]:
# Parse 'sasdate' into datetimes, make it the index, and keep only the
# columns that contain no missing values at all.
df = (
    df.assign(sasdate=pd.to_datetime(df['sasdate']))
    .set_index('sasdate')
    .dropna(axis=1)
)

# Number of lags to build for every variable
num_lags = 4

# Build every lagged series first and attach them with a single concat.
# Calling pd.concat on the whole frame once per column re-copies the growing
# frame on every iteration; one concat produces the same columns in the same
# order (<column>_lag_1 ... <column>_lag_<num_lags>, grouped by source column).
base_columns = list(df.columns)  # snapshot, so only the original variables are lagged
lag_series = [
    df[column].shift(lag).rename(f'{column}_lag_{lag}')
    for column in base_columns
    for lag in range(1, num_lags + 1)
]

# Names of all lag columns, in the order they are appended to df
lag_columns = [series.name for series in lag_series]

df = pd.concat([df, *lag_series], axis=1)

# Turn the 'sasdate' index back into an ordinary column, then skip the first
# num_lags rows (the rows at the start, where the shifted lag values are missing).
df = df.reset_index().iloc[num_lags:]

# Keep only the date, 'CPIAUCSL' and the lag columns
df = df[['sasdate', 'CPIAUCSL'] + lag_columns]

print(df)


In [None]:
# Step dummies: 0 before the start date, 1 from that date onwards.
#   dummy_recession -> 2008-11-01
#   dummy_covid     -> 2020-03-01
#   dummy_ukraine   -> 2022-03-01
# The date column is only needed to build the dummies, so it is dropped afterwards.
df = df.assign(
    dummy_recession=np.where(df['sasdate'] >= '2008-11-01', 1, 0),
    dummy_covid=np.where(df['sasdate'] >= '2020-03-01', 1, 0),
    dummy_ukraine=np.where(df['sasdate'] >= '2022-03-01', 1, 0),
).drop(columns=['sasdate'])

