### Import dependencies

In [1]:
import pandas as pd
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

### Merge datasets

In [2]:
# Load the prepared CSV files from resources/ -- one DataFrame per file,
# unpacked in the same order as the paths listed below.
(
    cpi,
    gdp,
    gdp_pct,
    houst,
    opg,
    rec_dt,
    unrate,
    fed_funds,
    yield10_2,
    fred,
) = map(
    pd.read_csv,
    [
        "resources/cpi_final.csv",
        "resources/gdp_final.csv",
        "resources/gdp_pct_chg_final.csv",
        "resources/housing_starts_final.csv",
        "resources/output_gap_final.csv",
        "resources/recession_dates_final.csv",
        "resources/unemployment_rate_final.csv",
        "resources/fed_funds_final.csv",
        "resources/10YT_minus_2YT_final.csv",
        "resources/FRED_data.csv",
    ],
)

In [3]:
# Combine all data sets into one data frame
dfs = [cpi, gdp, gdp_pct, houst, opg, rec_dt, unrate, fed_funds, yield10_2, fred]
df = reduce(lambda left,right: pd.merge(left,right,on=['quarter'],how='outer'), dfs)
df.head()

Unnamed: 0,quarter,avg_consumer_price_index,date_x,gdp,date_y,gdp_pct_change,avg_housing_starts,date_x.1,output_gap,date_y.1,...,10YT_minus_2YT_percent_change_prev_quarter,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,cpi_US_total,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,nat_rate_of_unemp_long_term,personal_consumption_expenditures
0,1947Q1,21.7,1947-01-01,243.164,,,,,,1947-01-01,...,,,,,,,,,,
1,1947Q2,22.01,1947-04-01,245.968,1947-04-01,4.7,,,,1947-04-01,...,,,,,,,,,,
2,1947Q3,22.49,1947-07-01,249.585,1947-07-01,6.0,,,,1947-07-01,...,,,,,,,,,,
3,1947Q4,23.126667,1947-10-01,259.745,1947-10-01,17.3,,,,1947-10-01,...,,,,,,,,,,
4,1948Q1,23.616667,1948-01-01,265.742,1948-01-01,9.6,,,,1948-01-01,...,,,,,,,,,,


In [4]:
# Drop date columns
df = df.drop(columns=['date_x','date_y'])

In [5]:
# Order the rows by their quarter label (ascending)
df = df.sort_values('quarter')

In [6]:
# Peek at the last five rows while missing values are still present
df.tail(5)

Unnamed: 0,quarter,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,target,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,...,10YT_minus_2YT_percent_change_prev_quarter,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,cpi_US_total,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,nat_rate_of_unemp_long_term,personal_consumption_expenditures
287,2018Q4,252.759,20897.804,2.9,1185.0,0.592021,0.0,3.566667,2.217097,0.152641,...,-0.078947,2.8,1.9,2.203131,105.15026,3725.234,1.462,322800.0,4.582,14211.92
288,2019Q1,253.311333,21098.827,3.9,1213.0,0.848147,0.0,4.133333,2.401311,0.083088,...,-0.271429,4.5,1.6,1.644936,104.40334,3783.364,1.458,313000.0,4.577,14266.25
289,2019Q2,255.139333,21340.267,4.7,1255.666667,0.828815,,3.5,2.397813,-0.001457,...,0.254902,2.4,1.6,1.811376,103.2006,3749.471,1.457,322500.0,4.572,14511.176
290,2019Q3,256.273,,,1282.0,,,3.7,2.197813,-0.083409,...,-0.484375,,,,,,,,,
311,2019Q4,,,,,,,,1.845625,-0.160245,...,,,,,,,,,,


In [7]:
# Keep only the rows in which every column has a value
df = df.dropna(axis='index', how='any')

In [8]:
# Peek at the last five rows again now that incomplete rows are gone
df.tail(5)

Unnamed: 0,quarter,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,target,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,...,10YT_minus_2YT_percent_change_prev_quarter,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,cpi_US_total,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,nat_rate_of_unemp_long_term,personal_consumption_expenditures
284,2018Q1,249.250333,20163.159,5.0,1320.666667,0.202456,0.0,4.333333,1.448966,0.204683,...,-0.113861,6.9,1.8,2.214194,104.59493,3542.412,1.451,331800.0,4.597,13728.357
285,2018Q2,250.578667,20510.177,7.1,1259.666667,0.589182,0.0,3.833333,1.727176,0.192007,...,-0.251397,2.7,2.0,2.711887,103.33928,3561.592,1.461,315600.0,4.592,13939.828
286,2018Q3,251.828667,20749.752,4.8,1233.0,0.821959,0.0,3.866667,1.923492,0.113663,...,-0.432836,3.3,2.0,2.64094,103.69309,3683.981,1.462,330900.0,4.587,14114.559
287,2018Q4,252.759,20897.804,2.9,1185.0,0.592021,0.0,3.566667,2.217097,0.152641,...,-0.078947,2.8,1.9,2.203131,105.15026,3725.234,1.462,322800.0,4.582,14211.92
288,2019Q1,253.311333,21098.827,3.9,1213.0,0.848147,0.0,4.133333,2.401311,0.083088,...,-0.271429,4.5,1.6,1.644936,104.40334,3783.364,1.458,313000.0,4.577,14266.25


In [9]:
# Use the quarter label as the row index
df = df.set_index(keys='quarter')

In [10]:
# Give the label column a descriptive name: target -> recession_actual
df = df.rename({'target': 'recession_actual'}, axis='columns')
df.head(5)

Unnamed: 0_level_0,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,recession_actual,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,fed_funds_st_dev_rate,...,10YT_minus_2YT_percent_change_prev_quarter,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,cpi_US_total,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,nat_rate_of_unemp_long_term,personal_consumption_expenditures
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1976Q3,57.3,1886.558,7.6,1557.0,-2.199151,0.0,7.6,5.283478,0.016956,0.100618,...,0.370833,3.2,6.0,5.518087,33.64333,328.307,1.717,44400.0,6.217,1158.806
1976Q4,58.133333,1934.273,10.5,1691.333333,-2.246705,0.0,7.333333,4.874239,-0.077456,0.211941,...,0.337386,2.6,6.0,5.069403,33.78753,337.65,1.699,45500.0,6.223,1192.408
1977Q1,59.2,1988.648,11.7,1844.333333,-1.877175,0.0,8.233333,4.660667,-0.043817,0.148254,...,-0.095455,0.9,6.2,5.857741,33.65136,360.313,1.689,46300.0,6.227,1228.212
1977Q2,60.233333,2055.909,14.2,1918.666667,-0.776696,0.0,6.933333,5.157473,0.106595,0.332835,...,-0.052764,3.8,6.5,6.847698,32.80422,389.703,1.701,48900.0,6.232,1255.98
1977Q3,61.066667,2118.473,12.7,2009.0,0.186001,0.0,6.8,5.816413,0.127764,0.344309,...,-0.342175,5.7,6.6,6.682162,32.98791,414.134,1.713,48800.0,6.235,1286.905


### Shift data with sliding window technique

In [11]:
# Build the look-ahead targets: shift(-k) places on each row the recession flag
# found k rows further down, so the final k rows end up NaN.
# NOTE(review): this treats consecutive rows as consecutive quarters -- confirm
# the earlier dropna() left no gaps in the quarter sequence.
for target_col, quarters_ahead in (
    ('recession_1q_out', 1),
    ('recession_2q_out', 2),
    ('recession_4q_out', 4),
):
    df[target_col] = df['recession_actual'].shift(-quarters_ahead)

In [12]:
# One frame per forecast horizon (1, 2 and 4 quarters out): each keeps its own
# shifted target and discards the current-quarter flag and the other two targets.
df_q1 = df.drop(['recession_actual', 'recession_2q_out', 'recession_4q_out'], axis='columns')
df_q2 = df.drop(['recession_actual', 'recession_1q_out', 'recession_4q_out'], axis='columns')
df_q4 = df.drop(['recession_actual', 'recession_1q_out', 'recession_2q_out'], axis='columns')

In [13]:
# Remove rows with missing values -- this drops the trailing rows whose shifted
# target is NaN because there is no later row to take it from.
df_q1, df_q2, df_q4 = (frame.dropna() for frame in (df_q1, df_q2, df_q4))
df_q4.tail(5)

Unnamed: 0_level_0,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,fed_funds_st_dev_rate,10YT_minus_2YT_avg,...,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,cpi_US_total,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,nat_rate_of_unemp_long_term,personal_consumption_expenditures,recession_4q_out
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017Q1,243.83,19190.431,4.2,1230.666667,-0.861917,4.866667,0.698889,0.55949,0.09883,1.203333,...,4.9,1.8,2.539321,103.41831,3288.229,1.438,313100.0,4.616,13104.419,0.0
2017Q2,244.065,19356.649,3.5,1169.333333,-0.752038,4.233333,0.947363,0.355527,0.098588,0.97,...,2.7,1.6,1.901991,102.52061,3334.971,1.432,318200.0,4.611,13212.501,0.0
2017Q3,245.368333,19611.704,5.4,1175.333333,-0.396155,4.4,1.153696,0.217797,0.024029,0.88,...,2.3,1.5,1.966925,103.22866,3401.815,1.435,320500.0,4.607,13345.053,0.0
2017Q4,247.273333,19918.91,6.4,1259.666667,0.033653,3.9,1.202778,0.042543,0.09912,0.673333,...,3.7,1.7,2.117557,102.88087,3457.732,1.442,337900.0,4.602,13586.267,0.0
2018Q1,249.250333,20163.159,5.0,1320.666667,0.202456,4.333333,1.448966,0.204683,0.083902,0.596667,...,6.9,1.8,2.214194,104.59493,3542.412,1.451,331800.0,4.597,13728.357,0.0


In [14]:
# Target series for each horizon (note: y3 belongs to the 4-quarter frame)
y1, y2, y3 = (
    df_q1['recession_1q_out'],
    df_q2['recession_2q_out'],
    df_q4['recession_4q_out'],
)

In [15]:
# Remove each frame's target column so that only feature columns remain
df_q1 = df_q1.drop('recession_1q_out', axis='columns')
df_q2 = df_q2.drop('recession_2q_out', axis='columns')
df_q4 = df_q4.drop('recession_4q_out', axis='columns')

In [16]:
# Feature matrices: the same frame objects under the X_* names used below
X_q1, X_q2, X_q4 = df_q1, df_q2, df_q4

In [17]:
# Column labels of each feature matrix, kept for labelling feature importances
q1_feature_names, q2_feature_names, q4_feature_names = (
    X_q1.columns,
    X_q2.columns,
    X_q4.columns,
)

### Split and scale data

In [18]:
# Split data into training and testing - shuffled
# X1_train, X1_test, y1_train, y1_test=train_test_split(X_q1,y1, train_size=0.8, random_state=42, stratify=y1)
# X2_train, X2_test, y2_train, y2_test=train_test_split(X_q2,y2, train_size=0.8, random_state=42, stratify=y2)
# X3_train, X3_test, y3_train, y3_test=train_test_split(X_q4,y3, train_size=0.8, random_state=42, stratify=y3)

In [18]:
# Train/test split without shuffling: row order is preserved, so the first 80%
# of rows form the training set and the remaining rows form the test set.
# random_state has no effect while shuffle=False.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X_q1, y1, train_size=0.8, shuffle=False, random_state=42
)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X_q2, y2, train_size=0.8, shuffle=False, random_state=42
)
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X_q4, y3, train_size=0.8, shuffle=False, random_state=42
)

In [19]:
# One StandardScaler per horizon, fitted on that horizon's training rows only
X1_scaler, X2_scaler, X3_scaler = (
    StandardScaler().fit(train_part)
    for train_part in (X1_train, X2_train, X3_train)
)

In [20]:
# Apply each horizon's fitted scaler to its training rows and its testing rows
X1_train_scaled, X1_test_scaled = map(X1_scaler.transform, (X1_train, X1_test))
X2_train_scaled, X2_test_scaled = map(X2_scaler.transform, (X2_train, X2_test))
X3_train_scaled, X3_test_scaled = map(X3_scaler.transform, (X3_train, X3_test))

# Random Forest

### 1 Quarter Out: shuffle=False

In [21]:
# Random forest for the 1-quarter-ahead target.
# random_state pins the forest's internal randomness so the score and the
# feature importances come out the same on every run; 42 matches the seed
# already used in the split cells.
rf1 = RandomForestClassifier(n_estimators=200, random_state=42)
rf1.fit(X1_train_scaled, y1_train)
# Mean accuracy on the held-out test rows
rf1.score(X1_test_scaled, y1_test)

0.35294117647058826

In [22]:
# Pair every importance with its feature name and list them largest first
q1_importance_ranking = list(zip(rf1.feature_importances_, q1_feature_names))
q1_importance_ranking.sort(reverse=True)
q1_importance_ranking

[(0.0992971621997194, 'nat_rate_of_unemp_long_term'),
 (0.09100217146479456, 'tot_public_debt_as_pct_of_gdp'),
 (0.08681414802933839, 'cpi_US_total'),
 (0.06859769947916605, '10YT_minus_2YT_avg'),
 (0.060287597104134526, 'gdp_pct_change'),
 (0.05930502494031212, 'avg_housing_starts'),
 (0.058097095648751734, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.055148484737067884, 'personal_consumption_exp_excl_food_energy'),
 (0.05187956821961216, 'fed_funds_avg_rate'),
 (0.04389446506770597, 'fed_funds_percent_change_prev_quarter'),
 (0.04328262200147395, 'avg_unemployment_rate'),
 (0.036396581060552526, 'M2_velocity'),
 (0.035934975739459245, 'real_disp_pers_inc'),
 (0.0353147541380359, 'avg_consumer_price_index'),
 (0.034775205993220475, 'gdp'),
 (0.034457625033777466, 'gross_private_domestic_invest'),
 (0.03312252569292575, 'output_gap'),
 (0.029562871820325617, 'personal_consumption_expenditures'),
 (0.02438982687828516, 'fed_funds_st_dev_rate'),
 (0.018439594751341132, 'median_sls_

### 2 Quarters Out: shuffle=False

In [23]:
rf2 = RandomForestClassifier(n_estimators=200)
rf2 = rf1.fit(X2_train_scaled, y2_train)
rf2.score(X2_test_scaled, y2_test)

0.35294117647058826

In [24]:
sorted(zip(rf2.feature_importances_, q2_feature_names), reverse=True)

[(0.12730424367017254, '10YT_minus_2YT_avg'),
 (0.08753188874963957, 'tot_public_debt_as_pct_of_gdp'),
 (0.08106683741637395, 'cpi_US_total'),
 (0.08036249378146688, 'nat_rate_of_unemp_long_term'),
 (0.06052133626389642, 'personal_consumption_exp_excl_food_energy'),
 (0.05787355933081914, 'fed_funds_avg_rate'),
 (0.05565977546288292, 'gdp'),
 (0.054270120675843365, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.05053683505989533, 'personal_consumption_expenditures'),
 (0.0446869786777809, 'gross_private_domestic_invest'),
 (0.043863004870255254, 'avg_consumer_price_index'),
 (0.040389624566969894, 'output_gap'),
 (0.03898503495779169, 'avg_housing_starts'),
 (0.036485798895462246, 'avg_unemployment_rate'),
 (0.029616137567306754, 'M2_velocity'),
 (0.027628330049837356, 'gdp_pct_change'),
 (0.02314750585311497, 'fed_funds_st_dev_rate'),
 (0.02249261135213105, 'fed_funds_percent_change_prev_quarter'),
 (0.019622124028431123, 'median_sls_price_houses_sold_US'),
 (0.017955758769928686,

### 4 Quarters Out: shuffle=False

In [25]:
rf3 = RandomForestClassifier(n_estimators=200)
rf3 = rf1.fit(X3_train_scaled, y3_train)
rf3.score(X3_test_scaled, y3_test)

0.5294117647058824

In [26]:
sorted(zip(rf2.feature_importances_, q4_feature_names), reverse=True)

[(0.16719360178342285, '10YT_minus_2YT_avg'),
 (0.07430311912386525, 'cpi_US_total'),
 (0.07293504248621001, 'output_gap'),
 (0.07004887695869, 'tot_public_debt_as_pct_of_gdp'),
 (0.06290826830719846, 'gdp'),
 (0.05794248432617065, 'fed_funds_avg_rate'),
 (0.05046003246655381, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.049418554335477115, 'gross_private_domestic_invest'),
 (0.04449560199326609, 'personal_consumption_exp_excl_food_energy'),
 (0.043155843083762954, 'nat_rate_of_unemp_long_term'),
 (0.04121719956535704, 'median_sls_price_houses_sold_US'),
 (0.03846696355290407, 'personal_consumption_expenditures'),
 (0.03701481660938923, 'avg_housing_starts'),
 (0.03568668193556566, 'avg_unemployment_rate'),
 (0.034904990047745374, 'avg_consumer_price_index'),
 (0.02756137981176569, 'fed_funds_st_dev_rate'),
 (0.02676735930975345, 'fed_funds_percent_change_prev_quarter'),
 (0.02500336570764399, 'M2_velocity'),
 (0.02193956975565135, 'gdp_pct_change'),
 (0.01857624883960709, 'real_

### 1 Quarter Out: shuffle=True

In [21]:
# rf1 = RandomForestClassifier(n_estimators=200)
# rf1 = rf1.fit(X1_train_scaled, y1_train)
# rf1.score(X1_test_scaled, y1_test)

0.8823529411764706

In [22]:
# sorted(zip(rf1.feature_importances_, q1_feature_names), reverse=True)

[(0.10037054872627489, '10YT_minus_2YT_avg'),
 (0.09416974351517814, 'gdp_pct_change'),
 (0.0832138247136725, 'tot_public_debt_as_pct_of_gdp'),
 (0.07980525716776202, 'fed_funds_percent_change_prev_quarter'),
 (0.06724926641225357, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.06031248713427215, 'cpi_US_total'),
 (0.0546558980987533, 'personal_consumption_exp_excl_food_energy'),
 (0.052426154086665316, 'fed_funds_avg_rate'),
 (0.04900153943638862, 'output_gap'),
 (0.04862096468528869, 'real_disp_pers_inc'),
 (0.04496380905888969, 'fed_funds_st_dev_rate'),
 (0.032719492486872885, 'avg_housing_starts'),
 (0.031205722956058612, 'avg_unemployment_rate'),
 (0.031027601845032504, 'median_sls_price_houses_sold_US'),
 (0.03042117076415153, 'avg_consumer_price_index'),
 (0.030290493752856102, 'M2_velocity'),
 (0.02949066610492819, 'nat_rate_of_unemp_long_term'),
 (0.028491242529549203, 'gross_private_domestic_invest'),
 (0.028090797542260623, 'gdp'),
 (0.02347331898289147, 'personal_consum

### 2 Quarters Out: shuffle=True

In [23]:
# rf2 = RandomForestClassifier(n_estimators=200)
# rf2 = rf2.fit(X2_train_scaled, y2_train)
# rf2.score(X2_test_scaled, y2_test)

0.9117647058823529

In [24]:
# sorted(zip(rf2.feature_importances_, q2_feature_names), reverse=True)

[(0.12549387353976635, '10YT_minus_2YT_avg'),
 (0.09627903668585919, 'tot_public_debt_as_pct_of_gdp'),
 (0.08980854627322438, 'cpi_US_total'),
 (0.07031256388492452, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.05878404060261827, 'fed_funds_percent_change_prev_quarter'),
 (0.05703964324549874, 'fed_funds_avg_rate'),
 (0.05061246861321934, 'gdp_pct_change'),
 (0.04905357642871393, 'output_gap'),
 (0.03844415308500449, 'M2_velocity'),
 (0.037836732672960866, 'avg_unemployment_rate'),
 (0.03766805180865528, 'personal_consumption_exp_excl_food_energy'),
 (0.03758223685356169, 'fed_funds_st_dev_rate'),
 (0.03670472877273343, 'real_disp_pers_inc'),
 (0.03573598534661903, 'personal_consumption_expenditures'),
 (0.035682354931208395, 'avg_consumer_price_index'),
 (0.03382733407677403, 'gross_private_domestic_invest'),
 (0.03226884796142386, 'nat_rate_of_unemp_long_term'),
 (0.030182543375701792, 'gdp'),
 (0.026818507735544343, 'avg_housing_starts'),
 (0.019864774105988147, 'median_sls_pr

### 4 Quarters Out: shuffle=True

In [25]:
# rf3 = RandomForestClassifier(n_estimators=200)
# rf3 = rf3.fit(X3_train_scaled, y3_train)
# rf3.score(X3_test_scaled, y3_test)

0.9411764705882353

In [26]:
# sorted(zip(rf3.feature_importances_, q4_feature_names), reverse=True)

[(0.15342873661032738, '10YT_minus_2YT_avg'),
 (0.09194704923938601, 'output_gap'),
 (0.08176523980445305, 'personal_consumption_exp_excl_food_energy'),
 (0.07452089737202273, 'cpi_US_total'),
 (0.05691902660119095, 'fed_funds_avg_rate'),
 (0.05420603375982227, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.0492838775044781, 'tot_public_debt_as_pct_of_gdp'),
 (0.045642363379660905, 'fed_funds_st_dev_rate'),
 (0.04268125796416908, 'avg_unemployment_rate'),
 (0.03936921746187423, 'nat_rate_of_unemp_long_term'),
 (0.03880290744776047, 'personal_consumption_expenditures'),
 (0.033249166588189716, 'gdp'),
 (0.03306479214174758, 'avg_consumer_price_index'),
 (0.031441715442761455, 'gross_private_domestic_invest'),
 (0.031032347085047665, 'real_disp_pers_inc'),
 (0.030838667496044164, 'gdp_pct_change'),
 (0.030235352079583517, 'avg_housing_starts'),
 (0.029262035104086793, 'M2_velocity'),
 (0.028068167125953968, 'fed_funds_percent_change_prev_quarter'),
 (0.02424114979144, 'median_sls_pri