### Import dependencies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

### Load and clean data

In [27]:
# Load the full dataset (one row per quarter) from the resources folder
df = pd.read_csv(filepath_or_buffer="resources/all_data.csv")

In [28]:
# Order the rows by their quarter label
df = df.sort_values('quarter')

In [29]:
# Peek at the most recent rows before any null handling
df.tail(n=5)

Unnamed: 0,quarter,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,recession_actual,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,fed_funds_st_dev_rate,10YT_minus_2YT_avg,10YT_minus_2YT_percent_change_prev_quarter,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,personal_consumption_expenditures
307,2018Q4,252.759,20897.804,2.9,1185.0,0.592021,0.0,3.566667,2.217097,0.152641,0.066218,0.233333,-0.078947,2.8,1.9,105.15026,3725.234,1.462,322800.0,14211.92
308,2019Q1,253.311333,21098.827,3.9,1213.0,0.848147,0.0,4.133333,2.401311,0.083088,0.004646,0.17,-0.271429,4.5,1.6,104.40334,3783.364,1.458,313000.0,14266.25
309,2019Q2,255.139333,21340.267,4.7,1255.666667,0.828815,,3.5,2.397813,-0.001457,0.024002,0.213333,0.254902,2.4,1.6,103.2006,3749.471,1.457,322500.0,14511.176
310,2019Q3,256.273,,,1282.0,,,3.7,2.197813,-0.083409,0.173154,0.11,-0.484375,,,,,,,
311,2019Q4,,,,,,,,1.845625,-0.160245,0.027561,,,,,,,,,


In [30]:
# Keep only the rows in which every column is populated
df = df.dropna(axis=0, how='any')

In [31]:
# Peek at the most recent rows once incomplete rows are gone
df.tail(n=5)

Unnamed: 0,quarter,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,recession_actual,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,fed_funds_st_dev_rate,10YT_minus_2YT_avg,10YT_minus_2YT_percent_change_prev_quarter,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,personal_consumption_expenditures
304,2018Q1,249.250333,20163.159,5.0,1320.666667,0.202456,0.0,4.333333,1.448966,0.204683,0.083902,0.596667,-0.113861,6.9,1.8,104.59493,3542.412,1.451,331800.0,13728.357
305,2018Q2,250.578667,20510.177,7.1,1259.666667,0.589182,0.0,3.833333,1.727176,0.192007,0.075492,0.446667,-0.251397,2.7,2.0,103.33928,3561.592,1.461,315600.0,13939.828
306,2018Q3,251.828667,20749.752,4.8,1233.0,0.821959,0.0,3.866667,1.923492,0.113663,0.047184,0.253333,-0.432836,3.3,2.0,103.69309,3683.981,1.462,330900.0,14114.559
307,2018Q4,252.759,20897.804,2.9,1185.0,0.592021,0.0,3.566667,2.217097,0.152641,0.066218,0.233333,-0.078947,2.8,1.9,105.15026,3725.234,1.462,322800.0,14211.92
308,2019Q1,253.311333,21098.827,3.9,1213.0,0.848147,0.0,4.133333,2.401311,0.083088,0.004646,0.17,-0.271429,4.5,1.6,104.40334,3783.364,1.458,313000.0,14266.25


In [32]:
# Use the quarter label as the row index
df = df.set_index(keys='quarter')

In [33]:
# Standardise the label column name: a 'target' column, if present, becomes
# 'recession_actual' (rename ignores labels it cannot find by default, so this
# leaves the frame unchanged when the column already has the final name)
df = df.rename({'target': 'recession_actual'}, axis='columns')
df.head(n=5)

Unnamed: 0_level_0,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,recession_actual,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,fed_funds_st_dev_rate,10YT_minus_2YT_avg,10YT_minus_2YT_percent_change_prev_quarter,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,personal_consumption_expenditures
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1976Q3,57.3,1886.558,7.6,1557.0,-2.199151,0.0,7.6,5.283478,0.016956,0.100618,1.096667,0.370833,3.2,6.0,33.64333,328.307,1.717,44400.0,1158.806
1976Q4,58.133333,1934.273,10.5,1691.333333,-2.246705,0.0,7.333333,4.874239,-0.077456,0.211941,1.466667,0.337386,2.6,6.0,33.78753,337.65,1.699,45500.0,1192.408
1977Q1,59.2,1988.648,11.7,1844.333333,-1.877175,0.0,8.233333,4.660667,-0.043817,0.148254,1.326667,-0.095455,0.9,6.2,33.65136,360.313,1.689,46300.0,1228.212
1977Q2,60.233333,2055.909,14.2,1918.666667,-0.776696,0.0,6.933333,5.157473,0.106595,0.332835,1.256667,-0.052764,3.8,6.5,32.80422,389.703,1.701,48900.0,1255.98
1977Q3,61.066667,2118.473,12.7,2009.0,0.186001,0.0,6.8,5.816413,0.127764,0.344309,0.826667,-0.342175,5.7,6.6,32.98791,414.134,1.713,48800.0,1286.905


### Shift data with sliding window technique

In [34]:
# For each forecast horizon, pull the recession flag from that many rows
# ahead onto the current row so it can serve as the row's label
for quarters_ahead in (1, 2, 4):
    df[f'recession_{quarters_ahead}q_out'] = df['recession_actual'].shift(-quarters_ahead)

In [35]:
# One frame per model (1, 2 and 4 quarters out): every feature column plus
# only that horizon's label; the current-quarter flag and the other
# horizons' labels are removed
label_columns = ['recession_actual', 'recession_1q_out', 'recession_2q_out', 'recession_4q_out']
df_q1 = df.drop(columns=[col for col in label_columns if col != 'recession_1q_out'])
df_q2 = df.drop(columns=[col for col in label_columns if col != 'recession_2q_out'])
df_q4 = df.drop(columns=[col for col in label_columns if col != 'recession_4q_out'])

In [36]:
# Remove rows with any missing value from each horizon's frame
# (the forward shift leaves NaN labels in the trailing rows)
df_q1, df_q2, df_q4 = (frame.dropna() for frame in (df_q1, df_q2, df_q4))
df_q4.tail(n=5)

Unnamed: 0_level_0,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,fed_funds_st_dev_rate,10YT_minus_2YT_avg,10YT_minus_2YT_percent_change_prev_quarter,real_disp_pers_inc,personal_consumption_exp_excl_food_energy,tot_public_debt_as_pct_of_gdp,gross_private_domestic_invest,M2_velocity,median_sls_price_houses_sold_US,personal_consumption_expenditures,recession_4q_out
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017Q1,243.83,19190.431,4.2,1230.666667,-0.861917,4.866667,0.698889,0.55949,0.09883,1.203333,0.071217,4.9,1.8,103.41831,3288.229,1.438,313100.0,13104.419,0.0
2017Q2,244.065,19356.649,3.5,1169.333333,-0.752038,4.233333,0.947363,0.355527,0.098588,0.97,-0.193906,2.7,1.6,102.52061,3334.971,1.432,318200.0,13212.501,0.0
2017Q3,245.368333,19611.704,5.4,1175.333333,-0.396155,4.4,1.153696,0.217797,0.024029,0.88,-0.092784,2.3,1.5,103.22866,3401.815,1.435,320500.0,13345.053,0.0
2017Q4,247.273333,19918.91,6.4,1259.666667,0.033653,3.9,1.202778,0.042543,0.09912,0.673333,-0.234848,3.7,1.7,102.88087,3457.732,1.442,337900.0,13586.267,0.0
2018Q1,249.250333,20163.159,5.0,1320.666667,0.202456,4.333333,1.448966,0.204683,0.083902,0.596667,-0.113861,6.9,1.8,104.59493,3542.412,1.451,331800.0,13728.357,0.0


In [37]:
# Label vector for each horizon (y3 holds the 4-quarter-out label)
y1 = df_q1.loc[:, 'recession_1q_out']
y2 = df_q2.loc[:, 'recession_2q_out']
y3 = df_q4.loc[:, 'recession_4q_out']

In [38]:
# Feature matrix for each horizon: everything except that horizon's label
X1 = df_q1.drop('recession_1q_out', axis=1)
X2 = df_q2.drop('recession_2q_out', axis=1)
X3 = df_q4.drop('recession_4q_out', axis=1)

In [39]:
# Column labels of each feature matrix, kept for pairing with importances later
q1_feature_names, q2_feature_names, q4_feature_names = (
    frame.columns for frame in (X1, X2, X3)
)

### Split and scale data

In [40]:
# Split data into training and testing - shuffled, stratified on the label.
# A fixed seed makes the partition (and every score computed from it)
# repeatable across kernel restarts; without it each run draws a different
# 80/20 split.
# NOTE(review): rows are consecutive quarters, so a shuffled split places
# later quarters in the training set — confirm this is acceptable for
# evaluating a forecasting model.
SPLIT_SEED = 42
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, train_size=0.8, stratify=y1, random_state=SPLIT_SEED)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, train_size=0.8, stratify=y2, random_state=SPLIT_SEED)
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, train_size=0.8, stratify=y3, random_state=SPLIT_SEED)

In [18]:
# Split data into training and testing - not shuffled
# X1_train, X1_test, y1_train, y1_test=train_test_split(X1, y1, train_size=0.8, shuffle=False)
# X2_train, X2_test, y2_train, y2_test=train_test_split(X2, y2, train_size=0.8, shuffle=False)
# X3_train, X3_test, y3_train, y3_test=train_test_split(X3, y3, train_size=0.8, shuffle=False)

In [41]:
# Fit one standardiser per horizon, using the training rows only
X1_scaler = StandardScaler()
X1_scaler.fit(X1_train)

X2_scaler = StandardScaler()
X2_scaler.fit(X2_train)

X3_scaler = StandardScaler()
X3_scaler.fit(X3_train)

In [42]:
# Apply each horizon's fitted scaler to its own training and testing features

# 1 quarter out
X1_train_scaled, X1_test_scaled = X1_scaler.transform(X1_train), X1_scaler.transform(X1_test)

# 2 quarters out
X2_train_scaled, X2_test_scaled = X2_scaler.transform(X2_train), X2_scaler.transform(X2_test)

# 4 quarters out
X3_train_scaled, X3_test_scaled = X3_scaler.transform(X3_train), X3_scaler.transform(X3_test)

# Random Forest

### 1 Quarter Out: shuffle=False

In [21]:
# rf1 = RandomForestClassifier(n_estimators=200)
# rf1 = rf1.fit(X1_train_scaled, y1_train)
# rf1.score(X1_test_scaled, y1_test)

0.7352941176470589

In [22]:
# sorted(zip(rf1.feature_importances_, q1_feature_names), reverse=True)

[(0.12597943148075672, 'tot_public_debt_as_pct_of_gdp'),
 (0.09183217052219694, '10YT_minus_2YT_avg'),
 (0.0744126373611486, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.07105260148083196, 'gdp_pct_change'),
 (0.059586644680051676, 'gdp'),
 (0.05932546775113613, 'fed_funds_percent_change_prev_quarter'),
 (0.057102685195979025, 'avg_housing_starts'),
 (0.053505659157947076, 'fed_funds_avg_rate'),
 (0.051885804031213234, 'real_disp_pers_inc'),
 (0.05069798207518023, 'avg_unemployment_rate'),
 (0.049335445200529945, 'personal_consumption_expenditures'),
 (0.0479593091372378, 'fed_funds_st_dev_rate'),
 (0.04737643587859367, 'personal_consumption_exp_excl_food_energy'),
 (0.046814265177329686, 'output_gap'),
 (0.037690839760374364, 'avg_consumer_price_index'),
 (0.03288168807289571, 'gross_private_domestic_invest'),
 (0.02358879603569262, 'M2_velocity'),
 (0.01897213700090468, 'median_sls_price_houses_sold_US')]

### 2 Quarters Out: shuffle=False

In [23]:
# rf2 = RandomForestClassifier(n_estimators=200)
# rf2 = rf1.fit(X2_train_scaled, y2_train)
# rf2.score(X2_test_scaled, y2_test)

0.20588235294117646

In [24]:
# sorted(zip(rf2.feature_importances_, q2_feature_names), reverse=True)

[(0.14461582936366943, '10YT_minus_2YT_avg'),
 (0.0888275716617196, 'tot_public_debt_as_pct_of_gdp'),
 (0.08038958115170429, 'gdp'),
 (0.07007370476059492, 'fed_funds_avg_rate'),
 (0.05912034251839594, 'personal_consumption_exp_excl_food_energy'),
 (0.05707901120764439, 'output_gap'),
 (0.05632261118488248, 'avg_housing_starts'),
 (0.05535172918533577, 'personal_consumption_expenditures'),
 (0.05403487902453398, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.04878984816112306, 'avg_consumer_price_index'),
 (0.044059514549880475, 'M2_velocity'),
 (0.04256914345343211, 'avg_unemployment_rate'),
 (0.041474645324244254, 'fed_funds_percent_change_prev_quarter'),
 (0.039688954461473705, 'gross_private_domestic_invest'),
 (0.035255264327344465, 'gdp_pct_change'),
 (0.030365150346513377, 'median_sls_price_houses_sold_US'),
 (0.02927599065598356, 'fed_funds_st_dev_rate'),
 (0.022706228661524247, 'real_disp_pers_inc')]

### 4 Quarters Out: shuffle=False

In [25]:
# rf3 = RandomForestClassifier(n_estimators=200)
# rf3 = rf1.fit(X3_train_scaled, y3_train)
# rf3.score(X3_test_scaled, y3_test)

0.9117647058823529

In [26]:
# sorted(zip(rf2.feature_importances_, q4_feature_names), reverse=True)

[(0.1776493060251878, '10YT_minus_2YT_avg'),
 (0.09171068755415838, 'output_gap'),
 (0.07510867606939677, 'tot_public_debt_as_pct_of_gdp'),
 (0.0640746788089675, 'fed_funds_avg_rate'),
 (0.061567859069269926, 'personal_consumption_exp_excl_food_energy'),
 (0.0531486534658526, 'gdp'),
 (0.05282674744621088, 'gross_private_domestic_invest'),
 (0.051797174065521964, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.050583746777797374, 'personal_consumption_expenditures'),
 (0.04786239172228093, 'avg_consumer_price_index'),
 (0.044741877702302625, 'median_sls_price_houses_sold_US'),
 (0.044070131233050557, 'M2_velocity'),
 (0.03761118866827309, 'avg_unemployment_rate'),
 (0.035702681570729054, 'gdp_pct_change'),
 (0.03429351799840723, 'fed_funds_st_dev_rate'),
 (0.03428298559330785, 'avg_housing_starts'),
 (0.02257145007872268, 'fed_funds_percent_change_prev_quarter'),
 (0.020396246150562805, 'real_disp_pers_inc')]

### 1 Quarter Out: shuffle=True

In [43]:
# 1-quarter-out model: a 200-tree random forest trained on the scaled
# training split; the cell displays its mean accuracy on the held-out split.
# random_state pins the forest's internal randomness so the score and the
# feature importances are repeatable for a given train/test partition.
rf1 = RandomForestClassifier(n_estimators=200, random_state=42)
rf1 = rf1.fit(X1_train_scaled, y1_train)
rf1.score(X1_test_scaled, y1_test)

0.8529411764705882

In [44]:
# Pair each importance with its feature name and list them largest first
q1_importances = list(zip(rf1.feature_importances_, q1_feature_names))
q1_importances.sort(reverse=True)
q1_importances

[(0.10882952681970592, 'tot_public_debt_as_pct_of_gdp'),
 (0.09749402846531248, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.0875787768193589, 'gdp_pct_change'),
 (0.07513406359931382, 'fed_funds_percent_change_prev_quarter'),
 (0.0661906428841811, '10YT_minus_2YT_avg'),
 (0.06515923081995267, 'personal_consumption_exp_excl_food_energy'),
 (0.06206482419670039, 'fed_funds_avg_rate'),
 (0.060390723137980204, 'real_disp_pers_inc'),
 (0.05257551594392734, 'fed_funds_st_dev_rate'),
 (0.04609070676864795, 'M2_velocity'),
 (0.04465795467861411, 'output_gap'),
 (0.04422960192795969, 'avg_unemployment_rate'),
 (0.040486259946256584, 'avg_consumer_price_index'),
 (0.033966110652768576, 'gdp'),
 (0.03337401941656197, 'personal_consumption_expenditures'),
 (0.032651255772890506, 'avg_housing_starts'),
 (0.02901117305538346, 'gross_private_domestic_invest'),
 (0.0201155850944844, 'median_sls_price_houses_sold_US')]

### 2 Quarters Out: shuffle=True

In [45]:
# 2-quarters-out model. Fit the newly created rf2 itself: calling rf1.fit here
# would retrain the 1-quarter model in place (fit returns the same estimator)
# and leave rf2 as just another name for rf1.
# random_state pins the forest's internal randomness so results are repeatable
# for a given train/test partition.
rf2 = RandomForestClassifier(n_estimators=200, random_state=42)
rf2 = rf2.fit(X2_train_scaled, y2_train)
rf2.score(X2_test_scaled, y2_test)

0.9411764705882353

In [46]:
# Pair each importance with its feature name and list them largest first
q2_importances = list(zip(rf2.feature_importances_, q2_feature_names))
q2_importances.sort(reverse=True)
q2_importances

[(0.1465182174401306, '10YT_minus_2YT_avg'),
 (0.10705491018073625, 'tot_public_debt_as_pct_of_gdp'),
 (0.07978246529163202, 'fed_funds_avg_rate'),
 (0.07747094334811389, 'personal_consumption_exp_excl_food_energy'),
 (0.06351721143401586, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.06155881525266735, 'fed_funds_percent_change_prev_quarter'),
 (0.04439492976850965, 'gdp'),
 (0.04438481396570998, 'fed_funds_st_dev_rate'),
 (0.04422525286932258, 'gdp_pct_change'),
 (0.04363717362546339, 'personal_consumption_expenditures'),
 (0.04156516231647598, 'M2_velocity'),
 (0.04073230309205233, 'avg_consumer_price_index'),
 (0.03943155448514243, 'output_gap'),
 (0.037165371194189126, 'avg_unemployment_rate'),
 (0.03410199366504214, 'avg_housing_starts'),
 (0.03328894971393563, 'real_disp_pers_inc'),
 (0.033051775913216064, 'gross_private_domestic_invest'),
 (0.02811815644364471, 'median_sls_price_houses_sold_US')]

### 4 Quarters Out: shuffle=True

In [47]:
# 4-quarters-out model. Fit the newly created rf3 itself: calling rf1.fit here
# would retrain the 1-quarter model in place (fit returns the same estimator)
# and leave rf3 as just another name for rf1.
# random_state pins the forest's internal randomness so results are repeatable
# for a given train/test partition.
rf3 = RandomForestClassifier(n_estimators=200, random_state=42)
rf3 = rf3.fit(X3_train_scaled, y3_train)
rf3.score(X3_test_scaled, y3_test)

0.9411764705882353

In [48]:
# Importances for the 4-quarters-out model, largest first. Read them from rf3
# (the estimator trained on the 4-quarter data) so they match the
# q4 feature names they are paired with.
sorted(zip(rf3.feature_importances_, q4_feature_names), reverse=True)

[(0.2024501397037111, '10YT_minus_2YT_avg'),
 (0.0951221217721841, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.07375985391094428, 'tot_public_debt_as_pct_of_gdp'),
 (0.07081684818051018, 'output_gap'),
 (0.06647267131913522, 'fed_funds_avg_rate'),
 (0.06320023135916468, 'personal_consumption_exp_excl_food_energy'),
 (0.04906477691602158, 'avg_unemployment_rate'),
 (0.045746673321702305, 'M2_velocity'),
 (0.04007257923136654, 'median_sls_price_houses_sold_US'),
 (0.03985956426335622, 'avg_consumer_price_index'),
 (0.03887431651343074, 'fed_funds_percent_change_prev_quarter'),
 (0.03731922865434809, 'gdp'),
 (0.035770583475598985, 'fed_funds_st_dev_rate'),
 (0.030033945486778502, 'gdp_pct_change'),
 (0.028792117192329342, 'real_disp_pers_inc'),
 (0.028094060901608407, 'gross_private_domestic_invest'),
 (0.027729714206652586, 'personal_consumption_expenditures'),
 (0.026820573591157144, 'avg_housing_starts')]