### Import dependencies

In [1]:
import pandas as pd
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

### Merge datasets

In [2]:
# Load every prepared indicator CSV from resources/ into its own data frame.
# The names on the left pair up one-to-one, in order, with the paths below.
(cpi, gdp, gdp_pct, houst, opg,
 rec_dt, unrate, fed_funds, yield10_2, fred) = map(pd.read_csv, [
    "resources/cpi_final.csv",
    "resources/gdp_final.csv",
    "resources/gdp_pct_chg_final.csv",
    "resources/housing_starts_final.csv",
    "resources/output_gap_final.csv",
    "resources/recession_dates_final.csv",
    "resources/unemployment_rate_final.csv",
    "resources/fed_funds_final.csv",
    "resources/10YT_minus_2YT_final.csv",
    "resources/FRED_data.csv",
])

In [3]:
# Fold all data sets into a single frame, joining pairwise from left to right.
dfs = [cpi, gdp, gdp_pct, houst, opg, rec_dt, unrate, fed_funds, yield10_2, fred]


def _outer_join_on_quarter(left, right):
    """Outer-merge two frames on 'quarter', keeping quarters found in either one."""
    return pd.merge(left, right, on=['quarter'], how='outer')


df = reduce(_outer_join_on_quarter, dfs)
df.head()

Unnamed: 0,quarter,avg_consumer_price_index,date_x,gdp,date_y,gdp_pct_change,avg_housing_starts,date_x.1,output_gap,date_y.1,...,nat_rate_of_unemp_long_term_PCH,personal_consumption_expenditures,personal_consumption_expenditures_CCA,personal_consumption_expenditures_CCH,personal_consumption_expenditures_CH1,personal_consumption_expenditures_CHG,personal_consumption_expenditures_LOG,personal_consumption_expenditures_PC1,personal_consumption_expenditures_PCA,personal_consumption_expenditures_PCH
0,1947Q1,21.7,1947-01-01,243.164,,,,,,1947-01-01,...,,,,,,,,,,
1,1947Q2,22.01,1947-04-01,245.968,1947-04-01,4.7,,,,1947-04-01,...,,,,,,,,,,
2,1947Q3,22.49,1947-07-01,249.585,1947-07-01,6.0,,,,1947-07-01,...,,,,,,,,,,
3,1947Q4,23.126667,1947-10-01,259.745,1947-10-01,17.3,,,,1947-10-01,...,,,,,,,,,,
4,1948Q1,23.616667,1948-01-01,265.742,1948-01-01,9.6,,,,1948-01-01,...,,,,,,,,,,


In [4]:
# Remove the suffixed date columns left over from the merges;
# 'quarter' is the only time key used from here on.
df = df.drop(['date_x', 'date_y'], axis=1)

In [5]:
# Order rows by their 'quarter' label (ascending)
df = df.sort_values('quarter', ascending=True)

In [6]:
# Preview the last five rows while missing values are still present
df.tail(n=5)

Unnamed: 0,quarter,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,target,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,...,nat_rate_of_unemp_long_term_PCH,personal_consumption_expenditures,personal_consumption_expenditures_CCA,personal_consumption_expenditures_CCH,personal_consumption_expenditures_CH1,personal_consumption_expenditures_CHG,personal_consumption_expenditures_LOG,personal_consumption_expenditures_PC1,personal_consumption_expenditures_PCA,personal_consumption_expenditures_PCH
287,2018Q4,252.759,20897.804,2.9,1185.0,0.592021,0.0,3.566667,2.217097,0.152641,...,-0.1,14211.92,2.7,0.7,625.653,97.361,9.6,4.6,2.8,0.7
288,2019Q1,253.311333,21098.827,3.9,1213.0,0.848147,0.0,4.133333,2.401311,0.083088,...,-0.1,14266.25,1.5,0.4,537.893,54.33,9.6,3.9,1.5,0.4
289,2019Q2,255.139333,21340.267,4.7,1255.666667,0.828815,,3.5,2.397813,-0.001457,...,-0.1,14511.176,6.8,1.7,571.348,244.926,9.6,4.1,7.0,1.7
290,2019Q3,256.273,,,1282.0,,,3.7,2.197813,-0.083409,...,,,,,,,,,,
311,2019Q4,,,,,,,,1.845625,-0.160245,...,,,,,,,,,,


In [7]:
# Keep only rows in which every column has a value
df = df.dropna(axis='index', how='any')

In [8]:
# Preview the last five rows now that incomplete rows are gone
df.tail(n=5)

Unnamed: 0,quarter,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,target,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,...,nat_rate_of_unemp_long_term_PCH,personal_consumption_expenditures,personal_consumption_expenditures_CCA,personal_consumption_expenditures_CCH,personal_consumption_expenditures_CH1,personal_consumption_expenditures_CHG,personal_consumption_expenditures_LOG,personal_consumption_expenditures_PC1,personal_consumption_expenditures_PCA,personal_consumption_expenditures_PCH
284,2018Q1,249.250333,20163.159,5.0,1320.666667,0.202456,0.0,4.333333,1.448966,0.204683,...,-0.1,13728.357,4.2,1.0,623.938,142.09,9.5,4.8,4.2,1.0
285,2018Q2,250.578667,20510.177,7.1,1259.666667,0.589182,0.0,3.833333,1.727176,0.192007,...,-0.1,13939.828,6.1,1.5,727.327,211.471,9.5,5.5,6.3,1.5
286,2018Q3,251.828667,20749.752,4.8,1233.0,0.821959,0.0,3.866667,1.923492,0.113663,...,-0.1,14114.559,5.0,1.2,769.506,174.731,9.6,5.8,5.1,1.3
287,2018Q4,252.759,20897.804,2.9,1185.0,0.592021,0.0,3.566667,2.217097,0.152641,...,-0.1,14211.92,2.7,0.7,625.653,97.361,9.6,4.6,2.8,0.7
288,2019Q1,253.311333,21098.827,3.9,1213.0,0.848147,0.0,4.133333,2.401311,0.083088,...,-0.1,14266.25,1.5,0.4,537.893,54.33,9.6,3.9,1.5,0.4


In [9]:
# Use the 'quarter' label as the row index
df = df.set_index(keys='quarter')

In [10]:
# Give the label column a descriptive name: 'target' -> 'recession_actual'
df = df.rename({'target': 'recession_actual'}, axis='columns')
df.head()

Unnamed: 0_level_0,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,recession_actual,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,fed_funds_st_dev_rate,...,nat_rate_of_unemp_long_term_PCH,personal_consumption_expenditures,personal_consumption_expenditures_CCA,personal_consumption_expenditures_CCH,personal_consumption_expenditures_CH1,personal_consumption_expenditures_CHG,personal_consumption_expenditures_LOG,personal_consumption_expenditures_PC1,personal_consumption_expenditures_PCA,personal_consumption_expenditures_PCH
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1976Q3,57.3,1886.558,7.6,1557.0,-2.199151,0.0,7.6,5.283478,0.016956,0.100618,...,0.1,1158.806,10.2,2.6,111.614,29.266,7.1,10.7,10.8,2.6
1976Q4,58.133333,1934.273,10.5,1691.333333,-2.246705,0.0,7.333333,4.874239,-0.077456,0.211941,...,0.1,1192.408,11.4,2.9,116.185,33.602,7.1,10.8,12.1,2.9
1977Q1,59.2,1988.648,11.7,1844.333333,-1.877175,0.0,8.233333,4.660667,-0.043817,0.148254,...,0.1,1228.212,11.8,3.0,118.304,35.804,7.1,10.7,12.6,3.0
1977Q2,60.233333,2055.909,14.2,1918.666667,-0.776696,0.0,6.933333,5.157473,0.106595,0.332835,...,0.1,1255.98,8.9,2.2,126.44,27.768,7.1,11.2,9.4,2.3
1977Q3,61.066667,2118.473,12.7,2009.0,0.186001,0.0,6.8,5.816413,0.127764,0.344309,...,0.0,1286.905,9.7,2.4,128.099,30.925,7.2,11.1,10.2,2.5


### Shift data with sliding window technique

In [11]:
# Build forward-looking labels: each row receives the recession flag found
# N rows later (N = 1, 2, 4).
# NOTE(review): shift() moves by row position, so this assumes the surviving
# rows are consecutive quarters after the earlier dropna -- confirm.
horizon_columns = {
    'recession_1q_out': 1,
    'recession_2q_out': 2,
    'recession_4q_out': 4,
}
for column, quarters_ahead in horizon_columns.items():
    df[column] = df['recession_actual'].shift(-quarters_ahead)

In [12]:
# One modelling frame per forecast horizon (1, 2 and 4 quarters out).
# Each frame keeps exactly one label column and drops the other three.
label_columns = ['recession_actual', 'recession_1q_out', 'recession_2q_out', 'recession_4q_out']


def _frame_for(target):
    """Return df without any label column other than `target`."""
    return df.drop(columns=[label for label in label_columns if label != target])


df_q1 = _frame_for('recession_1q_out')
df_q2 = _frame_for('recession_2q_out')
df_q4 = _frame_for('recession_4q_out')

In [13]:
# Shifting leaves NaN labels in the trailing rows of each frame; remove them.
df_q1, df_q2, df_q4 = (frame.dropna() for frame in (df_q1, df_q2, df_q4))
df_q4.tail()

Unnamed: 0_level_0,avg_consumer_price_index,gdp,gdp_pct_change,avg_housing_starts,output_gap,avg_unemployment_rate,fed_funds_avg_rate,fed_funds_percent_change_prev_quarter,fed_funds_st_dev_rate,10YT_minus_2YT_avg,...,personal_consumption_expenditures,personal_consumption_expenditures_CCA,personal_consumption_expenditures_CCH,personal_consumption_expenditures_CH1,personal_consumption_expenditures_CHG,personal_consumption_expenditures_LOG,personal_consumption_expenditures_PC1,personal_consumption_expenditures_PCA,personal_consumption_expenditures_PCH,recession_4q_out
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017Q1,243.83,19190.431,4.2,1230.666667,-0.861917,4.866667,0.698889,0.55949,0.09883,1.203333,...,13104.419,4.4,1.1,580.895,144.652,9.5,4.6,4.5,1.1,0.0
2017Q2,244.065,19356.649,3.5,1169.333333,-0.752038,4.233333,0.947363,0.355527,0.098588,0.97,...,13212.501,3.3,0.8,524.24,108.082,9.5,4.1,3.3,0.8,0.0
2017Q3,245.368333,19611.704,5.4,1175.333333,-0.396155,4.4,1.153696,0.217797,0.024029,0.88,...,13345.053,4.0,1.0,522.673,132.552,9.5,4.1,4.1,1.0,0.0
2017Q4,247.273333,19918.91,6.4,1259.666667,0.033653,3.9,1.202778,0.042543,0.09912,0.673333,...,13586.267,7.2,1.8,626.5,241.214,9.5,4.8,7.4,1.8,0.0
2018Q1,249.250333,20163.159,5.0,1320.666667,0.202456,4.333333,1.448966,0.204683,0.083902,0.596667,...,13728.357,4.2,1.0,623.938,142.09,9.5,4.8,4.2,1.0,0.0


In [14]:
# Label series for each horizon (y3 is the 4-quarter-out label)
y1, y2, y3 = (
    df_q1['recession_1q_out'],
    df_q2['recession_2q_out'],
    df_q4['recession_4q_out'],
)

In [15]:
# Strip the label column from each frame so that only features remain
df_q1 = df_q1.drop('recession_1q_out', axis=1)
df_q2 = df_q2.drop('recession_2q_out', axis=1)
df_q4 = df_q4.drop('recession_4q_out', axis=1)

In [16]:
# Feature matrices: plain aliases of the label-free frames
X_q1, X_q2, X_q4 = df_q1, df_q2, df_q4

In [17]:
# Column labels of each feature matrix, used later to label feature importances
q1_feature_names, q2_feature_names, q4_feature_names = (
    features.columns for features in (X_q1, X_q2, X_q4)
)

### Split and scale data

In [34]:
# Stratified, shuffled 80/20 train/test split for each horizon.
# NOTE(review): rows are quarterly observations; a shuffled split lets training
# rows come from quarters after the test rows, so the higher scores relative to
# the unshuffled variant may partly reflect look-ahead leakage -- confirm.
split_options = dict(train_size=0.8, random_state=42)
X1_train, X1_test, y1_train, y1_test = train_test_split(X_q1, y1, stratify=y1, **split_options)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_q2, y2, stratify=y2, **split_options)
X3_train, X3_test, y3_train, y3_test = train_test_split(X_q4, y3, stratify=y3, **split_options)

In [19]:
# Disabled alternative to the split cell: an ordered (unshuffled) 80/20 split,
# tried to see whether the model performs better without shuffling.
# With shuffle=False the split is not randomised, so random_state has no effect here.
# NOTE(review): the "shuffle=False" sections further down presumably show
# results obtained with this variant enabled -- confirm.
# X1_train, X1_test, y1_train, y1_test=train_test_split(X_q1,y1, train_size=0.8, random_state=42, shuffle=False)
# X2_train, X2_test, y2_train, y2_test=train_test_split(X_q2,y2, train_size=0.8, random_state=42, shuffle=False)
# X3_train, X3_test, y3_train, y3_test=train_test_split(X_q4,y3, train_size=0.8, random_state=42, shuffle=False)

In [35]:
# One StandardScaler per horizon, fitted on the training features only
X1_scaler = StandardScaler()
X2_scaler = StandardScaler()
X3_scaler = StandardScaler()

X1_scaler.fit(X1_train)
X2_scaler.fit(X2_train)
X3_scaler.fit(X3_train)

In [36]:
# Apply each horizon's fitted scaler to its own train and test features
X1_train_scaled, X1_test_scaled = (X1_scaler.transform(part) for part in (X1_train, X1_test))
X2_train_scaled, X2_test_scaled = (X2_scaler.transform(part) for part in (X2_train, X2_test))
X3_train_scaled, X3_test_scaled = (X3_scaler.transform(part) for part in (X3_train, X3_test))

# Random Forest

### 1 Quarter Out: shuffle=False

In [22]:
# Disabled cell, kept for reference: fit and score the 1-quarter-out forest.
# NOTE(review): the score shown below is presumably the saved output from a run
# on the unshuffled split (per the section heading) -- confirm.
# rf1 = RandomForestClassifier(n_estimators=200)
# rf1 = rf1.fit(X1_train_scaled, y1_train)
# rf1.score(X1_test_scaled, y1_test)

0.5294117647058824

In [23]:
# Disabled cell: (importance, feature name) pairs for rf1, largest importance first.
# sorted(zip(rf1.feature_importances_, q1_feature_names), reverse=True)

[(0.047493061322909265, 'nat_rate_of_unemp_long_term'),
 (0.04412525495964016, 'tot_public_debt_as_pct_of_gdp'),
 (0.039336079590773676, 'cpi_US_total'),
 (0.036790669182571976, '10YT_minus_2YT_avg'),
 (0.03409700088705132, 'nat_rate_of_unemp_long_term_CH1'),
 (0.03248342976321602, 'DPCERO1Q156NBEA'),
 (0.02852987894603907, 'personal_consumption_exp_excl_food_energy_LOG'),
 (0.022144699900702763, 'nat_rate_of_unemp_long_term_CCA'),
 (0.02185879739235256, 'personal_consumption_exp_excl_food_energy'),
 (0.02142251269111409, 'nat_rate_of_unemp_long_term_CHG'),
 (0.02004240485387914, 'tot_public_debt_as_pct_of_gdp_LOG'),
 (0.02002105883924761, 'gross_private_domestic_invest_CHG'),
 (0.019765752282101087, 'gdp'),
 (0.01777912182323513, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.017365462284890307, 'avg_unemployment_rate'),
 (0.016912106434267587, 'fed_funds_avg_rate'),
 (0.016813571713994738, 'median_sls_price_houses_sold_US_CH1'),
 (0.01666790332439547, 'nat_rate_of_unemp_long_term

### 2 Quarters Out: shuffle=False

In [24]:
# Disabled cell, kept for reference: fit and score the 2-quarters-out forest.
# NOTE: as written, the second line calls rf1.fit(...), which refits rf1 and
# rebinds rf2 to that same object; the fresh forest from the first line is
# discarded. If this cell is re-enabled, call rf2.fit(...) instead.
# rf2 = RandomForestClassifier(n_estimators=200)
# rf2 = rf1.fit(X2_train_scaled, y2_train)
# rf2.score(X2_test_scaled, y2_test)

0.5

In [25]:
# Disabled cell: (importance, feature name) pairs for rf2, largest importance first.
# sorted(zip(rf2.feature_importances_, q2_feature_names), reverse=True)

[(0.08354156183512848, '10YT_minus_2YT_avg'),
 (0.04583271595355061, 'tot_public_debt_as_pct_of_gdp'),
 (0.042220288575313535, 'nat_rate_of_unemp_long_term'),
 (0.038345223873815594, 'cpi_US_total'),
 (0.03520458674399937, 'personal_consumption_exp_excl_food_energy'),
 (0.029018485772723927, 'nat_rate_of_unemp_long_term_CH1'),
 (0.027714872678605843, 'gdp'),
 (0.027649754059931284, 'tot_public_debt_as_pct_of_gdp_PC1'),
 (0.027079704580195443, 'fed_funds_avg_rate'),
 (0.02425575854103765, 'personal_consumption_expenditures'),
 (0.023760161198057787, 'DPCERO1Q156NBEA'),
 (0.02110879863818906, 'nat_rate_of_unemp_long_term_CHG'),
 (0.01991320473314035, 'nat_rate_of_unemp_long_term_CCA'),
 (0.018604592701587368, 'tot_public_debt_as_pct_of_gdp_LOG'),
 (0.0183537051860914, 'nat_rate_of_unemp_long_term_PCA'),
 (0.01710342638448944, 'output_gap'),
 (0.014798057211256236, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.01317131340353141, 'tot_public_debt_as_pct_of_gdp_CH1'),
 (0.0131014435482

### 4 Quarters Out: shuffle=False

In [26]:
# Disabled cell, kept for reference: fit and score the 4-quarters-out forest.
# NOTE: as written, the second line calls rf1.fit(...), which refits rf1 and
# rebinds rf3 to that same object; the fresh forest from the first line is
# discarded. If this cell is re-enabled, call rf3.fit(...) instead.
# rf3 = RandomForestClassifier(n_estimators=200)
# rf3 = rf1.fit(X3_train_scaled, y3_train)
# rf3.score(X3_test_scaled, y3_test)

0.47058823529411764

In [27]:
# Disabled cell: (importance, feature name) pairs for the 4-quarters-out model.
# NOTE: this reads rf2, not rf3, while pairing with q4_feature_names; use rf3
# if this cell is re-enabled.
# sorted(zip(rf2.feature_importances_, q4_feature_names), reverse=True)

[(0.07349781232869303, '10YT_minus_2YT_avg'),
 (0.04088020280330462, 'output_gap'),
 (0.02969870682898103, 'nat_rate_of_unemp_long_term'),
 (0.02843774497126978, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.025110652312428042, 'gross_private_domestic_invest'),
 (0.024788436235196176, 'personal_consumption_exp_excl_food_energy'),
 (0.023951532237162943, 'nat_rate_of_unemp_long_term_PCA'),
 (0.023255669496648524, 'personal_consumption_expenditures_PC1'),
 (0.023100809346842846, 'gdp'),
 (0.02258052480992896, 'nat_rate_of_unemp_long_term_CCA'),
 (0.022043780660909652, 'cpi_US_total'),
 (0.021426649983698323, 'avg_consumer_price_index'),
 (0.020942773039052277, 'fed_funds_avg_rate'),
 (0.01937751399135811, 'nat_rate_of_unemp_long_term_CHG'),
 (0.018747536391155843, 'tot_public_debt_as_pct_of_gdp'),
 (0.01852341269746169, 'personal_consumption_expenditures'),
 (0.018411166390493915, 'personal_consumption_exp_excl_food_energy_CH1'),
 (0.017572263164264534, 'median_sls_price_houses_sold

### 1 Quarter Out: shuffle=True

In [37]:
# Train the 1-quarter-out random forest on the scaled training data and report
# its mean accuracy on the held-out test rows.
# random_state is fixed (same seed as the train/test split) so that the score
# and the feature importances are reproducible across runs; without it every
# re-run of the notebook yields different numbers.
rf1 = RandomForestClassifier(n_estimators=200, random_state=42)
rf1 = rf1.fit(X1_train_scaled, y1_train)
rf1.score(X1_test_scaled, y1_test)

0.9117647058823529

In [38]:
# Pair every importance with its feature name and list them largest first
q1_importances = list(zip(rf1.feature_importances_, q1_feature_names))
q1_importances.sort(reverse=True)
q1_importances

[(0.05346055234898867, '10YT_minus_2YT_avg'),
 (0.03586984720610246, 'cpi_US_total_CHG'),
 (0.035659546884358025, 'tot_public_debt_as_pct_of_gdp'),
 (0.035222837581235684, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.03305344802602674, 'cpi_US_total'),
 (0.02857346777367635, 'output_gap'),
 (0.025240477075546126, 'DPCERO1Q156NBEA_CH1'),
 (0.022314688580778216, 'DPCERO1Q156NBEA'),
 (0.02197310599452968, 'fed_funds_avg_rate'),
 (0.02144334685083388, 'gross_private_domestic_invest_CH1'),
 (0.020337566907028003, 'gross_private_domestic_invest_CHG'),
 (0.020196413877203178, 'personal_consumption_exp_excl_food_energy'),
 (0.0183372090444759, 'median_sls_price_houses_sold_US_PC1'),
 (0.018004894748309897, 'tot_public_debt_as_pct_of_gdp_PC1'),
 (0.0173222606519042, 'personal_consumption_exp_excl_food_energy_LOG'),
 (0.016991176209342585, 'gross_private_domestic_invest_PCH'),
 (0.01694924679557412, 'tot_public_debt_as_pct_of_gdp_PCA'),
 (0.0166983566809823, 'fed_funds_percent_change_prev_

### 2 Quarters Out: shuffle=True

In [39]:
rf2 = RandomForestClassifier(n_estimators=200)
rf2 = rf1.fit(X2_train_scaled, y2_train)
rf2.score(X2_test_scaled, y2_test)

0.9411764705882353

In [40]:
sorted(zip(rf2.feature_importances_, q2_feature_names), reverse=True)

[(0.06287033075075966, '10YT_minus_2YT_avg'),
 (0.051416340224532925, 'cpi_US_total'),
 (0.03431309668344302, 'output_gap'),
 (0.03350670003424272, 'tot_public_debt_as_pct_of_gdp'),
 (0.033370881095522054, 'fed_funds_avg_rate'),
 (0.03313372834263285, 'tot_public_debt_as_pct_of_gdp_PC1'),
 (0.030595441551608427, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.02691392928970297, 'gross_private_domestic_invest_CCA'),
 (0.023898870762700086, 'M2_velocity'),
 (0.022488484189796123, 'median_sls_price_houses_sold_US_PC1'),
 (0.021908793337841577, 'cpi_US_total_CHG'),
 (0.02147323171311159, 'personal_consumption_exp_excl_food_energy'),
 (0.02098754324820371, 'gross_private_domestic_invest_CHG'),
 (0.019866992196690945, 'tot_public_debt_as_pct_of_gdp_CH1'),
 (0.019255924142568152, 'gross_private_domestic_invest_PCH'),
 (0.019164467301732924, 'tot_public_debt_as_pct_of_gdp_LOG'),
 (0.018816451776829658, 'cpi_US_total_CH1'),
 (0.01656284329303007, 'avg_unemployment_rate'),
 (0.015952424608188

### 4 Quarters Out: shuffle=True

In [41]:
rf3 = RandomForestClassifier(n_estimators=200)
rf3 = rf1.fit(X3_train_scaled, y3_train)
rf3.score(X3_test_scaled, y3_test)

0.9117647058823529

In [42]:
sorted(zip(rf2.feature_importances_, q4_feature_names), reverse=True)

[(0.09464033189240627, '10YT_minus_2YT_avg'),
 (0.050733382843181456, '10YT_minus_2YT_percent_change_prev_quarter'),
 (0.04442454261013532, 'output_gap'),
 (0.03022349691760189, 'cpi_US_total'),
 (0.029603639660847083, 'fed_funds_avg_rate'),
 (0.02913502683825881, 'personal_consumption_expenditures_PC1'),
 (0.027306166631957132, 'personal_consumption_exp_excl_food_energy'),
 (0.02566034349694061, 'avg_unemployment_rate'),
 (0.020636775999708635, 'tot_public_debt_as_pct_of_gdp'),
 (0.017598364357556802, 'gross_private_domestic_invest_CCA'),
 (0.01648782553658777, 'median_sls_price_houses_sold_US_CH1'),
 (0.016351234889508688, 'cpi_US_total_CHG'),
 (0.01617011314366527, 'gross_private_domestic_invest_PCA'),
 (0.0150859782429291, 'nat_rate_of_unemp_long_term_PCA'),
 (0.014821551876593567, 'DPCERO1Q156NBEA'),
 (0.014567227807467555, 'nat_rate_of_unemp_long_term'),
 (0.01456501831179204, 'gross_private_domestic_invest_CCH'),
 (0.014455888790167385, 'tot_public_debt_as_pct_of_gdp_CH1'),
 (0.