In [1]:
import pandas as pd

df = pd.read_csv('/Users/patwilliams/DSCI441/DSCI411-project/Data_PreProcessing/final_economic_data.csv')

In [2]:
df.head()

Unnamed: 0,Year,Annual_%_Change_nasdaq,Annual_%_Change_dowj,Annual_%_Change_sp500,Annual_%_Change_corn,Annual_%_Change_cotton,Annual_%_Change_pound,Annual_%_Change_yen,Annual_Unemployment_Rate,Annual_%_Change_silver,...,Annual_%_Change_FFRate,Annual_%_Change_tenyrRate,Annual_%_Change_oneyrRate,Annual_%_Change_gold,Annual_%_Change_crude,Annual_%_Change_copper,Annual_%_Change_coffee,Unemployment_Rate_Increased,Unemployment_Rate_Percent_Change,Unemployment_Rate_Change_Indicator
0,2024.0,9.05,5.55,9.89,-6.53,13.58,-0.51,6.85,3.8,-0.092688,...,-0.223298,0.262182,-0.145917,-0.144743,0.056138,-0.134274,-0.345731,1,4.827586,0
1,2023.0,43.42,13.7,24.23,-30.62,-2.41,5.22,7.56,3.625,-0.229543,...,-0.070694,-0.082794,-0.199847,0.188286,-0.582778,-0.166927,0.208869,0,-0.229358,0
2,2022.0,-33.1,-8.78,-19.44,14.37,-26.29,-10.52,13.91,3.633333,-0.124077,...,6.297961,3.488887,3.026747,-0.340666,-0.128344,-0.64613,-0.838549,0,-31.981279,1
3,2021.0,21.39,18.73,26.89,22.57,44.14,-1.23,11.49,5.341667,-0.569481,...,-0.370152,2.094849,3.026747,-0.471016,1.052936,0.566733,1.624098,0,-33.985582,1
4,2020.0,43.64,7.25,16.26,24.82,13.14,3.21,-5.0,8.091667,1.282133,...,-0.845809,-1.852644,-1.742681,0.639346,-0.807901,0.536434,-0.237071,1,120.181406,1


In [3]:
from sklearn.model_selection import train_test_split

# Assuming the DataFrame is already sorted by year
# Extract the row for the holdout set (the most recent year, 2024)
holdout = df[df['Year'] == 2024]

# Remove the holdout set from the main DataFrame
df_modeling = df[df['Year'] != 2024]

# IF YEAR SHOULD BE INCORPORATED

# Calculate 'Years Since Epoch' where Epoch is set to 1974
#df['Years_Since_Epoch'] = df['Year'] - 1974
# Display the updated DataFrame to verify the transformation
#df[['Year', 'Years_Since_Epoch']].head()


# Define the features and target
X = df_modeling.drop(['Annual_Unemployment_Rate', 'Year', 'Unemployment_Rate_Increased', 'Unemployment_Rate_Percent_Change', 'Unemployment_Rate_Change_Indicator'], axis=1) # should year be included?? Is it better to include as t starting with t = 0... t+1
y = df_modeling['Annual_Unemployment_Rate']

# Split the data into training and testing sets
# We want 40 observations in the training set, and 10 in the test set
# The dataset is already sorted by year, so we can split accordingly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, shuffle=False)

# Check the shapes to confirm the sizes
(X_train.shape, X_test.shape, y_train.shape, y_test.shape, holdout.shape)

((40, 16), (10, 16), (40,), (10,), (1, 21))

### AdaBoost

In [4]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostRegressor

reg_ada = AdaBoostRegressor(n_estimators=10, random_state=500)
reg_ada.fit(X_train, y_train)

# Calculate the predictions on the test set
pred = reg_ada.predict(X_test)

# Evaluate the performance using the RMSE
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE: {:.3f}'.format(rmse))

RMSE: 2.923


### GridSearchCV

In [5]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [5, 10, 50, 100, 150, 200],
    'learning_rate': [0.005, 0.01, 0.1, 0.5, 1.0]

}

# Initialize the AdaBoostRegressor
reg_ada = AdaBoostRegressor(random_state=500)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=reg_ada, param_grid=param_grid, cv=5, scoring='r2')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameter and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation R^2:", grid_search.best_score_)

Best parameters: {'learning_rate': 1.0, 'n_estimators': 5}
Best cross-validation R^2: -1.5222325768831293


In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_ada = grid_search.best_estimator_

# Make predictions using the best model on the test set
pred = best_ada.predict(X_test)

# Calculate various performance metrics
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

# Print the performance metrics
print("Best AdaBoost Model Performance Metrics:")
print("Mean Squared Error (MSE): {:.3f}".format(mse))
print("Mean Absolute Error (MAE): {:.3f}".format(mae))
print("Root Mean Squared Error (RMSE): {:.3f}".format(rmse))
print("R-squared (R2): {:.3f}".format(r2))

Best AdaBoost Model Performance Metrics:
Mean Squared Error (MSE): 7.767
Mean Absolute Error (MAE): 2.493
Root Mean Squared Error (RMSE): 2.787
R-squared (R2): -3.125


### GradientBoostRegressor

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Define GradientBoosting parameters to tune
param_grid = {
    'n_estimators': [5, 10, 50, 100, 200, 300],
    'learning_rate': [0.01, 0.02, 0.1, 0.2, 0.3, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]  # Percentage of samples used for fitting the individual base learners
}

# Initialize the GradientBoostingRegressor
reg_gb = GradientBoostingRegressor(random_state=500)

# Initialize GridSearchCV
grid_search_gb = GridSearchCV(estimator=reg_gb, param_grid=param_grid, cv=5, scoring='r2')

# Fit GridSearchCV to the training data
grid_search_gb.fit(X_train, y_train)

# Retrieve the best model from GridSearchCV
best_gb = grid_search_gb.best_estimator_

# Make predictions using the best model on the test set
pred_gb = best_gb.predict(X_test)

# Calculate various performance metrics
mse_gb = mean_squared_error(y_test, pred_gb)
mae_gb = mean_absolute_error(y_test, pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, pred_gb)

# Print the performance metrics
print("Gradient Boosting Model Performance Metrics:")
print("Mean Squared Error (MSE): {:.3f}".format(mse_gb))
print("Mean Absolute Error (MAE): {:.3f}".format(mae_gb))
print("Root Mean Squared Error (RMSE): {:.3f}".format(rmse_gb))
print("R-squared (R2): {:.3f}".format(r2_gb))

Gradient Boosting Model Performance Metrics:
Mean Squared Error (MSE): 4.540
Mean Absolute Error (MAE): 1.737
Root Mean Squared Error (RMSE): 2.131
R-squared (R2): -1.411


In [8]:
print("Best parameters:", grid_search_gb.best_params_)
print("Best cross-validation R^2:", grid_search_gb.best_score_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 5, 'subsample': 0.5}
Best cross-validation R^2: -1.1856265856398154


### XGBoostRegressor

In [9]:
from xgboost import XGBRegressor

# Define XGBRegressor parameters to tune
param_grid = {
    'n_estimators': [5, 10, 50, 100, 200, 300],
    'learning_rate': [0.01, 0.02, 0.1, 0.2, 0.3, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5,0.6,0.7, 0.8, 0.9]  # Percentage of features used per tree
}

# Initialize the XGBRegressor
reg_xgb = XGBRegressor(random_state=500)

# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(estimator=reg_xgb, param_grid=param_grid, cv=5, scoring='r2')

# Fit GridSearchCV to the training data
grid_search_xgb.fit(X_train, y_train)

# Retrieve the best model from GridSearchCV
best_xgb = grid_search_xgb.best_estimator_

# Make predictions using the best model on the test set
pred_xgb = best_xgb.predict(X_test)

# Calculate various performance metrics
mse_xgb = mean_squared_error(y_test, pred_xgb)
mae_xgb = mean_absolute_error(y_test, pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, pred_xgb)

# Print the performance metrics
print("XGBoost Model Performance Metrics:")
print("Mean Squared Error (MSE): {:.3f}".format(mse_xgb))
print("Mean Absolute Error (MAE): {:.3f}".format(mae_xgb))
print("Root Mean Squared Error (RMSE): {:.3f}".format(rmse_xgb))
print("R-squared (R2): {:.3f}".format(r2_xgb))

XGBoost Model Performance Metrics:
Mean Squared Error (MSE): 4.854
Mean Absolute Error (MAE): 1.806
Root Mean Squared Error (RMSE): 2.203
R-squared (R2): -1.578


In [10]:
print("Best parameters:", grid_search_xgb.best_params_)
print("Best cross-validation R^2:", grid_search_xgb.best_score_)

Best parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 5, 'subsample': 1.0}
Best cross-validation R^2: -1.1035481647946999


# Classification Models

In [20]:
import pandas as pd

df3 = pd.read_csv('/Users/patwilliams/DSCI441/DSCI411-project/Data_PreProcessing/final_economic_data.csv')

In [21]:
from sklearn.model_selection import train_test_split

holdout = df3[df3['Year'] == 2024]

# Remove the holdout set from the main DataFrame
df_modeling = df3[df3['Year'] != 2024]

# IF YEAR SHOULD BE INCORPORATED

# Calculate 'Years Since Epoch' where Epoch is set to 1974
#df['Years_Since_Epoch'] = df['Year'] - 1974
# Display the updated DataFrame to verify the transformation
#df[['Year', 'Years_Since_Epoch']].head()


# Define the features and target
X = df_modeling.drop(['Annual_Unemployment_Rate', 'Year', 'Unemployment_Rate_Increased', 'Unemployment_Rate_Percent_Change', 'Unemployment_Rate_Change_Indicator'], axis=1) # should year be included?? Is it better to include as t starting with t = 0... t+1
y = df_modeling['Unemployment_Rate_Increased']

# Split the data into training and testing sets
# We want 40 observations in the training set, and 10 in the test set
# The dataset is already sorted by year, so we can split accordingly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, shuffle=False)

# Check the shapes to confirm the sizes
(X_train.shape, X_test.shape, y_train.shape, y_test.shape, holdout.shape)

((40, 16), (10, 16), (40,), (10,), (1, 21))

#### If going to use holdout for prediction, must drop out the year, unemployment rate, and the binary created

### AdaBoostClassifier

In [22]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define AdaBoost parameters to tune
param_grid = {
    'n_estimators': [5, 10, 50, 100, 150, 200],
    'learning_rate': [0.005, 0.01, 0.1, 0.5, 1.0]
}

# Initialize the AdaBoostClassifier
clf_ada = AdaBoostClassifier(random_state=500)

# Initialize GridSearchCV with 'accuracy' as the scoring metric
grid_search_clf = GridSearchCV(estimator=clf_ada, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV to the training data
grid_search_clf.fit(X_train, y_train)

# Retrieve the best model
best_ada_clf = grid_search_clf.best_estimator_

# Make predictions using the best model on the test set
pred_clf = best_ada_clf.predict(X_test)

# Calculate various performance metrics
accuracy = accuracy_score(y_test, pred_clf)
precision = precision_score(y_test, pred_clf)
recall = recall_score(y_test, pred_clf)
f1 = f1_score(y_test, pred_clf)

# Print the performance metrics
print("Best Classifier Model Performance Metrics:")
print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 Score: {:.3f}".format(f1))

Best Classifier Model Performance Metrics:
Accuracy: 0.500
Precision: 0.000
Recall: 0.000
F1 Score: 0.000


In [23]:
print("Best parameters:", grid_search_clf.best_params_)
print("Best cross-validation accuracy:", grid_search_clf.best_score_)

Best parameters: {'learning_rate': 1.0, 'n_estimators': 10}
Best cross-validation accuracy: 0.75


### GradientBoostingClassifier

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

# Define GradientBoosting parameters to tune
param_grid = {
    'n_estimators': [5, 10, 50, 100, 200, 300],
    'learning_rate': [0.01, 0.02, 0.1, 0.2, 0.3, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]  # Percentage of samples used for fitting the individual base learners
}

# Initialize the GradientBoostingClassifier
clf_gb = GradientBoostingClassifier(random_state=500)

# Initialize GridSearchCV with 'accuracy' as the scoring metric
grid_search_clf_gb = GridSearchCV(estimator=clf_gb, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV to the training data
grid_search_clf_gb.fit(X_train, y_train)

# Retrieve the best model from GridSearchCV
best_gb_clf = grid_search_clf_gb.best_estimator_

# Make predictions using the best model on the test set
pred_gb_clf = best_gb_clf.predict(X_test)

# Calculate various performance metrics
accuracy_gb = accuracy_score(y_test, pred_gb_clf)
precision_gb = precision_score(y_test, pred_gb_clf)
recall_gb = recall_score(y_test, pred_gb_clf)
f1_gb = f1_score(y_test, pred_gb_clf)

# Print the performance metrics
print("Gradient Boosting Classifier Model Performance Metrics:")
print("Accuracy: {:.3f}".format(accuracy_gb))
print("Precision: {:.3f}".format(precision_gb))
print("Recall: {:.3f}".format(recall_gb))
print("F1 Score: {:.3f}".format(f1_gb))

Gradient Boosting Classifier Model Performance Metrics:
Accuracy: 0.700
Precision: 1.000
Recall: 0.250
F1 Score: 0.400


In [25]:
print("Best parameters:", grid_search_clf_gb.best_params_)
print("Best cross-validation accuracy:", grid_search_clf_gb.best_score_)

Best parameters: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 50, 'subsample': 0.6}
Best cross-validation accuracy: 0.825


In [26]:
from xgboost import XGBClassifier

# Define XGBClassifier parameters to tune
param_grid = {
    'n_estimators': [5, 10, 50, 100, 200, 300],
    'learning_rate': [0.01, 0.02, 0.1, 0.2, 0.3, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5,0.6,0.7, 0.8, 0.9]  # Percentage of features used per tree
}

# Initialize the XGBClassifier
clf_xgb = XGBClassifier(random_state=500, use_label_encoder=False, eval_metric='logloss')

# Initialize GridSearchCV with 'accuracy' as the scoring metric
grid_search_clf_xgb = GridSearchCV(estimator=clf_xgb, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV to the training data
grid_search_clf_xgb.fit(X_train, y_train)

# Retrieve the best model from GridSearchCV
best_xgb_clf = grid_search_clf_xgb.best_estimator_

# Make predictions using the best model on the test set
pred_xgb_clf = best_xgb_clf.predict(X_test)

# Calculate various performance metrics
accuracy_xgb = accuracy_score(y_test, pred_xgb_clf)
precision_xgb = precision_score(y_test, pred_xgb_clf)
recall_xgb = recall_score(y_test, pred_xgb_clf)
f1_xgb = f1_score(y_test, pred_xgb_clf)

# Print the performance metrics
print("XGBoost Classifier Model Performance Metrics:")
print("Accuracy: {:.3f}".format(accuracy_xgb))
print("Precision: {:.3f}".format(precision_xgb))
print("Recall: {:.3f}".format(recall_xgb))
print("F1 Score: {:.3f}".format(f1_xgb))

XGBoost Classifier Model Performance Metrics:
Accuracy: 0.700
Precision: 0.667
Recall: 0.500
F1 Score: 0.571


In [27]:
print("Best parameters:", grid_search_clf_xgb.best_params_)
print("Best cross-validation accuracy:", grid_search_clf_xgb.best_score_)

Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}
Best cross-validation accuracy: 0.825


# Classification for Variance

In [28]:
df4 = pd.read_csv('/Users/patwilliams/DSCI441/DSCI411-project/Data_PreProcessing/final_economic_data.csv')

holdout = df4[df4['Year'] == 2024]

# Remove the holdout set from the main DataFrame
df_modeling = df4[df4['Year'] != 2024]

# IF YEAR SHOULD BE INCORPORATED

# Calculate 'Years Since Epoch' where Epoch is set to 1974
#df['Years_Since_Epoch'] = df['Year'] - 1974
# Display the updated DataFrame to verify the transformation
#df[['Year', 'Years_Since_Epoch']].head()


# Define the features and target
X = df_modeling.drop(['Annual_Unemployment_Rate', 'Year', 'Unemployment_Rate_Increased', 'Unemployment_Rate_Percent_Change', 'Unemployment_Rate_Change_Indicator'], axis=1) # should year be included?? Is it better to include as t starting with t = 0... t+1
y = df_modeling['Unemployment_Rate_Change_Indicator']

# Split the data into training and testing sets
# We want 40 observations in the training set, and 10 in the test set
# The dataset is already sorted by year, so we can split accordingly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, shuffle=False)

# Check the shapes to confirm the sizes
(X_train.shape, X_test.shape, y_train.shape, y_test.shape, holdout.shape)

((40, 16), (10, 16), (40,), (10,), (1, 21))