In [1]:
# install missing packages
# %pip (rather than !pip) installs into the environment of the running kernel.
# py-cpuinfo is the distribution that provides the `cpuinfo` module imported
# in the system-details cell.
# NOTE(review): versions are unpinned, and plotly / pandas-datareader are not
# imported in any visible cell - presumably gbm_regressor needs them; confirm.
%pip install plotly
%pip install py-cpuinfo
%pip install pandas-datareader

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# check system details
# Print the RAM, CPU and disk resources of the machine running this notebook.
import os
import psutil
import cpuinfo

# Bytes in one GB as used here: the same scale as dividing by 1024 three times.
BYTES_PER_GB = 1024 ** 3

ram_info = psutil.virtual_memory()
print(f"Total RAM: {ram_info.total / BYTES_PER_GB:.2f} GB")
print(f"Available RAM: {ram_info.available / BYTES_PER_GB:.2f} GB")
print(f"Used RAM: {ram_info.used / BYTES_PER_GB:.2f} GB")
print(f"Percentage Usage Of RAM: {ram_info.percent}%")
print(f"CPU Cores: {os.cpu_count()}")

# py-cpuinfo omits fields it cannot determine on the current platform, so the
# clock speed is looked up defensively instead of risking a KeyError that
# would abort the rest of the report.
cpu_speed = cpuinfo.get_cpu_info().get("hz_actual_friendly", "unknown")
print(f"CPU Speed: {cpu_speed}")

# Disk usage is measured for the filesystem holding the working directory.
disk_info = psutil.disk_usage(os.getcwd())
print(f"Total Disk: {disk_info.total / BYTES_PER_GB:.2f} GB")
print(f"Available Disk: {disk_info.free / BYTES_PER_GB:.2f} GB")
print(f"Used Disk: {disk_info.used / BYTES_PER_GB:.2f} GB")
print(f"Percentage Usage Of Disk: {disk_info.percent}%")

Total RAM: 15.47 GB
Available RAM: 14.44 GB
Used RAM: 0.76 GB
Percentage Usage Of RAM: 6.6%
CPU Cores: 4
CPU Speed: 2.5000 GHz
Total Disk: 24.99 GB
Available Disk: 16.65 GB
Used Disk: 8.34 GB
Percentage Usage Of Disk: 33.4%


In [3]:
# import requirements
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from gbm_regressor import Regression

In [4]:
# get the data
# Read the raw records, order them by date (ties broken by the area,
# seasonal-adjustment and status columns), then keep only the leading quarter.
# NOTE(review): if "Date" is read as text, this ordering is lexical rather
# than chronological - confirm the date format used in the CSV.
date_first_order = [
    "Date",
    "Area Type ",
    "Area Name ",
    "Seasonally Adjusted (Y/N) ",
    "Status (Preliminary / Final) ",
]
unemployment = pd.read_csv("unemployment.csv")
unemployment = unemployment.sort_values(by=date_first_order)
unemployment = unemployment.reset_index(drop=True)

# only use 25% of the data
rows_to_keep = int(0.25 * unemployment.shape[0])
unemployment = unemployment.head(rows_to_keep)

In [5]:
# re-sort the data
# Order by the four key columns first and by date last, so that the rows of
# each key combination sit together in date order.
series_then_date = [
    "Area Type ",
    "Area Name ",
    "Seasonally Adjusted (Y/N) ",
    "Status (Preliminary / Final) ",
    "Date",
]
unemployment = unemployment.sort_values(by=series_then_date)
unemployment = unemployment.reset_index(drop=True)

In [6]:
# lag the output variable
# Within each group (one combination of the four key columns), add the
# unemployment rate from 1, 2, 3 and 4 rows earlier as new columns. Rows that
# have no such earlier row inside their group receive NaN.
series_keys = [
    "Area Type ",
    "Area Name ",
    "Seasonally Adjusted (Y/N) ",
    "Status (Preliminary / Final) ",
]
for lag in range(1, 5):
    # the grouping is rebuilt on every pass, exactly as four separate statements would do
    lagged_rate = unemployment.groupby(series_keys)["Unemployment Rate "].shift(lag)
    unemployment[f"Unemployment Rate(t-{lag})"] = lagged_rate

In [7]:
# remove rows with missing values
# dropna() without arguments discards a row if any of its columns is missing;
# that includes the rows whose lag columns were filled with NaN by shift().
complete_rows = unemployment.dropna()
unemployment = complete_rows.reset_index(drop=True)

In [8]:
# re-sort the data
# Go back to date-first ordering, so that the head of the frame holds the
# earliest dates and the tail the latest ones.
date_then_series = [
    "Date",
    "Area Type ",
    "Area Name ",
    "Seasonally Adjusted (Y/N) ",
    "Status (Preliminary / Final) ",
]
unemployment = unemployment.sort_values(by=date_then_series)
unemployment = unemployment.reset_index(drop=True)

In [9]:
# split up the data into training and testing
# The frame is ordered by date at this point, so the leading 80% of the rows
# become the training set and all remaining rows become the test set.
TRAIN_FRACTION = 0.8

y = unemployment[["Unemployment Rate "]]
# NOTE(review): "Employment " / "Unemployment " are presumably dropped because
# they would leak the target, and "Year " / "Month" because "Date" already
# carries them - confirm.
X = unemployment.drop(columns=["Unemployment Rate ", "Employment ", "Unemployment ", "Year ", "Month"])

# One cut index puts every row in exactly one of the two sets. Truncating
# 0.8 * n and 0.2 * n independently could leave a row out of both sets
# whenever n is not a multiple of 5.
split_at = int(TRAIN_FRACTION * X.shape[0])
trainX = X.iloc[:split_at]
trainy = y.iloc[:split_at]
testX = X.iloc[split_at:].reset_index(drop=True)
testy = y.iloc[split_at:].reset_index(drop=True)

In [10]:
# build the model
# Configure the regression pipeline, then reuse a previously saved pipeline
# when one can be loaded and used; otherwise fit a new one on the training set.
print("\n---- Unemployment Regression Analysis ----\n")
model = Regression(
    name="XGBoost With Feature Engineering", 
    frac=0.33,  # only use 33% of the data for preprocessing
    rename=True, 
    time=True, 
    binary=True, 
    imputation=False, 
    variance=True,
    scale=True,
    atwood=True,
    binning=True,
    reciprocal=True, 
    interaction=True, 
    selection=True,
)
try:
    model.load()  # load the machine learning pipeline
    predictions = model.predict(testX)
except Exception as load_error:
    # `except Exception` rather than a bare `except`: an interrupt or kernel
    # shutdown must propagate instead of silently starting a full training run.
    # The reason is printed so that a broken saved pipeline does not go unnoticed.
    print(f"> Saved pipeline unavailable ({load_error!r}), fitting a new one")
    model.fit(trainX, trainy)  # build the machine learning pipeline
    predictions = model.predict(testX)


---- Unemployment Regression Analysis ----

1/6) Model Training
> Renaming Features
> Extracting Time Features
> Transforming Categorical Features
> Renaming Features
> Removing Constant Features
> Scaling Features
> Selecting Features
> Computing Atwoood Numbers
> Binning Features
> Computing Reciprocals
> Computing Interactions
> Removing Constant Features
> Selecting Features
> Transforming The Training Data
> Training XGBoost
1.38 Minutes
2/6) Model Performance
> Transforming The Testing Data
7.74 Seconds
3/6) Model Deployment
> Transforming All The Data
> Training XGBoost
42.7 Seconds
4/6) Model Indicators
0.12 Seconds
5/6) Model Prediction
> Transforming The New Data
3.65 Seconds
6/6) Model Monitoring
5.24 Seconds


In [11]:
# model diagnostics
# Print the scores stored on the model, the first ten indicators, and the
# first ten features whose drift p-value is below 0.05 ("None" if there are none).
print("Model Diagnostics:")
for label, attribute in (("RMSE", "rmse"), ("R2", "r2"), ("In Control", "in_control")):
    # each attribute is read right before it is printed, one at a time
    print(f"> {label}: {getattr(model, attribute)}")

print("> Model Indicators:")
top_indicators = model.indicators["Indicator"][:10]
for rank, indicator in enumerate(top_indicators, start=1):
    print(f"{rank}. {indicator}")

print("> Feature Drift:")
drifted = model.drift.loc[model.drift["pvalue"] < 0.05]
for rank, feature in enumerate(drifted["Feature"][:10], start=1):
    print(f"{rank}. {feature}")
if len(drifted) == 0:
    print("None")

Model Diagnostics:
> RMSE: 0.03976699037633044
> R2: 0.7156108681268599
> In Control: 92.12%
> Model Indicators:
1. Unemployment_Rate(t-1)
2. Unemployment_Rate(t-1)*Date_federal_funds_rate(0.2-0.4)
3. Unemployment_Rate(t-2)(0.2-0.4)*1/Unemployment_Rate(t-1)
4. Date_unemployment(0.2-0.4)*1/Unemployment_Rate(t-1)
5. Date_unemployment(0.2-0.4)*1/Unemployment_Rate(t-4)
6. Unemployment_Rate(t-1)(0.6-0.8)*1/Labor_Force
7. Unemployment_Rate(t-1)(0.6-0.8)*1/Unemployment_Rate(t-4)
8. Date_gdp(0.6-0.8)*1/Unemployment_Rate(t-1)
9. Unemployment_Rate(t-1)(0.6-0.8)*1/Date_federal_funds_rate
10. Date_gdp*1/Date_nasdaq
> Feature Drift:
1. Unemployment_Rate(t-1)*Unemployment_Rate(t-4)
2. Labor_Force*Date_day_of_week_Thursday
3. Date_week_1*Date_gdp(0.4-0.6)
4. Date_day_of_week_Friday*1/Labor_Force
5. Date_day_of_week_Friday*1/Unemployment_Rate(t-4)
6. Unemployment_Rate(t-3)*Unemployment_Rate(t-3)(0.2-0.4)
7. Unemployment_Rate(t-3)*Unemployment_Rate(t-1)(0.2-0.4)
8. Unemployment_Rate(t-2)*Date_year_2015

In [12]:
# score the model
# Compare the predictions for the test rows with the true values of the
# single target column.
actual = testy.iloc[:, 0].to_numpy()

# RMSE is taken as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it makes this cell fail on current releases.
rmse = mean_squared_error(y_true=actual, y_pred=predictions) ** 0.5
r2 = r2_score(y_true=actual, y_pred=predictions)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

RMSE: 0.02838886218775213
R2: 0.8476271699355646


In [13]:
# save the machine learning pipeline
# dump() runs before the refit in the following cell, so what is saved here is
# the pipeline in the state that produced the test predictions.
# NOTE(review): presumably this is the artifact model.load() picks up on the
# next run - confirm in gbm_regressor.
model.dump()

In [14]:
# refit the model to include the test data
# NOTE(review): this runs after model.dump(), so the refit model is presumably
# not the one that was saved - confirm whether a second dump() is intended.
model.refit(testX, testy)

Model Retraining:
> Transforming The New Data
> Training XGBoost
39.49 Seconds
