### Loading data

In [21]:
import torch
import numpy as np
import torch.nn as nn
from sklearn.preprocessing import RobustScaler
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
import requests
import pandas as pd
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from utils import eo_grabber



In [2]:
url = "https://www.federalregister.gov/api/v1/documents.csv?fields[]=document_number&fields[]=executive_order_number&fields[]=president&fields[]=publication_date&fields[]=signing_date&fields[]=title&per_page=1000&order=newest&conditions[publication_date][gte]=2000&conditions[presidential_document_type][]=executive_order"

eo_grabber(url)

Data successfully written to federal_register_data.csv


In [2]:
df = pd.read_csv('federal_register_data.csv')
df.shape

(1000, 6)

### EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   document_number         1000 non-null   object 
 1   executive_order_number  983 non-null    float64
 2   president               1000 non-null   object 
 3   publication_date        1000 non-null   object 
 4   signing_date            983 non-null    object 
 5   title                   1000 non-null   object 
dtypes: float64(1), object(5)
memory usage: 47.0+ KB


In [4]:
df.document_number.duplicated().any()

np.False_

### Data pre-processing

In [5]:

df['signing_date'] = pd.to_datetime(df['signing_date'], errors='coerce')
df['publication_date'] = pd.to_datetime(df['signing_date'], errors='coerce')

valid_diff = (df['publication_date'] - df['signing_date']).dt.days

median_diff = valid_diff.dropna().median()

df['signing_date'] = df['signing_date'].fillna(df['publication_date'] - pd.to_timedelta(median_diff, unit='D'))


In [6]:
#drop rows with NaN in 'executive_order_number'
df= df.dropna(subset=['executive_order_number'])

In [7]:

df['signing_year'] = df['signing_date'].dt.year
df['political_party'] = df['president'].apply(lambda x: 'Democrat' if 'Obama' in x or 'Biden' in x else ('Republican' if 'Trump' in x or 'Bush' in x else 'Other'))

In [8]:
# Define presidential terms with term number
presidents_terms = [
    # Format: (name, term_number, start_date, end_date)
    ("Bill Clinton", 1, "1993-01-20", "1997-01-19"),
    ("Bill Clinton", 2, "1997-01-20", "2001-01-19"),
    ("George W. Bush", 1, "2001-01-20", "2005-01-19"),
    ("George W. Bush", 2, "2005-01-20", "2009-01-19"),
    ("Barack Obama", 1, "2009-01-20", "2013-01-19"),
    ("Barack Obama", 2, "2013-01-20", "2017-01-19"),
    ("Donald J. Trump", 1, "2017-01-20", "2021-01-19"),
    ("Joseph R. Biden Jr.", 1, "2021-01-20", "2025-01-19"),
    ("Donald J. Trump", 2, "2025-01-20", "2029-01-19"),  # Hypothetical re-election
]

# Convert dates to datetime objects
presidents_terms = [(name, term, pd.to_datetime(start), pd.to_datetime(end)) 
                   for name, term, start, end in presidents_terms]

# Function to determine president and term number based on publication date
def get_president_and_term(pub_date):
    if pd.isna(pub_date):
        return "Unknown President", "Unknown Term"
    
    for name, term, start, end in presidents_terms:
        if start <= pub_date <= end:
            term_text = "1st Term" if term == 1 else "2nd Term"
            return name, term_text, start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")
    
    return "Unknown President", "Unknown Term"

# Apply the function to get both president and term
df['president_info'] = df['signing_date'].apply(get_president_and_term)

# Split the tuple into separate columns for clarity
#df['president'] = df['president_info'].apply(lambda x: x[0])
df['serving_term'] = df['president_info'].apply(lambda x: x[1])
df['term_start'] = df['president_info'].apply(lambda x: x[2])
df['term_end'] = df['president_info'].apply(lambda x: x[3])

# Drop the intermediate column if desired
df = df.drop('president_info', axis=1)


In [9]:
df['eo_title_length'] = df['title'].apply(lambda x: len(x) if isinstance(x, str) else 0).astype(int)

In [10]:
df["term_start"] = pd.to_datetime(df["term_start"])
df["days_since_term_start"] = (df["publication_date"] - df["term_start"]).dt.days   



In [11]:
df['president'] = df.president.str.split(';', expand=True)[3]
df.head()

Unnamed: 0,document_number,executive_order_number,president,publication_date,signing_date,title,signing_year,political_party,serving_term,term_start,term_end,eo_title_length,days_since_term_start
0,2025-09802,14303.0,Donald Trump,2025-05-23,2025-05-23,Restoring Gold Standard Science,2025,Republican,2nd Term,2025-01-20,2029-01-19,31,123
1,2025-09801,14302.0,Donald Trump,2025-05-23,2025-05-23,Reinvigorating the Nuclear Industrial Base,2025,Republican,2nd Term,2025-01-20,2029-01-19,42,123
2,2025-09799,14301.0,Donald Trump,2025-05-23,2025-05-23,Reforming Nuclear Reactor Testing at the Depar...,2025,Republican,2nd Term,2025-01-20,2029-01-19,61,123
3,2025-09798,14300.0,Donald Trump,2025-05-23,2025-05-23,Ordering the Reform of the Nuclear Regulatory ...,2025,Republican,2nd Term,2025-01-20,2029-01-19,56,123
4,2025-09796,14299.0,Donald Trump,2025-05-23,2025-05-23,Deploying Advanced Nuclear Reactor Technologie...,2025,Republican,2nd Term,2025-01-20,2029-01-19,69,123


In [12]:
df.columns

Index(['document_number', 'executive_order_number', 'president',
       'publication_date', 'signing_date', 'title', 'signing_year',
       'political_party', 'serving_term', 'term_start', 'term_end',
       'eo_title_length', 'days_since_term_start'],
      dtype='object')

In [13]:

df['month_year'] = df['signing_date'].dt.to_period('M').astype(str)  # '2021-03'


eo_counts = df.groupby(['month_year', 'political_party', 'serving_term']).size().reset_index(name='eo_count')
eo_counts = pd.get_dummies(eo_counts, columns=['serving_term'], prefix='term')
eo_counts = pd.get_dummies(eo_counts, columns=['political_party'], prefix='term')


In [14]:
eo_counts['year'] = pd.to_datetime(eo_counts['month_year'], format='%Y-%m').dt.year
eo_counts['month'] = pd.to_datetime(eo_counts['month_year'], format='%Y-%m').dt.month


In [27]:
#eo_counts['EO_YTD'] = eo_counts.groupby('year')['eo_count'].cumsum()
eo_counts['EO_YTD_Per_Month'] = eo_counts['EO_YTD'] / eo_counts['month']



In [41]:
eo_counts['Prev_Year'] = eo_counts['year'] - 1
eo_counts['Prev_Month'] = eo_counts['month'] 

# Rename the EO count column for the join
prev_year_data = eo_counts[['year', 'month', 'eo_count']].copy()
prev_year_data.columns = ['Prev_Year', 'Prev_Month', 'EO_Prev_Year_Same_Month']

# Join on (Year-1, Month) to get same-month last year
eo_counts = eo_counts.merge(prev_year_data, on=['Prev_Year', 'Prev_Month'], how='left')

# Drop helper columns
eo_counts.drop(columns=['Prev_Year', 'Prev_Month'], inplace=True)

In [68]:
eo_counts['EO_Prev_Year_Same_Month'] = eo_counts['EO_Prev_Year_Same_Month'].fillna(0)


In [43]:
features = ['EO_Prev_Year_Same_Month','EO_YTD_Per_Month','year', 'month', 'term_1st Term', 'term_2nd Term', 'term_Democrat', 'term_Republican']
target = 'eo_count'

In [71]:

train_df = eo_counts[eo_counts['year'] <= 2015]
test_df = eo_counts[eo_counts['year'] > 2015]


In [72]:
X_train = train_df[features].values.astype(float)
y_train = train_df[target].values
X_test = test_df[features].values.astype(float)
y_test = test_df[target].values

In [73]:


scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [74]:


y_train_log = np.log1p(y_train)  # log(1 + y) for train target
y_test_log = np.log1p(y_test)    # same for test target


In [75]:


X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_log, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_log, dtype=torch.float32).view(-1, 1)


In [77]:
# Define the model
# model = nn.Sequential(
#     nn.Linear(6, 32),
#     nn.ReLU(),
#     nn.Linear(32, 16),
#     nn.ReLU(),
#     nn.Linear(16, 1)
# )
model = nn.Sequential(
    nn.Linear(8, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, 1)
)
# model = nn.Sequential(
#     nn.Linear(6, 64),
#     nn.BatchNorm1d(64),
#     nn.ReLU(),
#     nn.Dropout(0.2),

#     nn.Linear(64, 32),
#     nn.BatchNorm1d(32),
#     nn.ReLU(),
#     nn.Dropout(0.2),

#     nn.Linear(32, 16),
#     nn.BatchNorm1d(16),
#     nn.ReLU(),
#     nn.Dropout(0.1),

#     nn.Linear(16, 8),
#     nn.ReLU(),

#     nn.Linear(8, 1)
# )

# Loss and optimizer
loss_fn = torch.nn.MSELoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)


# Prepare mini-batch DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop
num_epochs = 10000

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader:
        # Forward pass
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Print loss every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}: Loss = {epoch_loss:.4f}")





Epoch 0: Loss = 4.7656
Epoch 100: Loss = 4.0277
Epoch 200: Loss = 3.2945
Epoch 300: Loss = 2.8143
Epoch 400: Loss = 2.2217
Epoch 500: Loss = 1.9126
Epoch 600: Loss = 1.5184
Epoch 700: Loss = 1.4685
Epoch 800: Loss = 1.2571
Epoch 900: Loss = 1.1340
Epoch 1000: Loss = 1.1708
Epoch 1100: Loss = 0.9116
Epoch 1200: Loss = 0.8907
Epoch 1300: Loss = 0.9086
Epoch 1400: Loss = 0.8008
Epoch 1500: Loss = 0.7849
Epoch 1600: Loss = 0.8159
Epoch 1700: Loss = 0.8051
Epoch 1800: Loss = 0.7705
Epoch 1900: Loss = 0.7610
Epoch 2000: Loss = 0.7288
Epoch 2100: Loss = 0.7275
Epoch 2200: Loss = 0.7216
Epoch 2300: Loss = 0.7367
Epoch 2400: Loss = 0.7103
Epoch 2500: Loss = 0.7209
Epoch 2600: Loss = 0.7633
Epoch 2700: Loss = 0.7438
Epoch 2800: Loss = 0.7133
Epoch 2900: Loss = 0.7848
Epoch 3000: Loss = 0.7045
Epoch 3100: Loss = 0.7013
Epoch 3200: Loss = 0.7504
Epoch 3300: Loss = 0.7835
Epoch 3400: Loss = 0.6864
Epoch 3500: Loss = 0.7111
Epoch 3600: Loss = 0.7408
Epoch 3700: Loss = 0.6851
Epoch 3800: Loss = 0.739

In [78]:
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    mse = loss_fn(predictions, y_test_tensor)
    print(f"Final test MSE: {mse.item():.4f}")
    rmse = torch.sqrt(torch.tensor(mse.item()))
    print(f"Test RMSE: {rmse.item():.2f}")


Final test MSE: 0.1828
Test RMSE: 0.43


In [79]:


# Set the model to evaluation mode
model.eval()

# Get predictions
with torch.no_grad():
    preds_log = model(X_test_tensor).squeeze()

# If you used log1p transformation, invert it with expm1
preds = torch.expm1(preds_log).numpy()  # convert log1p back to original scale
actuals = y_test  # y_test should already be in original scale (not log)

# Create the comparison table
results_df = pd.DataFrame({
    'Actual': actuals,
    'Predicted': preds
})

# Optional: Round to 2 decimal places
results_df = results_df.round(2)

# Show the first few rows
print(results_df.tail(20))


     Actual  Predicted
90        3   2.530000
91        2   2.540000
92        3   2.540000
93        1   2.530000
94        1   2.580000
95        3   2.560000
96        3   2.490000
97        4   2.580000
98        1   2.510000
99        1   2.480000
100       2   2.480000
101       2   2.480000
102       1   2.470000
103       5   2.500000
104      13   5.380000
105      46  81.989998
106      30  34.689999
107      33  27.520000
108      34  26.150000
109      14  19.830000


In [38]:
import torch
import numpy as np

model.eval()
with torch.no_grad():
    # Predict on test set (log scale)
    preds_log = model(X_test_tensor)

    # Inverse transform predictions and actual values
    preds_orig = torch.expm1(preds_log).squeeze().numpy()
    actuals_orig = torch.expm1(y_test_tensor).squeeze().numpy()

    # Calculate RMSE on original scale
    rmse_orig = np.sqrt(np.mean((preds_orig - actuals_orig) ** 2))
    print(f"Test RMSE (original scale): {rmse_orig:.4f}")


Test RMSE (original scale): 6.2041


In [39]:
print("Train target range:")
print(f"Min: {y_train.min()}, Max: {y_train.max()}")


print("Test target range:")
print(f"Min: {y_test.min()}, Max: {y_test.max()}")


Train target range:
Min: 1, Max: 9
Test target range:
Min: 1, Max: 46


In [40]:
eo_counts.dtypes

month_year           object
eo_count              int64
term_1st Term          bool
term_2nd Term          bool
term_Democrat          bool
term_Republican        bool
year                  int32
month                 int32
EO_YTD                int64
EO_YTD_Per_Month    float64
dtype: object