# Baseline Model


## Imports and Constants

In [83]:
seed = 42
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

In [94]:
# step 1: Read the csv
beige_df = pd.read_csv("beige_book_1996_2025.csv")
labels_df = pd.read_csv("filtered_labels.csv")
labels_df['timestamp'] = labels_df['observation_date']

beige_df = pd.merge(beige_df, labels_df, on='timestamp', how='inner')
beige_df = beige_df.drop('Unnamed: 0', axis=1)
beige_df = beige_df.drop('observation_date', axis=1)
beige_df = beige_df.drop(index=0) # first row is na

print(len(beige_df))
beige_df.head()

230


Unnamed: 0,year,month,url,text,timestamp,rate,decision
1,1996,12,https://www.federalreserve.gov/fomc/beigebook/...,moderate economic growth continues to be repor...,1996-12-01,5.25,hold
2,1997,1,https://www.federalreserve.gov/fomc/beigebook/...,most district reports characterized early autu...,1997-01-01,5.25,hold
3,1997,3,https://www.federalreserve.gov/fomc/beigebook/...,district economies generally continue to expan...,1997-03-01,5.25,hold
4,1997,5,https://www.federalreserve.gov/fomc/beigebook/...,district economies generally continued to expa...,1997-05-01,5.5,raise
5,1997,6,https://www.federalreserve.gov/fomc/beigebook/...,all twelve district economies expanded in may ...,1997-06-01,5.5,hold


In [95]:
# step 2: One-hot encode the labels (so numeric labels of 0, 1, 2)
label_encoder = LabelEncoder()
beige_df['labels'] = label_encoder.fit_transform(beige_df['decision'])

In [96]:
# step 3: Generate TF-IDF encodings for the text
tf_idf_vectorizer = TfidfVectorizer()
X = tf_idf_vectorizer.fit_transform(beige_df['text'])
y = beige_df['labels']
print(X.shape, len(y))

(230, 9923) 230


In [97]:
# step 4: Train-test split (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed, test_size=0.2)

In [98]:
# step 5: Run logistic regression on train set
model = LogisticRegression(random_state=seed)
model.fit(X_train, y_train)

In [99]:
# step 6: Run the model on the test set for prediction
X_pred = model.predict(X_train)
y_pred = model.predict(X_test)

In [100]:
# step 7: evaluation
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
print(precision, recall, accuracy)
print(confusion_matrix(y_test, y_pred))

0.26811594202898553 0.3333333333333333 0.8043478260869565
[[37  0  0]
 [ 5  0  0]
 [ 4  0  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [101]:
beige_df['decision'].value_counts()

Unnamed: 0_level_0,count
decision,Unnamed: 1_level_1
hold,160
raise,43
lower,27


In [102]:
y_test.value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
0,37
1,5
2,4


In [103]:
y_train.value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
0,123
2,39
1,22
