3/8/25: Updated from the last stable/functional version ("dim1_BOW_train-test-combined"). Two major changes: (1) a different cross-validation method that does not require using **negative** mean absolute error, and (2) an updated and expanded "newsroom" test dataset, in the hope of resolving the negative R² issue.

7/15/25: This version implements HK's request for multi-output regression (targets: `nominate_dim1` and `nominate_dim2`).

In [None]:
##import and format data
import pandas as pd
import numpy as np

# Load the complete training dataset.
# Every column is read as text first; the modelling columns are then cast
# explicitly so the two NOMINATE targets become floats.
all114 = pd.read_csv(
    'training_data_114_final.csv',
    dtype='string',
)
all2 = all114.astype(
    {
        'speech': 'string',
        'nominate_dim1': 'float',
        'nominate_dim2': 'float',
    }
)
# Drop every row that has a missing value in any column.
final114 = all2.dropna()


In [None]:
#upload the custom stopword list
from congress_stopwords import congress


In [None]:
# Character count of every speech: confirms that no speech data were truncated
# during preprocessing and that the proper number of documents are present.
final114['speech'].str.len()

Unnamed: 0,speech
0,17784
1,25356
2,113985
3,109704
4,45340
...,...
433,90749
434,31889
435,15301
436,24527


In [None]:
###lemmatize prior to other vectorizer
import functools
import re

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# WordNetLemmatizer.lemmatize() expects ONE word. Handing it a whole speech
# makes it look the entire document up as a single token, which returns the
# text unchanged, so each speech has to be split into words first.
# The pattern below is TfidfVectorizer's default token_pattern, so the
# vectorizer later sees the same token boundaries it would find in raw text.
_TOKEN_RE = re.compile(r"(?u)\b\w\w+\b")

lemmatizer = WordNetLemmatizer()
# The same words recur across speeches; cache lookups so each distinct token
# is lemmatized only once. lemmatize() uses its default part of speech (noun).
_lemmatize_token = functools.lru_cache(maxsize=None)(lemmatizer.lemmatize)

# W keeps one lemmatized string per row of final114, in the same order.
# Lower-casing here matches what TfidfVectorizer does by default.
W = [
    " ".join(_lemmatize_token(token) for token in _TOKEN_RE.findall(speech.lower()))
    for speech in final114.speech
]



[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
####create the TF-IDF matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# sublinear_tf: scale term frequency as 1 + log(tf).
# max_df / min_df: ignore terms found in more than half of the documents or in
# fewer than five of them.
# stop_words: the custom stop words imported from congress_stopwords.
vectorizer = TfidfVectorizer(
    stop_words=congress,
    min_df=5,
    max_df=0.5,
    sublinear_tf=True,
)

# Fit on the lemmatized speeches in W (the raw-text alternative would be
# final114.speech).
# NOTE(review): the vectorizer is fitted on all documents before any
# cross-validation split, so vocabulary and IDF statistics include held-out
# rows -- consider fitting inside each fold.
train2 = vectorizer.fit_transform(W)

In [None]:
# Feature matrix: the TF-IDF matrix built from the lemmatized speeches.
X = train2

###construction of single target Y for both dimensions
# Both NOMINATE dimensions are stacked into one two-column array so that a
# multi-output regressor can be fitted on them (column order = target_columns).
target_columns = ["nominate_dim1", "nominate_dim2"]
Y = final114.loc[:, target_columns].to_numpy()

In [None]:
# Sanity check: expect one row per document and one column per entry in target_columns.
Y.shape

(438, 2)

In [None]:
from sklearn.model_selection import RepeatedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, max_error
from sklearn.multioutput import MultiOutputRegressor


In [None]:
###METRICS PER TARGET

# Candidate regressors, keyed by the name used when reporting results.
# LinearRegression accepts a 2-D target directly; GradientBoostingRegressor is
# single-output, so it is wrapped in MultiOutputRegressor (one model per target).
models = {
    'Linear Regression': LinearRegression(),
    #'Random Forest Regressor': RandomForestRegressor(n_estimators=600),
    # random_state pins the boosting randomness so that re-running the notebook
    # reproduces the same scores.
    'Gradient Boosting Regressor': MultiOutputRegressor(
        GradientBoostingRegressor(n_estimators=400, random_state=42)
    ),
}

# Create a RepeatedKFold cross-validator
# (4 splits were used for linear regression; only 2 for gradient boosting to reduce run time)
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

# Function to calculate all metrics
def calculate_metrics(model, X, y, cv):
    """Cross-validate ``model`` and collect per-target metrics for every fold.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Re-fitted on the training rows of every fold.
    X : feature matrix whose rows can be selected with an integer index array.
    y : array-like of targets, 1-D or 2-D (one column per target)
        Converted with ``np.asarray`` so rows are always selected by position,
        including when a pandas object is passed.
    cv : splitter exposing ``split(X)`` that yields ``(train_index, test_index)``.

    Returns
    -------
    r2_scores, rmse_scores, mae_scores, max_errors : list
        One entry per fold. Each entry holds one value per target column
        (R2, root mean squared error, mean absolute error and largest absolute
        error on that fold's test rows, respectively).
    """
    y = np.asarray(y)

    r2_scores = []
    rmse_scores = []
    mae_scores = []
    max_errors = []

    for train_index, test_index in cv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        # Align the prediction's shape with y_test so the element-wise
        # difference below can never broadcast (n,) against (n, 1).
        y_pred = np.asarray(model.predict(X_test)).reshape(y_test.shape)

        # Calculate metrics ("raw_values" keeps one score per target)
        r2_scores.append(r2_score(y_test, y_pred, multioutput="raw_values"))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred, multioutput="raw_values")))
        mae_scores.append(mean_absolute_error(y_test, y_pred, multioutput="raw_values"))
        # sklearn's max_error() rejects multi-output targets, which previously
        # left this list empty; compute the per-target maximum directly.
        max_errors.append(np.max(np.abs(y_test - y_pred), axis=0))

    return r2_scores, rmse_scores, mae_scores, max_errors

# Evaluate each model using cross-validation and calculate the metrics.
# Every model's scores are kept in cv_results; previously each iteration
# overwrote r2_scores / rmse_scores / mae_scores, so only the last model's
# numbers survived the loop. Those names are still assigned (and still hold the
# last model's scores afterwards) so later cells that display them keep working.
cv_results = {}
for model_name, model in models.items():
    r2_scores, rmse_scores, mae_scores, max_errors = calculate_metrics(model, X, Y, rkf)
    cv_results[model_name] = {
        'r2': r2_scores,
        'rmse': rmse_scores,
        'mae': mae_scores,
        'max_error': max_errors,
    }

    # Average over folds only (axis=0) so each target keeps its own mean;
    # a plain np.mean() would also average the two targets together.
    print(f"{model_name} - Mean R2 per target: {np.mean(r2_scores, axis=0)}")
    print(f"{model_name} - Mean RMSE per target: {np.mean(rmse_scores, axis=0)}")
    print(f"{model_name} - Mean MAE per target: {np.mean(mae_scores, axis=0)}\n")

In [None]:
### Gradient boosting (sklearn GradientBoostingRegressor, not the xgboost package)
# Per-fold R2; each array holds one value per target, in target_columns order.
r2_scores

[array([ 0.50948591, -0.03263678]), array([0.48523653, 0.05701168])]

In [None]:
### Gradient boosting (sklearn GradientBoostingRegressor, not the xgboost package)
# Per-fold RMSE; each array holds one value per target, in target_columns order.
rmse_scores

[array([0.31961312, 0.25284122]), array([0.32121654, 0.26207433])]

In [None]:
### Gradient boosting (sklearn GradientBoostingRegressor, not the xgboost package)
# Per-fold MAE; each array holds one value per target, in target_columns order.
mae_scores


[array([0.25942253, 0.19871271]), array([0.26089689, 0.21238795])]

In [None]:
### Linear regression: per-fold R2, one value per target in target_columns order.
# NOTE(review): r2_scores holds only the last model evaluated by the loop, and
# the 4-fold output below does not match the n_splits=2 cross-validator defined
# in this notebook -- presumably from a separate run with n_splits=4; confirm.
r2_scores

[array([0.63386312, 0.22673646]),
 array([0.69054935, 0.10249001]),
 array([0.63685457, 0.25525914]),
 array([0.57190927, 0.28064026])]

In [None]:
### Linear regression: per-fold RMSE, one value per target in target_columns order.
# NOTE(review): rmse_scores holds only the last model evaluated by the loop, and
# the 4-fold output below does not match the n_splits=2 cross-validator defined
# in this notebook -- presumably from a separate run with n_splits=4; confirm.
rmse_scores

[array([0.26930338, 0.20900693]),
 array([0.2587597 , 0.24974819]),
 array([0.27279631, 0.24218162]),
 array([0.29057337, 0.21215098])]

In [None]:
### Linear regression: per-fold MAE, one value per target in target_columns order.
# NOTE(review): mae_scores holds only the last model evaluated by the loop, and
# the 4-fold output below does not match the n_splits=2 cross-validator defined
# in this notebook -- presumably from a separate run with n_splits=4; confirm.
mae_scores

[array([0.22162394, 0.16880257]),
 array([0.21769852, 0.19436334]),
 array([0.23122124, 0.192135  ]),
 array([0.24182382, 0.17434119])]