3/8/25: Updated from the last stable/functional version ("dim1_BOW_train-test-combined"). Two major additions: (1) a different cross-validation method that does not require using **negative** mean absolute error, and (2) an updated and expanded "newsroom" test dataset, added in the hope of resolving the negative R2 issue.

In [1]:
##import and format data
import pandas as pd
import numpy as np

#Upload complete training dataset
# Every column is read as text first; the speech and the two NOMINATE
# dimensions are then cast to their working dtypes.
all114 = pd.read_csv('training_data_114_final.csv',dtype='string')
all2 = all114.astype({'speech':'string','nominate_dim1':'float', 'nominate_dim2': 'float'})
# dropna() keeps the original row labels, which leaves gaps wherever a row is
# removed. The cross-validation cells index y / y1 with integer fold positions,
# so the index is rebuilt as 0..n-1 to keep labels and positions identical.
final114 = all2.dropna().reset_index(drop=True)


In [2]:
#upload the custom stopword list
from congress_stopwords import congress


In [None]:
# Character count of each speech in the cleaned training frame.
final114.speech.str.len()
#Just making sure that no speech data were truncated during preprocessing
#and that the proper number of documents are present.

Unnamed: 0,speech
0,17784
1,25356
2,113985
3,109704
4,45340
...,...
433,90749
434,31889
435,15301
436,24527


In [3]:
###lemmatize prior to vectorizer
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# One output string per training speech, in the same order as final114.speech.
# NOTE(review): each value handed to lemmatize() is an entire speech, not a
# single token. Confirm against the NLTK docs that this call is meant for
# whole documents; if it expects one word, the speeches may come back unchanged.
W = [lemmatizer.lemmatize(speech_text) for speech_text in final114.speech]



[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
####create the TF-IDF matrix
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF settings: custom congressional stop-word list, sublinear term
# frequency, and document-frequency cut-offs (max_df / min_df).
vectorizer = TfidfVectorizer(
    stop_words=congress,
    sublinear_tf=True,
    max_df=0.5,
    min_df=5,
)
# Learn the vocabulary from the lemmatized training text and build its matrix.
train2 = vectorizer.fit_transform(W)
#train1 = vectorizer.fit_transform(final114.speech) #fit the un-lemmatized text

In [6]:
# Feature matrix: the TF-IDF matrix built from the training speeches
X = train2
###variable of NOMINATE values
# y  -> first NOMINATE dimension, y1 -> second NOMINATE dimension
y, y1 = final114["nominate_dim1"], final114["nominate_dim2"]

In [7]:
###116th CONGRESSIONAL RECORD speeches
# Read all columns as text, cast the two score columns to float, then keep
# only rows with no missing value in any column.
# Note: the name `all2` is rebound here; it previously held the training frame.
all116 = pd.read_csv('116incCR1.txt', sep=',', dtype="string")
all2 = all116.astype(dict(score='float', dim2='float'))
cr116_1 = all2.dropna(axis=0, how='any')


In [None]:
# Character count of each 116th Congressional Record text after dropna().
cr116_1.text.str.len()

In [8]:
###lemmatize prior to other vectorizer
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# One output string per Congressional Record text, in the order of cr116_1.text.
# NOTE(review): lemmatize() is given a whole document per call, not a single
# token. Confirm this is its intended input; otherwise the text may be
# returned unchanged.
D = [lemmatizer.lemmatize(record_text) for record_text in cr116_1.text]



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Project the CR texts onto the vocabulary already learned from the training
# speeches (transform only, the vectorizer is not re-fitted here).
X_val = vectorizer.transform(D)
# Reference scores for the two dimensions
y_val, y1_val = cr116_1["score"], cr116_1["dim2"]

In [10]:
from sklearn.model_selection import RepeatedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, max_error


In [11]:
#TRAINING AND CROSS-VALIDATION -- DIMENSION 1
# Candidate regressors compared under the same cross-validation folds.
# The two ensemble models are seeded so that a fresh Restart & Run All
# reproduces the reported metrics (the splitter below was already seeded).
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(
        n_estimators=600, max_depth=40, min_samples_split=10, random_state=42
    ),
    'Gradient Boosting Regressor': GradientBoostingRegressor(
        n_estimators=1600, max_depth=110, min_samples_split=12, random_state=42
    ),
}

# Create a RepeatedKFold cross-validator
# (n_repeats=1, so this is a single shuffled 4-fold split)
rkf = RepeatedKFold(n_splits=4, n_repeats=1, random_state=42)

# Function to calculate all metrics
def calculate_metrics(model, X, y, cv):
    """Cross-validate ``model`` and collect one value per fold for each metric.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; it is re-fitted on
        every fold.
    X : feature matrix that supports row selection with an integer array.
    y : target values aligned row-for-row with ``X`` (Series, list or array).
    cv : splitter whose ``split(X)`` yields ``(train_index, test_index)`` pairs.

    Returns
    -------
    tuple of four lists: R2, RMSE, MAE and max error, one entry per fold.
    """
    # The fold indices are used as row positions (see X[train_index] below).
    # Indexing a pandas Series with an integer array is a *label* lookup, so
    # the target is converted to a plain array first; otherwise any gap in the
    # Series index would raise or silently pick the wrong rows.
    y_values = np.asarray(y)

    r2_scores = []
    rmse_scores = []
    mae_scores = []
    max_errors = []

    for train_index, test_index in cv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics on the held-out fold
        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        max_errors.append(max_error(y_test, y_pred))

    return r2_scores, rmse_scores, mae_scores, max_errors

# Run every candidate model through the same folds and report the fold-averaged
# value of each metric. The per-fold values stay available in the four lists
# returned by calculate_metrics if more detail is needed.
for model_name in models:
    model = models[model_name]
    r2_scores, rmse_scores, mae_scores, max_errors = calculate_metrics(model, X, y, rkf)

    print(f"{model_name} - Mean R2: {np.mean(r2_scores)}")
    print(f"{model_name} - Mean RMSE: {np.mean(rmse_scores)}")
    print(f"{model_name} - Mean MAE: {np.mean(mae_scores)}")
    print(f"{model_name} - Mean Max Error: {np.mean(max_errors)}\n")


Linear Regression - Mean R2: 0.6332940797759072
Linear Regression - Mean RMSE: 0.2728581890866658
Linear Regression - Mean MAE: 0.2280918770918586
Linear Regression - Mean Max Error: 0.7222833274196097

Random Forest Regressor - Mean R2: 0.4907917805864554
Random Forest Regressor - Mean RMSE: 0.32203802262825565
Random Forest Regressor - Mean MAE: 0.2694307223637772
Random Forest Regressor - Mean Max Error: 0.8591031522809529

Gradient Boosting Regressor - Mean R2: 0.40546375746655594
Gradient Boosting Regressor - Mean RMSE: 0.3473486549367068
Gradient Boosting Regressor - Mean MAE: 0.2620027125049575
Gradient Boosting Regressor - Mean Max Error: 0.9783494150816602



In [None]:
###VALIDATION DATA 1 --DIMENSION 1-- 116 CR
# Fit on the full training matrix, then predict the 116th CR texts.
model = LinearRegression().fit(X, y)
y_val_pred = model.predict(X_val)

# Compare the predictions with the reference dimension-1 scores
r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae_val = mean_absolute_error(y_val, y_val_pred)
max_err_val = max_error(y_val, y_val_pred)

# Report all four metrics, one per line
print(
    "Validation Set Evaluation:",
    f"R-squared: {r2_val}",
    f"RMSE: {rmse_val}",
    f"MAE: {mae_val}",
    f"Max Error: {max_err_val}",
    sep="\n",
)



Validation Set Evaluation:
R-squared: 0.39855365263893694
RMSE: 0.3316620202261714
MAE: 0.2990383523967646
Max Error: 0.528852428121852


In [12]:
#TRAINING AND CROSS-VALIDATION -- DIMENSION 2
# Same candidate regressors as for dimension 1, rebuilt so each starts unfitted.
# The two ensemble models are seeded so that a fresh Restart & Run All
# reproduces the reported metrics (the splitter below was already seeded).
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(
        n_estimators=600, max_depth=40, min_samples_split=10, random_state=42
    ),
    'Gradient Boosting Regressor': GradientBoostingRegressor(
        n_estimators=1600, max_depth=110, min_samples_split=12, random_state=42
    ),
}

# Create a RepeatedKFold cross-validator
# (n_repeats=1, so this is a single shuffled 4-fold split)
rkf = RepeatedKFold(n_splits=4, n_repeats=1, random_state=42)

# Function to calculate all metrics
def calculate_metrics(model, X, y, cv):
    """Cross-validate ``model`` and collect one value per fold for each metric.

    The target is taken from the ``y`` argument. The earlier version of this
    cell ignored that argument and read the global ``y1`` instead, so the
    function only measured dimension 2 whatever the caller passed in.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; it is re-fitted on
        every fold.
    X : feature matrix that supports row selection with an integer array.
    y : target values aligned row-for-row with ``X`` (Series, list or array).
    cv : splitter whose ``split(X)`` yields ``(train_index, test_index)`` pairs.

    Returns
    -------
    tuple of four lists: R2, RMSE, MAE and max error, one entry per fold.
    """
    # The fold indices are used as row positions (see X[train_index] below).
    # Indexing a pandas Series with an integer array is a *label* lookup, so
    # the target is converted to a plain array first.
    y_values = np.asarray(y)

    r2_scores = []
    rmse_scores = []
    mae_scores = []
    max_errors = []

    for train_index, test_index in cv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics on the held-out fold
        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        max_errors.append(max_error(y_test, y_pred))

    return r2_scores, rmse_scores, mae_scores, max_errors

# Evaluate each model using cross-validation and calculate the metrics.
# y1 (second NOMINATE dimension) is passed explicitly as the target.
for model_name, model in models.items():
    r2_scores, rmse_scores, mae_scores, max_errors = calculate_metrics(model, X, y1, rkf)

    #print(f"{model_name} - R2 Scores: {r2_scores}")
    print(f"{model_name} - Mean R2: {np.mean(r2_scores)}")
    #print(f"{model_name} - RMSE Scores: {rmse_scores}")
    print(f"{model_name} - Mean RMSE: {np.mean(rmse_scores)}")
    #print(f"{model_name} - MAE Scores: {mae_scores}")
    print(f"{model_name} - Mean MAE: {np.mean(mae_scores)}")
    #print(f"{model_name} - Max Errors: {max_errors}")
    print(f"{model_name} - Mean Max Error: {np.mean(max_errors)}\n")


Linear Regression - Mean R2: 0.2162814676252641
Linear Regression - Mean RMSE: 0.2282719283105907
Linear Regression - Mean MAE: 0.18241052351386491
Linear Regression - Mean Max Error: 0.6452823352637933

Random Forest Regressor - Mean R2: 0.10168723656486628
Random Forest Regressor - Mean RMSE: 0.24437066928307147
Random Forest Regressor - Mean MAE: 0.1961095431544096
Random Forest Regressor - Mean Max Error: 0.716543839490829

Gradient Boosting Regressor - Mean R2: 0.01798993768275045
Gradient Boosting Regressor - Mean RMSE: 0.2555566805733291
Gradient Boosting Regressor - Mean MAE: 0.20233491663453979
Gradient Boosting Regressor - Mean Max Error: 0.7516183152715963



In [None]:
###VALIDATION 1 -- DIMENSION 2 -- 116 CR
# Fit on the full training matrix against the second dimension, then predict
# the 116th CR texts.
model = LinearRegression().fit(X, y1)
y_val_pred = model.predict(X_val)

# Compare the predictions with the reference dimension-2 scores
r2_val = r2_score(y1_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y1_val, y_val_pred))
mae_val = mean_absolute_error(y1_val, y_val_pred)
max_err_val = max_error(y1_val, y_val_pred)

# Report all four metrics, one per line
print(
    "Validation Set Evaluation:",
    f"R-squared: {r2_val}",
    f"RMSE: {rmse_val}",
    f"MAE: {mae_val}",
    f"Max Error: {max_err_val}",
    sep="\n",
)


Validation Set Evaluation:
R-squared: -0.10955256256645263
RMSE: 0.2631081827970129
MAE: 0.1854923806317491
Max Error: 0.45279131237775716


VALIDATION 2 -- BOTH DIMENSIONS: VALIDATION WITH NEWSROOM DATA

In [None]:
import chardet

# Read the newsroom file as raw bytes and let chardet guess its encoding
with open('116incNR1.txt', mode='rb') as f:
    raw_bytes = f.read()
result = chardet.detect(raw_bytes)

# Keep only the encoding name from the detection result
file_encoding = result['encoding']

In [None]:
# Show the encoding name chardet reported for 116incNR1.txt
print(file_encoding)

Windows-1252


In [None]:
####IMPORT AND CLEAN NEWSROOM FILE
####IMPORTANT -- IF YOU HAVE THE CLEAN_FILE1.TXT FILE, YOU DON'T NEED TO RUN THE FOLLOWING BLOCK

In [None]:
####TRYING THIS NOW IN A WAY THAT READS EACH LINE MANUALLY BUT PRESERVES 5 COLUMNS
####IMPORTANT -- IF YOU HAVE THE CLEAN_FILE1.TXT FILE, YOU DON'T NEED TO RUN THIS BLOCK
####THIS PROCESS WAS TO FIX SOME OF THE FILE FORMATTING ISSUES CAUSED BY THE DIFFERENT FORMATS OF THE RAW DATA

import pandas as pd

# Step 1: Read and normalize lines
# Single-character substitutions applied to every kept line: curly quotes
# become straight quotes, non-breaking spaces become plain spaces, and the
# \x92 / \x93 / \x94 code points are mapped to straight quotes as well.
_CHAR_FIXES = str.maketrans({
    '\u201c': '"', '\u201d': '"',
    '\u2018': "'", '\u2019': "'",
    '\xa0': ' ',
    '\x92': "'", '\x93': '"', '\x94': '"',
})

with open("116incNR1.txt", encoding="windows-1252", errors="ignore") as f:
    stripped_lines = (raw_line.strip() for raw_line in f)
    # Blank lines are dropped before the character clean-up is applied
    lines = [kept.translate(_CHAR_FIXES) for kept in stripped_lines if kept]


# Step 2: Split lines from right and left
def _split_record(record):
    """Split one cleaned line into [candidate, text, dim, score1, score2].

    The last three comma-separated fields are peeled off from the right so
    that commas inside the text are left alone; the first comma of what
    remains separates the candidate from the text. Returns None when the
    line does not have that shape.
    """
    pieces = record.rsplit(",", 3)
    if len(pieces) != 4:
        return None  # malformed row
    front_part, dim1, score1, score2 = pieces

    candidate, comma, text = front_part.partition(",")
    if not comma:
        return None  # no candidate/text separation
    return [candidate.strip(), text.strip(), dim1.strip(), score1.strip(), score2.strip()]


# Malformed lines are skipped, exactly as before
rows = [parsed for parsed in map(_split_record, lines) if parsed is not None]

# Step 3: Create DataFrame
df = pd.DataFrame(rows, columns=["candidate", "text", "dim", "score1", "score2"])

####EXPORTED TO CLEAN_FILE1.TXT



In [None]:
# The original cell had an unfinished assignment (`df =`), which is a syntax error.
# The parsing cell stores every field as text, so the two score columns are
# converted to numbers here. Values that are not numeric become NaN and are
# removed by dropna() below (presumably this is what drops the header line
# that was read as row 0; verify against the raw file).
# NOTE(review): if the intent was to load the exported clean file instead,
# replace this assignment with the matching read call.
df = df.assign(
    score1=pd.to_numeric(df["score1"], errors="coerce"),
    score2=pd.to_numeric(df["score2"], errors="coerce"),
)
df1 = df.dropna()

In [None]:
# Plain-text preview of the first rows of the cleaned newsroom frame
print(df1.head(9))



    candidate                                               text   dim  \
1       Beyer  "Unemployment benefits are set to expire for a...  dim1   
2    griffith  "Yesterday the President signed the more than ...  dim1   
3       cline  "Virginians across the Commonwealth are hurtin...  dim1   
4     wittman  "Let me be very clear: the FY21 NDAA does righ...  dim1   
5      wexton  "today called for a court order to extend Virg...  dim1   
6    connolly  "I am deeply disappointed Republicans were unw...  dim1   
7      rscott  "For 60 years, the NDAA has been passed with b...  dim1   
8  spanberger  "called on the Trump Administration and congre...  dim1   

   score1  score2  
1  -0.385  -0.095  
2   0.520  -0.365  
3   0.720  -0.202  
4   0.451   0.001  
5  -0.369   0.313  
6  -0.307  -0.038  
7  -0.450   0.012  
8  -0.188   0.338  


In [None]:
# Check the column dtypes of the cleaned newsroom frame
print(df1.dtypes)

candidate     object
text          object
dim           object
score1       float64
score2       float64
dtype: object


In [None]:
# Same first rows of df1, shown with the notebook's table display
df1.head(9)

Unnamed: 0,candidate,text,dim,score1,score2
1,Beyer,"""Unemployment benefits are set to expire for a...",dim1,-0.385,-0.095
2,griffith,"""Yesterday the President signed the more than ...",dim1,0.52,-0.365
3,cline,"""Virginians across the Commonwealth are hurtin...",dim1,0.72,-0.202
4,wittman,"""Let me be very clear: the FY21 NDAA does righ...",dim1,0.451,0.001
5,wexton,"""today called for a court order to extend Virg...",dim1,-0.369,0.313
6,connolly,"""I am deeply disappointed Republicans were unw...",dim1,-0.307,-0.038
7,rscott,"""For 60 years, the NDAA has been passed with b...",dim1,-0.45,0.012
8,spanberger,"""called on the Trump Administration and congre...",dim1,-0.188,0.338


In [None]:
# Character count of each text field in df (the frame before dropna())
df.text.str.len()

Unnamed: 0,text
0,4
1,32023
2,17402
3,2070
4,20452
5,28644
6,23252
7,50368
8,84539


In [None]:
###lemmatize prior to other vectorizer
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# One output string per newsroom text, in the order of df1.text.
# NOTE(review): lemmatize() is given a whole document per call, not a single
# token. Confirm this is its intended input; otherwise the text may be
# returned unchanged.
N = [lemmatizer.lemmatize(newsroom_text) for newsroom_text in df1.text]



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
###DIMENSIONS 1 AND 2 -- NEWSROOM
# These three names previously held the Congressional Record validation data;
# from here on they refer to the newsroom set.
X_val = vectorizer.transform(N)
y_val, y1_val = df1["score1"], df1["score2"]

In [None]:
###TEST 2 -- DIMENSION 1 -- 116 NEWSROOM
# Fit on the full training matrix, then predict the newsroom texts.
model = LinearRegression().fit(X, y)
y_val_pred = model.predict(X_val)

# Compare the predictions with the newsroom dimension-1 scores
r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae_val = mean_absolute_error(y_val, y_val_pred)
max_err_val = max_error(y_val, y_val_pred)

# Report all four metrics, one per line
print(
    "Validation Set Evaluation:",
    f"R-squared: {r2_val}",
    f"RMSE: {rmse_val}",
    f"MAE: {mae_val}",
    f"Max Error: {max_err_val}",
    sep="\n",
)


Validation Set Evaluation:
R-squared: 0.14747181901530293
RMSE: 0.4140407911608793
MAE: 0.384729746265834
Max Error: 0.7049608707291906


In [None]:
###TEST 2 -- DIMENSION 2 -- 116 NEWSROOM
# Fit on the full training matrix against the second dimension, then predict
# the newsroom texts.
model = LinearRegression().fit(X, y1)
y1_val_pred = model.predict(X_val)

# Compare the predictions with the newsroom dimension-2 scores
r2_val = r2_score(y1_val, y1_val_pred)
rmse_val = np.sqrt(mean_squared_error(y1_val, y1_val_pred))
mae_val = mean_absolute_error(y1_val, y1_val_pred)
max_err_val = max_error(y1_val, y1_val_pred)

# Report all four metrics, one per line
print(
    "Validation Set Evaluation:",
    f"R-squared: {r2_val}",
    f"RMSE: {rmse_val}",
    f"MAE: {mae_val}",
    f"Max Error: {max_err_val}",
    sep="\n",
)


Validation Set Evaluation:
R-squared: -0.10883210514255426
RMSE: 0.234485352185755
MAE: 0.18946081711292856
Max Error: 0.38716467987737774


PREDICTION OF CANDIDATE VALUES

In [None]:
# Candidate texts to be scored. All columns are read as text and rows with any
# missing value are discarded.
# Note: `all116` is rebound here; it previously held the Congressional Record frame.
all116 = pd.read_csv('116cand.csv', sep=',', dtype="string")
cand116 = all116.dropna(axis=0, how='any')


In [None]:
# Character count of each candidate text after dropna()
cand116.text.str.len()

Unnamed: 0,text
0,4372
1,12524
2,1163
3,7164
4,8081
5,8892
6,1956
7,22044
8,1985
9,2664


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# One output string per candidate text, in the order of cand116.text.
# NOTE(review): lemmatize() is given a whole document per call, not a single
# token. Confirm this is its intended input; otherwise the text may be
# returned unchanged.
CAN = [lemmatizer.lemmatize(candidate_text) for candidate_text in cand116.text]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Project the candidate texts onto the vocabulary of the already-fitted
# vectorizer (transform only, no re-fit).
X_cand = vectorizer.transform(CAN)


In [None]:
###VALIDATION 1 -- DIMENSION 1 -- CANDIDATE PREDICTIONS
# Fit on the full training matrix, then predict a dimension-1 value for each
# candidate text. No reference scores are used in this cell.
model = LinearRegression().fit(X, y)
y_cand_pred = model.predict(X_cand)

In [None]:
# Predicted dimension-1 values, one per candidate text
print(y_cand_pred)

[ 0.21304251 -0.04830325  0.06700477  0.10071491  0.15502075 -0.06501661
  0.11156136  0.1877279   0.26834506  0.1464924   0.18390541]


In [None]:
###VALIDATION 1 -- DIMENSION 2 -- CANDIDATE PREDICTIONS
# Fit on the full training matrix against the second dimension, then predict a
# dimension-2 value for each candidate text.
model = LinearRegression().fit(X, y1)
y1_cand_pred = model.predict(X_cand)

In [None]:
# Predicted dimension-2 values, one per candidate text
print(y1_cand_pred)

[ 0.03038339 -0.03442199  0.06936853 -0.02577686 -0.02996066 -0.01953278
 -0.05089021 -0.12169     0.07070308 -0.07584534 -0.00078786]
