3/8/25: Updated from the last stable/functional version ("dim1_BOW_train-test-combined"). Two major changes: (1) a different cross-validation method that does not require using **negative** mean absolute error; (2) an updated and expanded "newsroom" test dataset, in the hope of resolving the negative R² issue.


10/17/25: fixed lemmatizer

In [None]:
##import and format data
import pandas as pd
import numpy as np

# Complete 114th-Congress training table; every column is read as text first.
all114 = pd.read_csv('training_data_114_final.csv', dtype='string')

# Cast the two NOMINATE columns to float, then keep only fully populated rows.
all2 = all114.astype(
    dict(speech='string', nominate_dim1='float', nominate_dim2='float')
)
final114 = all2.dropna()

# Lemmatized speeches. Both loads run: the targets come from `final114` above,
# the text comes from this file. Note that `all114` is rebound here.
all114 = pd.read_csv('lemmatized_output.csv', dtype="string")
docs = all114.speech


In [None]:
#upload the custom stopword list
from congress_stopwords import congress


In [None]:
# Sanity check: character length of every training speech, to confirm that no
# speech was truncated during preprocessing and that the expected number of
# documents is present.
final114['speech'].str.len()

Unnamed: 0,speech
0,17784
1,25356
2,113985
3,109704
4,45340
...,...
433,90749
434,31889
435,15301
436,24527


In [None]:
####create the TF-IDF matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configured with the custom congressional stopword list; the
# document-frequency bounds and sublinear term-frequency scaling are set here.
vectorizer = TfidfVectorizer(
    stop_words=congress,
    min_df=5,
    max_df=0.5,
    sublinear_tf=True,
)

# Learn the vocabulary from the lemmatized speeches and vectorize them.
train2 = vectorizer.fit_transform(docs)


In [None]:
# Feature matrix: TF-IDF weights of the lemmatized speeches.
X = train2

###variable of NOMINATE values
# y  -> first NOMINATE dimension, y1 -> second NOMINATE dimension.
y = final114.nominate_dim1
y1 = final114.nominate_dim2

# Guard against silent misalignment: X is built from `lemmatized_output.csv`,
# while y/y1 come from `training_data_114_final.csv` after dropna(). Nothing
# else ties the two files together, so at least verify the row counts agree.
# NOTE(review): equal length does not prove equal row order; confirm that both
# files list the same members in the same order.
assert X.shape[0] == len(y) == len(y1), (
    f"Row mismatch: X has {X.shape[0]} rows, y has {len(y)}, y1 has {len(y1)}"
)

FEATURE OVERVIEW

In [None]:
# Inspect the vectorizer that was actually fitted on the training speeches.
# The previous code created a fresh, unfitted TfidfVectorizer and printed its
# `vocabulary` constructor argument, which is None; the learned term -> column
# mapping lives in the fitted attribute `vocabulary_`.
tfidf = vectorizer

###tf-idf in matrix form (from g4g)
print('\nWord indexes:')
print(tfidf.vocabulary_)

# display tf-idf values
print('\ntf-idf value:')
print(X)

# in matrix form (densifies the sparse matrix for display)
print('\ntf-idf values in matrix form:')
print(X.toarray())

In [None]:
# Shape of the training TF-IDF matrix.
X.shape

(438, 14533)

IMPORT VALIDATION DATA

In [None]:
###116th CONGRESSIONAL RECORD speeches
# Read every column as text, cast the two score columns to float, and keep
# only rows with no missing values. Note that `all2` is rebound here.
all116 = pd.read_csv('116incCR1.txt', sep=',', dtype="string")
all2 = all116.astype(dict(score='float', dim2='float'))
cr116_1 = all2.dropna()

# Raw (not yet lemmatized) validation speech text.
val_speech = cr116_1.text

In [None]:
# Character length of every validation speech.
cr116_1['text'].str.len()

In [None]:
# Module-level alias for the Series of validation speeches to lemmatize.
doc = val_speech

import spacy

# Load the English language model; `lemmatize_text` reads this module-level
# `nlp` pipeline.
nlp = spacy.load("en_core_web_sm")

# Lemmatize one text entry with the module-level spaCy pipeline `nlp`.
def lemmatize_text(text):
    """Return `text` as a single string of space-separated lemmas.

    The input is coerced with ``str`` first. Inputs longer than 1,000,000
    characters are not parsed; the literal placeholder "TEXT TOO LONG" is
    returned for them instead.
    """
    raw = str(text)
    if len(raw) <= 1_000_000:
        parsed = nlp(raw)
        lemmas = [tok.lemma_ for tok in parsed]
        return " ".join(lemmas)
    return "TEXT TOO LONG"

# Apply to the whole column. `doc` here is the module-level Series of
# validation speeches, not the local `doc` used inside lemmatize_text.
val_speeches = doc.apply(lemmatize_text)

# Preview result: first rows of the lemmatized validation text.
print(val_speeches.head())

0    I rise in support of my amendment with Represe...
1    as a Navy veteran , I believe in focus our lim...
2    this legislation fund critical nutrition assis...
3    I rise in support of the bill . I want to than...
4    I rise today in opposition to H.R. 1644 , the ...
Name: text, dtype: object


In [None]:
# Project the lemmatized validation speeches onto the vocabulary already
# learned by `vectorizer` (transform only, no refit).
X_val = vectorizer.transform(val_speeches)

# Validation targets: `score` is paired with dimension 1, `dim2` with dimension 2.
y_val = cr116_1['score']
y_val1 = cr116_1['dim2']

In [None]:
# Shape of the validation TF-IDF matrix.
# NOTE(review): the saved output for this cell reports 14537 columns, while the
# saved output of the training-matrix `X.shape` cell reports 14533. Both
# matrices come from the same `vectorizer`, so the column counts should agree;
# the saved outputs presumably come from different kernel runs. Restart and
# run all cells to confirm.
X_val.shape

(9, 14537)

In [None]:
####116 CR VALIDATION - DIMENSION 1

import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import uniform, loguniform

# Dimension-1 regressors, keyed by the label used in the printed report.
# Hyperparameters are fixed here; no search is run in this cell.
sgd_dim1 = SGDRegressor(
    learning_rate='adaptive',
    epsilon=0.0374,
    penalty='l2',
    alpha=0.000366,
    random_state=42,
    max_iter=2000,
    loss='squared_epsilon_insensitive',
)
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.05589),
    'Lasso Regression': Lasso(alpha=0.000498, max_iter=5000),
    'SGD Regressor (squared_epsilon_insensitive)': sgd_dim1,
}

# Fit every regressor on the training matrix X against the dim-1 target y.
for name in models:
    print(f"Training {name}...")
    models[name].fit(X, y)

# Score each fitted regressor on the 116th-Congress validation speeches
# (X_val) against their dim-1 scores (y_val).
print("Performance on unseen validation data:\n")

for name, model in models.items():
    y_pred = model.predict(X_val)

    # Error metrics for this model on the validation set.
    r2 = r2_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    max_err = max_error(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    # One report line per metric; the last line carries the blank separator.
    report_lines = (
        f"{name}:",
        f"  R2 Score: {r2:.4f}",
        f"  RMSE: {rmse:.4f}",
        f"  MAE: {mae:.4f}",
        f"  Max Error: {max_err:.4f}\n",
    )
    for report_line in report_lines:
        print(report_line)




Training Linear Regression...
Training Ridge Regression...
Training Lasso Regression...
Training SGD Regressor (squared_epsilon_insensitive)...
Performance on unseen validation data:

Linear Regression:
  R2 Score: 0.4254
  RMSE: 0.3242
  MAE: 0.2834
  Max Error: 0.4821

Ridge Regression:
  R2 Score: 0.4363
  RMSE: 0.3211
  MAE: 0.2786
  Max Error: 0.4887

Lasso Regression:
  R2 Score: 0.5597
  RMSE: 0.2838
  MAE: 0.2420
  Max Error: 0.4320

SGD Regressor (squared_epsilon_insensitive):
  R2 Score: 0.4003
  RMSE: 0.3312
  MAE: 0.3021
  Max Error: 0.5468



In [None]:
###116CR - DIMENSION 2


import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import uniform, loguniform


# Dimension-2 regressors, keyed by the label used in the printed report.
# Hyperparameters are fixed here; no search is run in this cell.
sgd_dim2 = SGDRegressor(
    learning_rate='adaptive',
    epsilon=0.0374,
    penalty='l2',
    alpha=0.000366,
    random_state=42,
    max_iter=2000,
    loss='squared_epsilon_insensitive',
)
models1 = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.3905),
    'Lasso Regression': Lasso(alpha=0.000498, max_iter=5000),
    'SGD Regressor (squared_epsilon_insensitive)': sgd_dim2,
}

# Fit every regressor on the training matrix X against the dim-2 target y1.
for name in models1:
    print(f"Training {name}...")
    models1[name].fit(X, y1)

# Score each fitted regressor on the 116th-Congress validation speeches
# (X_val) against their dim-2 scores (y_val1).
print("Performance on unseen validation data:\n")

for name, model in models1.items():
    y_pred1 = model.predict(X_val)

    # Error metrics for this model on the validation set.
    r2 = r2_score(y_val1, y_pred1)
    mae = mean_absolute_error(y_val1, y_pred1)
    max_err = max_error(y_val1, y_pred1)
    rmse = np.sqrt(mean_squared_error(y_val1, y_pred1))

    # One report line per metric; the last line carries the blank separator.
    report_lines = (
        f"{name}:",
        f"  R2 Score: {r2:.4f}",
        f"  RMSE: {rmse:.4f}",
        f"  MAE: {mae:.4f}",
        f"  Max Error: {max_err:.4f}\n",
    )
    for report_line in report_lines:
        print(report_line)




Training Linear Regression...
Training Ridge Regression...
Training Lasso Regression...
Training SGD Regressor (squared_epsilon_insensitive)...
Performance on unseen validation data:

Linear Regression:
  R2 Score: -0.7228
  RMSE: 0.3279
  MAE: 0.2666
  Max Error: 0.5065

Ridge Regression:
  R2 Score: -0.0685
  RMSE: 0.2582
  MAE: 0.1929
  Max Error: 0.4218

Lasso Regression:
  R2 Score: -0.0652
  RMSE: 0.2578
  MAE: 0.2118
  Max Error: 0.4277

SGD Regressor (squared_epsilon_insensitive):
  R2 Score: 0.0228
  RMSE: 0.2469
  MAE: 0.1857
  Max Error: 0.4132



PREDICTION OF CANDIDATE VALUES

In [None]:
# Candidate texts to score. Note that `all116` is rebound here; it previously
# held the validation table.
all116 = pd.read_csv('116cand.csv', sep=',', dtype="string")

# Keep only rows with no missing values, then pull out the raw text column.
cand116 = all116.dropna()
cand_speech = cand116.text

In [None]:
# Module-level alias for the Series of candidate texts to lemmatize.
doc = cand_speech

import spacy

# Reuse the `nlp` pipeline and the `lemmatize_text` helper already defined in
# the validation-data lemmatization cell. Reloading the spaCy model and
# redefining an identical function here was redundant and left two copies of
# the helper to keep in sync.
cand_speeches = doc.apply(lemmatize_text)

# Preview result: first rows of the lemmatized candidate text.
print(cand_speeches.head())

0    a self - describe '' biblical conservative '' ...
1    campaign release a 30 second television ad tit...
2    vote for the Tax Cuts and Jobs Act , which red...
3    I never envision run for Congress , but in 201...
4    the top issue that I will be fight for in the ...
Name: text, dtype: object


In [None]:
# Vectorize the lemmatized candidate texts with the already-fitted
# `vectorizer` (transform only, no refit).
X_cand = vectorizer.transform(cand_speeches)


In [None]:
# Refit the regressors in `models` on the training data (X, y) -- DIMENSION 1
for name in models:
    print(f"Training {name} on full training dataset...")
    models[name].fit(X, y)

# Predict a dimension-1 score for every candidate text; one array per model,
# keyed by model name.
predictions = {}
for name, model in models.items():
    print(f"Predicting with {name} on unseen data...")
    predictions[name] = model.predict(X_cand)




Training Linear Regression on full training dataset...
Training Ridge Regression on full training dataset...
Training Lasso Regression on full training dataset...
Training SGD Regressor (squared_epsilon_insensitive) on full training dataset...
Predicting with Linear Regression on unseen data...
Predicting with Ridge Regression on unseen data...
Predicting with Lasso Regression on unseen data...
Predicting with SGD Regressor (squared_epsilon_insensitive) on unseen data...


In [None]:
# Dump the dimension-1 predictions: a dict mapping each model name to the
# array returned by predict(X_cand), one value per candidate text.
print(predictions)

{'Linear Regression': array([-0.0010346 , -0.11896243, -0.07801749,  0.02989306, -0.03563424,
       -0.24147284,  0.02964163,  0.1443163 ,  0.06983388,  0.01403918,
       -0.0231867 ]), 'Ridge Regression': array([ 0.0694039 , -0.07221694,  0.00903735,  0.09400091,  0.01781125,
       -0.19077331,  0.09421905,  0.17688012,  0.14546007,  0.08311768,
        0.06887001]), 'Lasso Regression': array([ 0.21251319, -0.25158809,  0.14890596,  0.15096586,  0.14765196,
       -0.01533217,  0.16274456,  0.22950975,  0.29168301,  0.08805769,
        0.16274456]), 'SGD Regressor (squared_epsilon_insensitive)': array([ 0.0394398 , -0.09442378,  0.01046293,  0.06943526, -0.01779803,
       -0.15303731,  0.04516855,  0.0925432 ,  0.12101758,  0.03029941,
        0.05915306])}


DIMENSION 2

In [None]:
####PREDICT CHALLENGER SCORES -- DIMENSION 2

import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import uniform, loguniform


# Dimension-2 regressors with fixed hyperparameters, keyed by display name.
sgd_dim2 = SGDRegressor(
    learning_rate='adaptive',
    epsilon=0.0374,
    penalty='l2',
    alpha=0.000366,
    random_state=42,
    max_iter=2000,
    loss='squared_epsilon_insensitive',
)
models1 = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.3905),
    'Lasso Regression': Lasso(alpha=0.000498, max_iter=5000),
    'SGD Regressor (squared_epsilon_insensitive)': sgd_dim2,
}

# Fit every regressor on the training data (X, y1).
for name in models1:
    print(f"Training {name} on full training dataset...")
    models1[name].fit(X, y1)

# Predict a dimension-2 score for every candidate text; one array per model,
# keyed by model name.
predictions1 = {}
for name, model in models1.items():
    print(f"Predicting with {name} on unseen data...")
    predictions1[name] = model.predict(X_cand)




Training Linear Regression on full training dataset...
Training Ridge Regression on full training dataset...
Training Lasso Regression on full training dataset...
Training SGD Regressor (squared_epsilon_insensitive) on full training dataset...
Predicting with Linear Regression on unseen data...
Predicting with Ridge Regression on unseen data...
Predicting with Lasso Regression on unseen data...
Predicting with SGD Regressor (squared_epsilon_insensitive) on unseen data...


In [None]:
# Dump the dimension-2 predictions: a dict mapping each model name to the
# array returned by predict(X_cand), one value per candidate text.
print(predictions1)

{'Linear Regression': array([-0.36581358, -0.34364216, -0.40006669, -0.36927886, -0.31684913,
       -0.32776083, -0.41907314, -0.37670343, -0.35619167, -0.4624663 ,
       -0.46416853]), 'Ridge Regression': array([-0.03563684, -0.09526789, -0.0109313 , -0.05854173, -0.05594001,
       -0.09672039, -0.05530891, -0.14974461, -0.00504754, -0.09642173,
       -0.02933692]), 'Lasso Regression': array([ 0.09348907, -0.04597661,  0.01038274,  0.01038274,  0.05000378,
       -0.04319724,  0.01038274,  0.01038274,  0.01038274,  0.01038274,
        0.01038274]), 'SGD Regressor (squared_epsilon_insensitive)': array([-0.02005133, -0.05521012,  0.00494939, -0.02798439, -0.03322456,
       -0.04496227,  0.00110484, -0.08969106,  0.01603935, -0.04445202,
        0.00972751])}


FEATURE EXTRACTION - DIMENSION 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

# Vocabulary terms in column order, so a coefficient index maps to its word.
feature_names = np.array(vectorizer.get_feature_names_out())

# Dimension-1 regressors with fixed hyperparameters, keyed by display name.
sgd_dim1 = SGDRegressor(
    learning_rate='adaptive',
    epsilon=0.0374,
    penalty='l2',
    alpha=0.000366,
    random_state=42,
    max_iter=2000,
    loss='squared_epsilon_insensitive',
)
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.05589),
    'Lasso Regression': Lasso(alpha=0.000498, max_iter=5000),
    'SGD Regressor (squared_epsilon_insensitive)': sgd_dim1,
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y)
    coefs = model.coef_

    # argsort is ascending: the first 20 indices are the most negative
    # coefficients, the last 20 the most positive.
    sorted_idx = np.argsort(coefs)
    top_neg_idx = sorted_idx[:20]
    top_pos_idx = sorted_idx[-20:]

    print(f"\n=== {name} ===")
    # Positive terms are listed largest first, negative terms most negative first.
    sections = (
        ("Top positive words (increase Dim 1 score):", top_pos_idx[::-1]),
        ("Top negative words (decrease Dim 1 score):", top_neg_idx),
    )
    for heading, term_ids in sections:
        print(heading)
        for i in term_ids:
            print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")





Training Linear Regression...

=== Linear Regression ===
Top positive words (increase Dim 1 score):
  331             0.5159
  bureaucrat      0.4567
  obamacare       0.4355
  unborn          0.4059
  lodge           0.3738
  sergeant        0.3695
  irs             0.3589
  illegal         0.3307
  fathers         0.3225
  purse           0.3209
  dad             0.3172
  epa             0.3130
  enemy           0.3092
  amnesty         0.3077
  islamic         0.3055
  liberty         0.2928
  pilot           0.2901
  branch          0.2876
  alien           0.2856
  oath            0.2796
Top negative words (decrease Dim 1 score):
  print           -0.4430
  climate         -0.3434
  caucus          -0.3275
  shooting        -0.3109
  loophole        -0.3098
  nafta           -0.2975
  pollution       -0.2928
  voting          -0.2915
  color           -0.2867
  lgbt            -0.2851
  messrs          -0.2839
  toxic           -0.2838
  flint           -0.2809
  voucher         -

FEATURE EXTRACTION - DIMENSION 2

In [None]:
####FEATURE EXTRACTION -- DIMENSION 2

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

# Vocabulary terms in column order, so a coefficient index maps to its word.
feature_names = np.array(vectorizer.get_feature_names_out())

# Number of terms listed at each end of the coefficient ranking.
top_n = 20

# Dimension-2 regressors. Ridge uses alpha=0.3905, the value used by the other
# dimension-2 cells in this notebook; this cell previously carried the
# dimension-1 value (0.05589) over from the dimension-1 feature-extraction cell.
# NOTE(review): this cell rebinds `models`, which the dimension-1 cells also
# use; re-run the dimension-1 cells before reusing `models` for dimension 1.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.3905),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y1)  # y1 is the dimension-2 target
    coefs = model.coef_

    # argsort is ascending: the first `top_n` indices are the most negative
    # coefficients, the last `top_n` the most positive.
    sorted_idx = np.argsort(coefs)
    top_neg_idx = sorted_idx[:top_n]
    top_pos_idx = sorted_idx[-top_n:]

    print(f"\n=== {name} ===")
    # Labels now say "Dim 2": this cell fits on y1, but the headings had been
    # copied unchanged from the dimension-1 cell.
    print("Top positive words (increase Dim 2 score):")
    for i in top_pos_idx[::-1]:  # largest coefficient first
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")

    print("Top negative words (decrease Dim 2 score):")
    for i in top_neg_idx:  # most negative coefficient first
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")





Training Linear Regression...

=== Linear Regression ===
Top positive words (increase Dim 1 score):
  print           0.6145
  rural           0.4908
  farm            0.3714
  331             0.3692
  recommit        0.3507
  housing         0.3234
  prayer          0.3073
  god             0.2952
  appropriations  0.2828
  5243            0.2667
  november        0.2604
  unborn          0.2581
  producer        0.2578
  cftc            0.2572
  russia          0.2570
  immigration     0.2558
  bureaucracy     0.2544
  cyber           0.2541
  commissioner    0.2498
  enemy           0.2444
Top negative words (decrease Dim 1 score):
  subsidy         -0.2665
  receipt         -0.2455
  progressive     -0.2188
  corporation     -0.2014
  hemp            -0.1991
  conservation    -0.1926
  fault           -0.1901
  backdoor        -0.1882
  borrow          -0.1816
  airline         -0.1814
  seizure         -0.1807
  encryption      -0.1804
  extra           -0.1773
  marijuana       -