In [1]:
##import and format data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')

# Load the complete 114th-Congress training table. Every column is read as a
# pandas "string" dtype, then the two NOMINATE columns are cast to float.
all114 = pd.read_csv('training_data_114_final.csv', dtype="string")
all2 = all114.astype({'nominate_dim1':'float', 'nominate_dim2': 'float'})
# Drop every row that has a missing value in any column.
next114 = all2.dropna()

# Regression targets: NOMINATE dimension 1 (y) and dimension 2 (y1).
y = next114.nominate_dim1
y1 = next114.nominate_dim2

# Load the pre-lemmatized speeches that serve as the text features.
# Both loads are required: the targets come from the table above, the text
# from this file. Note that `all114` is rebound to the lemmatized table here.
# NOTE(review): presumably 'lemmatized_output.csv' holds the same rows, in the
# same order, as `next114` -- confirm, since nothing below re-joins them.
all114 = pd.read_csv('lemmatized_output.csv', dtype="string")
docs = all114['speech']

# Fail fast if text and targets cannot be paired row-for-row; otherwise the
# mismatch only surfaces much later, inside model.fit.
if len(docs) != len(y):
    raise ValueError(
        f"Row count mismatch: {len(docs)} lemmatized speeches vs {len(y)} NOMINATE targets"
    )


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
#upload and run the custom stopword list
from congress_stopwords import congress



In [None]:
#### LEMMATIZE
# Series of speeches to lemmatize.
# NOTE(review): `final114` is not assigned in any cell visible in this
# notebook -- confirm where it is supposed to come from.
doc = final114

import spacy

# English spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Return `text` with each spaCy token replaced by its lemma.

    The input is coerced with str(). Lemmas are joined with single spaces.
    Inputs longer than 1,000,000 characters are not parsed; the literal
    string "TEXT TOO LONG" is returned for them instead.
    """
    raw = str(text)
    if len(raw) > 1_000_000:
        return "TEXT TOO LONG"
    lemmas = [tok.lemma_ for tok in nlp(raw)]
    return " ".join(lemmas)


# Lemmatize every entry of the column.
docs = doc.apply(lemmatize_text)

# Show the first few lemmatized speeches.
print(docs.head())

0    Mr. Speaker . nearly 160 million Americans rec...
1    let I thank my colleague for yielding . and le...
2    but neither the United States nor any state sh...
3    Mr. Speaker . the Energy and Commerce Committe...
4    Mr. Speaker . I rise today to ask my colleague...
Name: speech, dtype: object


In [4]:
# Step 4: build the TF-IDF document-term matrix from the lemmatized speeches.
# Terms in the custom `congress` stop-word list are removed, as are terms that
# occur in fewer than 5 documents or in more than half of all documents.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words=congress,
)

# The matrix is kept sparse (no .toarray() call); it is reduced with
# TruncatedSVD in the following cells.
X_sparse = vectorizer.fit_transform(docs)

print(f"TF-IDF shape before PCA: {X_sparse.shape}")

TF-IDF shape before PCA: (438, 14533)


In [5]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 100 latent components with truncated SVD.
n_components = 100
# random_state is fixed because TruncatedSVD's default solver is randomized:
# without a seed the components (and every model fit on them) change from run
# to run. 42 matches the seed shown in the printed `best_model` pipeline.
svd100 = TruncatedSVD(n_components=n_components, random_state=42)
X100 = svd100.fit_transform(X_sparse)

# The message says "PCA" but the reduction is TruncatedSVD (no centering).
# The second dimension should equal n_components.
print(f"PCA shape after reduction: {X100.shape}")

PCA shape after reduction: (438, 100)


In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 250 latent components with truncated SVD.
n_components = 250
# random_state is fixed because TruncatedSVD's default solver is randomized:
# without a seed the components (and every model fit on them) change from run
# to run. 42 matches the seed shown in the printed `best_model` pipeline.
svd250 = TruncatedSVD(n_components=n_components, random_state=42)
X250 = svd250.fit_transform(X_sparse)

# The message says "PCA" but the reduction is TruncatedSVD (no centering).
# The second dimension should equal n_components.
print(f"PCA shape after reduction: {X250.shape}")

PCA shape after reduction: (438, 250)


In [None]:
# Show the selected pipeline.
# NOTE(review): `best_model` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel -- confirm which search
# produced it and add that cell (or drop this one).
print(best_model)

Pipeline(steps=[('svd', TruncatedSVD(n_components=100, random_state=42)),
                ('rf',
                 RandomForestRegressor(max_depth=10, min_samples_split=5,
                                       n_estimators=1000, random_state=42))])


VALIDATION

In [None]:
### 116th Congressional Record speeches (validation set)
# Read everything as strings, then cast the two score columns to float.
all116 = pd.read_csv('116incCR1.txt', sep=',', dtype="string")
all2 = all116.astype({'score': 'float', 'dim2': 'float'})  # rebinds the name `all2`

# Keep only rows with no missing value in any column.
cr116_1 = all2.dropna()


In [None]:
# Raw validation speeches: the 'text' column of the NA-filtered 116th-Congress frame.
val_speech = cr116_1['text']


In [None]:
# Series of validation speeches to lemmatize.
doc = val_speech

import spacy

# English spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Return `text` with each spaCy token replaced by its lemma.

    The input is coerced with str(). Lemmas are joined with single spaces.
    Inputs longer than 1,000,000 characters are not parsed; the literal
    string "TEXT TOO LONG" is returned for them instead.
    """
    raw = str(text)
    if len(raw) > 1_000_000:
        return "TEXT TOO LONG"
    lemmas = [tok.lemma_ for tok in nlp(raw)]
    return " ".join(lemmas)


# Lemmatize every validation speech.
val_speeches = doc.apply(lemmatize_text)

# Show the first few lemmatized speeches.
print(val_speeches.head())

0    I rise in support of my amendment with Represe...
1    as a Navy veteran , I believe in focus our lim...
2    this legislation fund critical nutrition assis...
3    I rise in support of the bill . I want to than...
4    I rise today in opposition to H.R. 1644 , the ...
Name: text, dtype: object


In [None]:
# Number of lemmatized validation speeches.
len(val_speeches)

9

In [None]:
# Encode the validation speeches with the vectorizer fitted on the training
# corpus (transform only -- vocabulary and IDF weights are not refit).
X_val = vectorizer.transform(val_speeches)

# Observed targets for the validation speeches: 'score' is compared against
# dimension-1 predictions and 'dim2' against dimension-2 predictions below.
y_val, y_val1 = cr116_1['score'], cr116_1['dim2']

In [None]:
# (n_validation_speeches, vocabulary size); the column count should match the training TF-IDF matrix.
X_val.shape

(9, 14533)

In [None]:
# SVD = 250
# Project the validation TF-IDF matrix with the already-fitted 250-component
# SVD. Only transform() is called, so the components stay those learned from
# the training corpus.
X_val_svd = svd250.transform(X_val)

# The second dimension should equal the number of SVD components.
print(f"shape after reduction: {X_val_svd.shape}")

shape after reduction: (9, 250)


In [None]:
# SVD = 100
# Project the validation TF-IDF matrix with the already-fitted 100-component
# SVD. Only transform() is called, so the components stay those learned from
# the training corpus.
X_val_svd100 = svd100.transform(X_val)

# The second dimension should equal the number of SVD components.
print(f"shape after reduction: {X_val_svd100.shape}")

shape after reduction: (9, 100)


In [None]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error, max_error

###VALIDATION DATA 1 --DIMENSION 1-- 116 CR -- SVD 250
# random_state is fixed so the bootstrap samples and feature draws -- and
# therefore the metrics printed below -- are reproducible across runs.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
)
# Train on the 250-component training features against NOMINATE dimension 1.
model.fit(X250, y)

# Predict dimension 1 for the validation speeches (same SVD projection).
y_val_pred = model.predict(X_val_svd)

r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae_val = mean_absolute_error(y_val, y_val_pred)
max_err_val = max_error(y_val, y_val_pred)

# Print the evaluation metrics for the validation data
print("Validation Set Evaluation:")
print(f"R-squared: {r2_val}")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"Max Error: {max_err_val}")



Validation Set Evaluation:
R-squared: 0.08686558693983637
RMSE: 0.4086623716227639
MAE: 0.32181992320751507
Max Error: 0.8377825549803792


In [None]:
###export trained model for SVD = 250
# Persists whatever estimator `model` is currently bound to. Only the
# regressor is written: the fitted `vectorizer` and `svd250` are not saved
# here, so the pickle alone cannot score raw text.
import joblib

joblib.dump(model,'svd250_full.pkl')

['svd250_full.pkl']

In [None]:
#### CR116 - SVD 100
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error, max_error

###VALIDATION DATA 1 --DIMENSION 1-- 116 CR -- SVD 100
# random_state is fixed so the bootstrap samples and feature draws -- and
# therefore the metrics printed below -- are reproducible across runs.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
)
# Train on the 100-component training features against NOMINATE dimension 1.
model.fit(X100, y)

# Predict dimension 1 for the validation speeches (same SVD projection).
y_val_pred = model.predict(X_val_svd100)

r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae_val = mean_absolute_error(y_val, y_val_pred)
max_err_val = max_error(y_val, y_val_pred)

# Print the evaluation metrics for the validation data
print("Validation Set Evaluation:")
print(f"R-squared: {r2_val}")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"Max Error: {max_err_val}")



Validation Set Evaluation:
R-squared: 0.14926708322952542
RMSE: 0.39445177474880694
MAE: 0.3034715047720144
Max Error: 0.817029165130385


In [None]:
###SVD 100 -- DIMENSION 1 -- FEATURE EXTRACTION
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

# Vocabulary terms, in the column order of the TF-IDF matrix.
feature_names = np.array(vectorizer.get_feature_names_out())

models = {
    # random_state fixed so the fitted forest (and its importances) is reproducible.
    'Random Forest': RandomForestRegressor(
        n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
    )
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X100, y)

# After the loop `model` stays bound to the last fitted estimator; the export
# cell that follows relies on that.

# --- Map per-component importances back to word space ---
# A random forest has no coef_; its per-component weights are
# feature_importances_, shape (n_components,).
component_importance = model.feature_importances_

# svd100.components_.T has shape (n_words, n_components). Multiplying by the
# importance vector collapses it to one score per word. (Passing the 2-D
# loading matrix straight into a DataFrame column, as before, raises because a
# column must be 1-dimensional.)
# Caveat: importances are non-negative, so the sign of a score comes only from
# the SVD loadings -- it is not the direction of the word's effect on the target.
word_influence = svd100.components_.T @ component_importance

# Create interpretable table, highest score first.
word_importance = pd.DataFrame({
    "word": feature_names,
    "influence": word_influence
}).sort_values("influence", ascending=False)

print(word_importance.head(10))
print(word_importance.tail(10))





In [None]:
###export trained model for SVD = 100
# Persists whatever estimator `model` is currently bound to. Only the
# regressor is written: the fitted `vectorizer` and `svd100` are not saved
# here, so the pickle alone cannot score raw text.
import joblib

joblib.dump(model,'svd100_full.pkl')

['svd100_full.pkl']

PREDICTION OF CANDIDATE VALUES

In [8]:
# Candidate texts; every column is read as a string.
# Note: this rebinds `all116`, previously the Congressional Record frame.
all116 = pd.read_csv('116cand.csv', sep=',', dtype="string")

# Keep only rows with no missing value in any column, then show (rows, columns).
cand116 = all116.dropna()
cand116.shape

(11, 5)

In [9]:
# Raw candidate texts: the 'text' column of the NA-filtered candidate frame.
cand_speech = cand116['text']

In [11]:
# Series of candidate texts to lemmatize.
doc = cand_speech

import spacy

# English spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Return `text` with each spaCy token replaced by its lemma.

    The input is coerced with str(). Lemmas are joined with single spaces.
    Inputs longer than 1,000,000 characters are not parsed; the literal
    string "TEXT TOO LONG" is returned for them instead.
    """
    raw = str(text)
    if len(raw) > 1_000_000:
        return "TEXT TOO LONG"
    lemmas = [tok.lemma_ for tok in nlp(raw)]
    return " ".join(lemmas)


# Lemmatize every candidate text.
cand_speeches = doc.apply(lemmatize_text)

# Show the first few lemmatized texts.
print(cand_speeches.head())

0    a self - describe '' biblical conservative '' ...
1    campaign release a 30 second television ad tit...
2    vote for the Tax Cuts and Jobs Act , which red...
3    I never envision run for Congress , but in 201...
4    the top issue that I will be fight for in the ...
Name: text, dtype: object


In [12]:
# Encode the candidate texts with the already-fitted vectorizer (transform only, no refit).
cand_test = vectorizer.transform(cand_speeches)


In [None]:
# SVD = 250
# Project the candidate TF-IDF matrix with the already-fitted 250-component
# SVD. Only transform() is called, so the components stay those learned from
# the training corpus.
X_cand_svd250 = svd250.transform(cand_test)

# The second dimension should equal the number of SVD components.
print(f"shape after reduction: {X_cand_svd250.shape}")

shape after reduction: (11, 250)


In [None]:
###TEST -- DIMENSION 1 -- CANDIDATE PREDICTIONS
import joblib

# Score the candidates with the persisted SVD-250 forest.
# The bare `RandomForestRegressor(...)` expression and the `model.fit(X250, y)`
# call that used to precede the load were removed: the new estimator was never
# assigned, the fit retrained whatever `model` happened to be bound to, and
# that result was discarded by the load below anyway.
# NOTE(review): confirm the file on disk holds the dimension-1 model (fit on
# `y`); any later export to the same path replaces it.
# joblib.load unpickles arbitrary objects -- only load files this project wrote.
model = joblib.load('svd250_full.pkl')
y_cand_pred = model.predict(X_cand_svd250)

In [None]:
# Predicted values for the candidates, one per row of the SVD-250 candidate matrix.
print(y_cand_pred)

[ 0.36154716 -0.36882553  0.2453934   0.11846853 -0.36982461 -0.32843549
 -0.16556807  0.37015189  0.3706053  -0.33139527  0.083895  ]


In [13]:
# SVD = 100
# Project the candidate TF-IDF matrix with the already-fitted 100-component
# SVD. Only transform() is called, so the components stay those learned from
# the training corpus.
X_cand_svd100 = svd100.transform(cand_test)

# The second dimension should equal the number of SVD components.
print(f"shape after reduction: {X_cand_svd100.shape}")

shape after reduction: (11, 100)


In [None]:
###TEST -- DIMENSION 1 -- CANDIDATE PREDICTIONS
import joblib

# Score the candidates with the persisted SVD-100 forest.
# The bare `RandomForestRegressor(...)` expression and the `model.fit(X100, y)`
# call that used to precede the load were removed: the new estimator was never
# assigned, the fit retrained whatever `model` happened to be bound to, and
# that result was discarded by the load below anyway.
# NOTE(review): confirm the file on disk holds the dimension-1 model (fit on
# `y`); any later export to the same path replaces it.
# joblib.load unpickles arbitrary objects -- only load files this project wrote.
model = joblib.load('svd100_full.pkl')
y_cand_pred = model.predict(X_cand_svd100)

In [None]:
# Predicted values for the candidates, one per row of the SVD-100 candidate matrix.
print(y_cand_pred)

[ 0.35398384 -0.3720347   0.2208927   0.105426   -0.38397169 -0.33877372
 -0.13786896  0.33653539  0.32601701 -0.34398712  0.06288962]


In [None]:
# Candidate names, in the same row order as `cand116` (and hence as the predictions made from it).
cand116['Name']

Unnamed: 0,Name
0,Robert Good
1,Qasim Rashid
2,Scott Taylor
3,John Collick
4,Leon Benjamin
5,Bryant Webb
6,Nicholas Betts
7,Nicholas Freitas
8,Jeffrey Jordan
9,Aliscia Andrews


DIMENSION 2

In [None]:
###VALIDATION DATA 1 --DIMENSION 2-- 116 CR -- SVD250
# random_state is fixed so the bootstrap samples and feature draws -- and
# therefore the metrics printed below -- are reproducible across runs.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
)
# Train on the 250-component training features against NOMINATE dimension 2.
model.fit(X250, y1)

# Predict dimension 2 for the validation speeches (same SVD projection).
y_val_pred1 = model.predict(X_val_svd)

r2_val = r2_score(y_val1, y_val_pred1)
rmse_val = np.sqrt(mean_squared_error(y_val1, y_val_pred1))
mae_val = mean_absolute_error(y_val1, y_val_pred1)
max_err_val = max_error(y_val1, y_val_pred1)

# Print the evaluation metrics for the validation data
print("Validation Set Evaluation:")
print(f"R-squared: {r2_val}")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"Max Error: {max_err_val}")



In [None]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error, max_error

###VALIDATION DATA 1 --DIMENSION 2-- 116 CR -- SVD100
# random_state is fixed so the bootstrap samples and feature draws -- and
# therefore the metrics printed below -- are reproducible across runs.
# NOTE(review): this forest uses n_estimators=800 / min_samples_split=2 while
# the other forests in this notebook use 1000 / 5 -- confirm this is the
# intended dimension-2 configuration.
model = RandomForestRegressor(
    n_estimators=800, max_depth=10, min_samples_split=2, random_state=42
)
# Train on the 100-component training features against NOMINATE dimension 2.
model.fit(X100, y1)

# Predict dimension 2 for the validation speeches (same SVD projection).
y_val_pred1 = model.predict(X_val_svd100)

r2_val = r2_score(y_val1, y_val_pred1)
rmse_val = np.sqrt(mean_squared_error(y_val1, y_val_pred1))
mae_val = mean_absolute_error(y_val1, y_val_pred1)
max_err_val = max_error(y_val1, y_val_pred1)

# Print the evaluation metrics for the validation data
print("Validation Set Evaluation:")
print(f"R-squared: {r2_val}")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"Max Error: {max_err_val}")



In [None]:
### SVD 100 -- DIMENSION 2 -- FEATURE EXTRACTION
# This cell only prepares the vocabulary array and an unfitted estimator
# registry; nothing is trained and no importances are computed here.
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

# Vocabulary terms, in the column order of the TF-IDF matrix.
feature_names = np.array(vectorizer.get_feature_names_out())

# Display name -> unfitted estimator.
models = {
    'Random Forest': RandomForestRegressor(
        n_estimators=1000,
        max_depth=10,
        min_samples_split=5,
    ),
}







In [None]:
###TEST -- DIMENSION 2 -- CANDIDATE PREDICTIONS
# The estimator must be assigned to `model`: previously the constructor result
# was discarded and `model.fit` retrained whatever estimator `model` was
# already bound to, so the hyperparameters written here were never used.
# random_state is fixed so the predictions are reproducible.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
)
# Train on the 250-component training features against NOMINATE dimension 2.
model.fit(X250, y1)

import joblib
# Saved under its own file name so the dimension-1 export 'svd250_full.pkl'
# is not overwritten by this dimension-2 model.
joblib.dump(model, 'svd250_dim2_full.pkl')

# Note: `y_cand_pred` is the same name used for the dimension-1 predictions.
y_cand_pred = model.predict(X_cand_svd250)

In [14]:
###TEST -- DIMENSION 2 -- CANDIDATE PREDICTIONS -- SVD 100
# The estimator must be assigned to `model`: previously the constructor result
# was discarded and `model.fit` retrained whatever estimator `model` was
# already bound to, so the hyperparameters written here were never used.
# random_state is fixed so the predictions are reproducible.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
)
# Train on the 100-component training features against NOMINATE dimension 2.
model.fit(X100, y1)

# Deliberately not persisted here: dumping to 'svd100_full.pkl' would replace
# the dimension-1 model stored under that name.

# Note: `y_cand_pred` is the same name used for the dimension-1 predictions.
y_cand_pred = model.predict(X_cand_svd100)

In [15]:
# Predictions from the most recently executed candidate-prediction cell (`y_cand_pred` is reused across cells).
print(y_cand_pred)

[-0.07171355 -0.10855115 -0.12840685 -0.10317945 -0.09423463 -0.0949562
 -0.05527338 -0.12068057 -0.02962044 -0.11476047 -0.04400044]
