In [None]:
##import and format data
import pandas as pd
import numpy as np

# Load the complete training dataset with every column read as text, then cast
# the two NOMINATE columns to float so they can be used as regression targets.
all114 = pd.read_csv('training_data_114_final.csv',dtype='string')
all2 = all114.astype({'speech':'string','nominate_dim1':'float', 'nominate_dim2': 'float'})
# Drop every row that has a missing value in any column.
final114 = all2.dropna()

# Load the LEMMATIZED dataset. Both loads are used by this notebook: the
# targets are taken from `final114` above, the text to vectorize from `docs`.
# This assignment rebinds `all114`; the raw frame stays reachable only through
# `all2` / `final114`.
# NOTE(review): no dropna() is applied to the lemmatized file -- confirm that
# it holds the same rows, in the same order, as `final114`.
all114 = pd.read_csv('lemmatized_output.csv', dtype="string")
docs = all114['speech']


In [None]:
#upload the custom stopword list
from congress_stopwords import congress


In [None]:
# Sanity check: show the character length of every speech in `final114` to
# make sure that no speech data were truncated during preprocessing and that
# the proper number of documents are present.
final114.speech.str.len()

In [None]:
####create the TF-IDF matrix
from sklearn.feature_extraction.text import TfidfVectorizer
# sublinear_tf=True : scale term frequency as 1 + log(tf) instead of raw counts.
# max_df=0.5        : ignore terms that occur in more than half of the documents.
# min_df=5          : ignore terms that occur in fewer than five documents.
# stop_words=congress : custom stop-word collection imported from
#                       congress_stopwords in an earlier cell.
vectorizer = TfidfVectorizer(
        sublinear_tf=True, max_df=0.5, min_df=5, stop_words=congress
    )
# Learn vocabulary + IDF weights from the lemmatized training speeches and
# build the document-term matrix. The fitted `vectorizer` is reused further
# down to transform the validation and candidate texts.
train2 = vectorizer.fit_transform(docs) #fit the lemmatized text


In [None]:
# Feature matrix: TF-IDF weights of the lemmatized training speeches.
X = train2
###variables of NOMINATE values
y = final114.nominate_dim1   # regression target, NOMINATE dimension 1
y1 = final114.nominate_dim2  # regression target, NOMINATE dimension 2

# The features were built from 'lemmatized_output.csv' while the targets come
# from `final114` ('training_data_114_final.csv' after dropna()). Fail early,
# with an explicit message, if the two no longer describe the same number of
# documents. (`y` and `y1` come from the same frame, so one check covers both.)
# NOTE(review): this only verifies the row count; it cannot verify that the
# rows of the two files are in the same order.
if X.shape[0] != len(y):
    raise ValueError(
        f"Feature/target mismatch: X has {X.shape[0]} rows but the NOMINATE "
        f"targets have {len(y)} rows; check that 'lemmatized_output.csv' and "
        "'training_data_114_final.csv' contain the same documents."
    )

FEATURE OVERVIEW

In [None]:
# Inspect the TF-IDF representation fitted on the training speeches.
# The learned term -> column-index mapping lives on the *fitted* vectorizer as
# `vocabulary_`. A newly constructed TfidfVectorizer only carries the
# `vocabulary` constructor argument, which is None and says nothing about the
# training data.

###tf-idf in matrix form (from g4g)
print('\nWord indexes:')
print(vectorizer.vocabulary_)

# display tf-idf values
print('\ntf-idf value:')
print(X)

# in matrix form
# Only the first rows are converted to a dense array: densifying the whole
# sparse matrix allocates n_documents * n_terms floats just to print a preview.
print('\ntf-idf values in matrix form:')
print(X[:5].toarray())

In [None]:
# (number of training documents, number of TF-IDF terms)
X.shape

IMPORT VALIDATION DATA

In [None]:
###116th CONGRESSIONAL RECORD speeches
# The file has a .txt extension but is parsed as comma-separated; every column
# is read as text first.
all116 = pd.read_csv('116incCR1.txt', dtype="string", sep = ',')
# Cast the two score columns to float. Note that `all2` is rebound here; it
# previously held the typed training frame.
all2 = all116.astype({'score':'float','dim2':'float'})
# Keep only rows without a missing value in any column.
cr116_1 = all2.dropna()
# Validation speech text, not yet lemmatized.
val_speech = cr116_1['text']

In [None]:
# Character length of every validation speech (truncation / row-count check).
cr116_1.text.str.len()

In [None]:
# Series of validation speeches that will be lemmatized below.
doc = val_speech

import spacy

# Load the English language model
# (`nlp` is read as a global by lemmatize_text).
nlp = spacy.load("en_core_web_sm")

# Lemmatize one text entry with the module-level spaCy pipeline `nlp`.
def lemmatize_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The argument is converted with ``str()`` first. Inputs longer than
    1,000,000 characters are not passed to the spaCy pipeline; the literal
    placeholder ``"TEXT TOO LONG"`` is returned for them instead.
    """
    as_string = str(text)
    if len(as_string) <= 1_000_000:
        parsed = nlp(as_string)
        return " ".join(token.lemma_ for token in parsed)
    return "TEXT TOO LONG"

# Apply to the whole column: lemmatize_text is called once per speech.
val_speeches = doc.apply(lemmatize_text)

# Preview result
print(val_speeches.head())

In [None]:
# Project the lemmatized validation speeches onto the vocabulary learned from
# the training data (transform only; the vectorizer is not refit).
X_val = vectorizer.transform(val_speeches)
# Ground-truth scores the dimension-1 predictions are evaluated against.
y_val = cr116_1.score
# Ground-truth scores the dimension-2 predictions are evaluated against.
y_val1 = cr116_1.dim2

In [None]:
# (number of validation documents, number of TF-IDF terms)
X_val.shape

In [None]:
####116 CR VALIDATION - DIMENSION 1

import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import uniform, loguniform

# Dimension-1 estimators keyed by the label used in the printed report.
# Hyperparameters are hard-coded.
# NOTE(review): presumably these values come from an earlier hyperparameter
# search that is not part of this cell -- confirm their provenance.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.05589),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

# Fit every estimator on the full training matrix against the dimension-1 target.
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y)

# Every estimator in `models` has now been fit on the training data.
# X_val holds the validation features and y_val the matching dimension-1 scores.

print("Performance on unseen validation data:\n")

for name, model in models.items():
    # Predict on unseen data
    y_pred = model.predict(X_val)

    # Calculate metrics
    r2 = r2_score(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    max_err = max_error(y_val, y_pred)

    # Display results
    print(f"{name}:")
    print(f"  R2 Score: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  Max Error: {max_err:.4f}\n")




In [None]:
###116CR - DIMENSION 2


import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import uniform, loguniform


# Dimension-2 estimators keyed by the label used in the printed report.
# NOTE(review): only the Ridge alpha differs from the dimension-1 settings;
# confirm that the Lasso and SGD values were also tuned for dimension 2.
models1 = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.3905),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

# Fit every estimator on the full training matrix against the dimension-2 target.
for name, model in models1.items():
    print(f"Training {name}...")
    model.fit(X, y1)



# Every estimator in `models1` has now been fit on the training data.
# X_val holds the validation features and y_val1 the matching dimension-2 scores.

print("Performance on unseen validation data:\n")

for name, model in models1.items():
    # Predict on unseen data
    y_pred1 = model.predict(X_val)

    # Calculate metrics
    r2 = r2_score(y_val1, y_pred1)
    rmse = np.sqrt(mean_squared_error(y_val1, y_pred1))
    mae = mean_absolute_error(y_val1, y_pred1)
    max_err = max_error(y_val1, y_pred1)

    # Display results
    print(f"{name}:")
    print(f"  R2 Score: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  Max Error: {max_err:.4f}\n")




PREDICTION OF CANDIDATE VALUES

In [None]:
# Candidate texts; every column is read as text.
# Note that `all116` is rebound here; it previously held the validation frame.
all116 = pd.read_csv('116cand.csv', dtype="string", sep = ',')
# Drop every row that has a missing value in any column.
cand116 = all116.dropna()
# Candidate text, not yet lemmatized.
cand_speech = cand116['text']

In [None]:
# Lemmatize the candidate texts with the spaCy pipeline (`nlp`) and the
# `lemmatize_text` helper that were set up for the validation speeches earlier
# in this notebook. Reusing them avoids loading the language model a second
# time and keeps a single definition of the helper, so both datasets are
# guaranteed to be preprocessed identically.
doc = cand_speech

import spacy

# Apply to the whole column
cand_speeches = doc.apply(lemmatize_text)

# lemmatize_text() substitutes a placeholder for over-long inputs instead of
# failing. Report such rows: they would otherwise be vectorized and scored as
# if the placeholder were the candidate's text.
n_too_long = int((cand_speeches == "TEXT TOO LONG").sum())
if n_too_long:
    print(f"WARNING: {n_too_long} candidate text(s) were too long and were not lemmatized.")

# Preview result
print(cand_speeches.head())

In [None]:
# Project the lemmatized candidate texts onto the vocabulary learned from the
# training data (transform only; the vectorizer is not refit).
X_cand = vectorizer.transform(cand_speeches)


In [None]:
# Fit best models on full training data -- DIMENSION 1
# `models` is the dimension-1 dictionary defined in the validation section;
# each estimator is refit on the dimension-1 target before predicting.
for name, model in models.items():
    print(f"Training {name} on full training dataset...")
    model.fit(X, y)  # refit best model on full training data

# Now apply to unseen data and get predictions
# `predictions` maps each model label to its array of predicted dimension-1
# scores, one per row of X_cand.
predictions = {}
for name, model in models.items():
    print(f"Predicting with {name} on unseen data...")
    preds = model.predict(X_cand)
    predictions[name] = preds




In [None]:
# Dump the dimension-1 candidate predictions (model label -> array of scores).
print(predictions)

DIMENSION 2

In [None]:
####PREDICT CHALLENGER SCORES -- DIMENSION 2

import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import uniform, loguniform


# Dimension-2 estimators, rebuilt with the same settings as the `models1`
# dictionary used in the dimension-2 validation cell.
models1 = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.3905),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

# Fit every estimator on the full training matrix against the dimension-2 target.
for name, model in models1.items():
    print(f"Training {name} on full training dataset...")
    model.fit(X, y1)  # refit best model on full training data

# Now apply to unseen data and get predictions
# `predictions1` maps each model label to its array of predicted dimension-2
# scores, one per row of X_cand.
predictions1 = {}
for name, model in models1.items():
    print(f"Predicting with {name} on unseen data...")
    preds = model.predict(X_cand)
    predictions1[name] = preds




In [None]:
# Dump the dimension-2 candidate predictions (model label -> array of scores).
print(predictions1)

FEATURE EXTRACTION - DIMENSION 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

# Term for each column of the TF-IDF matrix, in column order, so coefficient
# positions can be mapped back to words.
feature_names = np.array(vectorizer.get_feature_names_out())

# Dimension-1 estimators (same settings as in the validation section).
# Note that this rebinds `models` to freshly constructed, unfitted estimators.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.05589),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

# For each estimator: fit on the dimension-1 target, then list the terms with
# the largest positive and the largest negative coefficients.
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y)
    coefs = model.coef_

    # Sort coefficients
    # argsort is ascending, so the head of `sorted_idx` holds the most negative
    # coefficients and the tail the most positive ones.
    sorted_idx = np.argsort(coefs)
    top_neg_idx = sorted_idx[:20]   # 20 most negative
    top_pos_idx = sorted_idx[-20:]  # 20 most positive

    print(f"\n=== {name} ===")
    print("Top positive words (increase Dim 1 score):")
    # Reverse so the largest coefficient is printed first.
    for i in top_pos_idx[::-1]:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")

    print("Top negative words (decrease Dim 1 score):")
    for i in top_neg_idx:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")





FEATURE EXTRACTION - DIMENSION 2

In [None]:
####FEATURE EXTRACTION -- DIMENSION 2

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

# Term for each column of the TF-IDF matrix, in column order, so coefficient
# positions can be mapped back to words.
feature_names = np.array(vectorizer.get_feature_names_out())

# Dimension-2 estimators. They are bound to `models1` (the name used for
# dimension 2 everywhere else in this notebook) so the dimension-1 `models`
# dictionary is not overwritten, and Ridge uses the dimension-2 alpha (0.3905)
# so the reported coefficients belong to the same model configuration that is
# validated and used for the dimension-2 candidate predictions.
models1 = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.3905),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

# For each estimator: fit on the dimension-2 target, then list the terms with
# the largest positive and the largest negative coefficients.
for name, model in models1.items():
    print(f"Training {name}...")
    model.fit(X, y1)
    coefs = model.coef_

    # Sort coefficients
    # argsort is ascending, so the head of `sorted_idx` holds the most negative
    # coefficients and the tail the most positive ones.
    sorted_idx = np.argsort(coefs)
    top_neg_idx = sorted_idx[:20]   # 20 most negative
    top_pos_idx = sorted_idx[-20:]  # 20 most positive

    print(f"\n=== {name} ===")
    print("Top positive words (increase Dim 2 score):")
    # Reverse so the largest coefficient is printed first.
    for i in top_pos_idx[::-1]:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")

    print("Top negative words (decrease Dim 2 score):")
    for i in top_neg_idx:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")



