In [None]:
##import and format data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')

# Load the complete training dataset. Every column is read with the pandas
# "string" dtype.
all114 = pd.read_csv('training_data_114_final.csv', dtype="string")

# The two NOMINATE columns are the regression targets, so cast them to float.
all2 = all114.astype({'nominate_dim1': 'float', 'nominate_dim2': 'float'})

# Drop every row with a missing value in any column, then renumber the rows
# 0..n-1. Without the reset the index keeps gaps where rows were dropped, and
# the positional indices produced by KFold.split() in the modelling cells would
# no longer correspond to the index labels of the target Series.
next114 = all2.dropna().reset_index(drop=True)

### NOMINATE target values
ydim1 = next114.nominate_dim1  # target for the "DIMENSION 1" cells
y1 = next114.nominate_dim2     # target for the "DIMENSION 2" cells

# Full training set text data
final114 = next114['speech']

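# ALTERNATIVE: load the pre-lemmatized dataset instead of the complete training dataset above -- run one or the other, not both.
# NOTE: the two lines below only define `all114` and `docs`; they do not create `ydim1`, `y1` or `final114`.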
#all114 = pd.read_csv('lemmatized_output.csv', dtype="string")
#docs = all114['speech']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#upload and run the custom stopword list
from congress_stopwords import congress



In [None]:
####LEMMATIZE
doc = final114  # speech text column selected in the data-loading cell

import spacy

# Load the small English pipeline. Only `token.lemma_` is read in this cell,
# so the dependency parser and named-entity recognizer are switched off at
# load time; lemmatization does not depend on either of them and every
# document is processed faster without them.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Function to lemmatize a single text entry
def lemmatize_text(text):
    """Return the space-joined lemmas of a single text entry.

    The input is coerced with ``str()`` first. Anything longer than
    1,000,000 characters is not passed to the spaCy pipeline; the literal
    placeholder "TEXT TOO LONG" is returned for it instead.
    """
    as_string = str(text)
    if len(as_string) <= 1_000_000:
        parsed = nlp(as_string)
        lemmas = (tok.lemma_ for tok in parsed)
        return " ".join(lemmas)
    return "TEXT TOO LONG"

# Apply to the whole column
# `doc` is the speech column assigned at the top of this cell; `docs` ends up
# with one lemmatized string per entry and is what the TF-IDF cell vectorizes.
docs = doc.apply(lemmatize_text)

# Preview result
print(docs.head())

In [None]:
# Step 4: Convert the lemmatized text to a TF-IDF matrix.
# The result of fit_transform is stored as-is in X_sparse: this cell performs
# no dense conversion and no PCA, even though the message printed below still
# says "before PCA".


# `congress` is the custom stop-word list imported in an earlier cell.
# Per scikit-learn's documented semantics, min_df=5 drops terms found in fewer
# than 5 documents and max_df=0.5 drops terms found in more than half of them.
vectorizer = TfidfVectorizer(stop_words=congress, min_df=5, max_df=0.5)
X_sparse = vectorizer.fit_transform(docs)


print(f"TF-IDF shape before PCA: {X_sparse.shape}")

DIMENSION 1

In [None]:
#### NESTED CROSS-VALIDATION -- DIMENSION 1
# Compares Linear, Ridge, Lasso and SGD regressors on the TF-IDF features.
# Outer loop: 5-fold estimate of generalization error.
# Inner loop: 4-fold RandomizedSearchCV that tunes hyperparameters using only
# the outer-training portion. The chosen hyperparameters are printed per fold.
# (RandomForestRegressor is imported but is not part of the comparison.)

from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import loguniform, uniform
import numpy as np
import pandas as pd

# data
X = X_sparse
y = ydim1

# KFold.split() yields *positional* row numbers. Indexing a pandas Series with
# `y[idx]` looks the numbers up as index *labels*, which fails or selects the
# wrong rows as soon as the index has gaps (e.g. after dropna()). A plain
# NumPy array is always indexed by position, matching how X is sliced.
y_values = np.asarray(y, dtype=float)

# === Define models and parameter search spaces ===
# Each entry pairs an unfitted estimator with the distributions sampled by
# RandomizedSearchCV. An empty 'params' dict means "fit directly, no search".
param_distributions = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}  # No hyperparameters to tune
    },
    'Ridge Regression': {
        'model': Ridge(),
        'params': {'alpha': loguniform(1e-3, 1e3)}
    },
    'Lasso Regression': {
        'model': Lasso(max_iter=5000),
        'params': {'alpha': loguniform(1e-4, 10)}
    },
    'SGD Regressor (squared_epsilon_insensitive)': {
        'model': SGDRegressor(
            loss='squared_epsilon_insensitive',
            max_iter=2000,
            random_state=42
        ),
        'params': {
            'alpha': loguniform(1e-5, 1e-1),
            'penalty': ['l1', 'l2', 'elasticnet'],
            # scipy's uniform(loc, scale) samples from [loc, loc + scale],
            # i.e. epsilon is drawn from [0.001, 0.201].
            'epsilon': uniform(0.001, 0.2),
            'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
        }
    }
}

# === Cross-validation===
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)

# === Storage for results ===
nested_results = {}

# === Nested Cross-Validation ===
for name, cfg in param_distributions.items():
    print(f"\n===== {name} =====")
    outer_scores = []
    best_params_per_fold = []

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
        print(f"\n--- Outer Fold {fold + 1} ---")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]

        if cfg['params']:
            # Tune on the outer-training rows only; the outer-test rows are
            # never seen by the search.
            search = RandomizedSearchCV(
                estimator=cfg['model'],
                param_distributions=cfg['params'],
                n_iter=30,
                scoring='r2',
                cv=inner_cv,
                n_jobs=-1,
                random_state=42,
                verbose=0
            )
            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            best_params = search.best_params_
            print(f"Best params (fold {fold + 1}): {best_params}")
        else:
            best_model = cfg['model']
            best_model.fit(X_train, y_train)
            best_params = {}
            print(f"No hyperparameters to tune for {name}.")

        best_params_per_fold.append(best_params)

        # Evaluate on outer test set
        y_pred = best_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # mean_squared_error returns the MSE; take the square root so the
        # value reported as "RMSE" really is a root-mean-squared error.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        max_err = max_error(y_test, y_pred)

        outer_scores.append({
            'fold': fold + 1,
            'r2': r2,
            'rmse': rmse,
            'mae': mae,
            'max_error': max_err
        })

        print(f"Fold {fold + 1} R²: {r2:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f} | MAX ERROR: {max_err:.4f}")

    # Aggregate outer CV results
    mean_r2 = np.mean([r['r2'] for r in outer_scores])
    mean_rmse = np.mean([r['rmse'] for r in outer_scores])
    mean_mae = np.mean([r['mae'] for r in outer_scores])
    mean_maxerr = np.mean([r['max_error'] for r in outer_scores])

    # Summarize best params (if any)
    # NOTE(review): mode() is meaningful for the categorical settings, but for
    # continuous ones such as alpha every fold usually picks a different value,
    # so all values tie and row 0 is merely one of the per-fold values.
    if best_params_per_fold and any(best_params_per_fold):
        params_df = pd.DataFrame(best_params_per_fold)
        mean_params = params_df.mode().iloc[0].to_dict()  # most common best params
    else:
        mean_params = {}

    nested_results[name] = {
        'outer_scores': outer_scores,
        'mean_r2': mean_r2,
        'mean_rmse': mean_rmse,
        'mean_mae': mean_mae,
        'mean_max_error': mean_maxerr,
        'best_params_per_fold': best_params_per_fold,
        'mean_best_params': mean_params
    }

    print(f"\nMean Outer R²: {mean_r2:.4f}")
    print(f"Mean Outer RMSE: {mean_rmse:.4f}")
    print(f"Mean Outer MAE: {mean_mae:.4f}")
    print(f"Mean Outer Max Error: {mean_maxerr:.4f}")
    if mean_params:
        print(f"Most common/best parameters across folds: {mean_params}")

# === Summary of all models ===
print("\n===== Nested CV Summary =====")
for name, res in nested_results.items():
    print(f"\n{name}:")
    print(f"  Mean R² = {res['mean_r2']:.4f}")
    print(f"  Mean RMSE = {res['mean_rmse']:.4f}")
    print(f"  Mean MAE = {res['mean_mae']:.4f}")
    # The key stored above is 'mean_max_error'; 'mean_max_err' raised KeyError.
    print(f"  Mean Max Error = {res['mean_max_error']:.4f}")
    if res['mean_best_params']:
        print(f"  Representative Best Params: {res['mean_best_params']}")


DIMENSION 2

In [None]:
### NESTED CROSS-VALIDATION -- DIMENSION 2
# Same procedure as the Dimension 1 cell, run against the second NOMINATE
# target: a 5-fold outer loop for error estimation with a 4-fold
# RandomizedSearchCV inner loop for hyperparameter tuning.

from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import loguniform, uniform
import numpy as np
import pandas as pd

#  data
X = X_sparse
y = y1

# KFold.split() yields *positional* row numbers. Indexing a pandas Series with
# `y[idx]` looks the numbers up as index *labels*, which fails or selects the
# wrong rows as soon as the index has gaps (e.g. after dropna()). A plain
# NumPy array is always indexed by position, matching how X is sliced.
y_values = np.asarray(y, dtype=float)

# === Define models and parameter search spaces ===
# Each entry pairs an unfitted estimator with the distributions sampled by
# RandomizedSearchCV. An empty 'params' dict means "fit directly, no search".
param_distributions = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}  # No hyperparameters to tune
    },
    'Ridge Regression': {
        'model': Ridge(),
        'params': {'alpha': loguniform(1e-3, 1e3)}
    },
    'Lasso Regression': {
        'model': Lasso(max_iter=5000),
        'params': {'alpha': loguniform(1e-4, 10)}
    },
    'SGD Regressor (squared_epsilon_insensitive)': {
        'model': SGDRegressor(
            loss='squared_epsilon_insensitive',
            max_iter=2000,
            random_state=42
        ),
        'params': {
            'alpha': loguniform(1e-5, 1e-1),
            'penalty': ['l1', 'l2', 'elasticnet'],
            # scipy's uniform(loc, scale) samples from [loc, loc + scale],
            # i.e. epsilon is drawn from [0.001, 0.201].
            'epsilon': uniform(0.001, 0.2),
            'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
        }
    }
}

# === Cross-validation ===
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)

# === Storage for results ===
nested_results = {}

# === Nested Cross-Validation ===
for name, cfg in param_distributions.items():
    print(f"\n===== {name} =====")
    outer_scores = []
    best_params_per_fold = []

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
        print(f"\n--- Outer Fold {fold + 1} ---")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]

        if cfg['params']:
            # Tune on the outer-training rows only; the outer-test rows are
            # never seen by the search.
            search = RandomizedSearchCV(
                estimator=cfg['model'],
                param_distributions=cfg['params'],
                n_iter=30,
                scoring='r2',
                cv=inner_cv,
                n_jobs=-1,
                random_state=42,
                verbose=0
            )
            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            best_params = search.best_params_
            print(f"Best params (fold {fold + 1}): {best_params}")
        else:
            best_model = cfg['model']
            best_model.fit(X_train, y_train)
            best_params = {}
            print(f"No hyperparameters to tune for {name}.")

        best_params_per_fold.append(best_params)

        # Evaluate on outer test set
        y_pred = best_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # mean_squared_error returns the MSE; take the square root so the
        # value reported as "RMSE" really is a root-mean-squared error.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        max_err = max_error(y_test, y_pred)

        outer_scores.append({
            'fold': fold + 1,
            'r2': r2,
            'rmse': rmse,
            'mae': mae,
            'max_error': max_err
        })

        print(f"Fold {fold + 1} R²: {r2:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f} | Max Err: {max_err:.4f}")

    # Aggregate outer CV results
    mean_r2 = np.mean([r['r2'] for r in outer_scores])
    mean_rmse = np.mean([r['rmse'] for r in outer_scores])
    mean_mae = np.mean([r['mae'] for r in outer_scores])
    mean_maxerr = np.mean([r['max_error'] for r in outer_scores])

    # Summarize best params (if any)
    # NOTE(review): mode() is meaningful for the categorical settings, but for
    # continuous ones such as alpha every fold usually picks a different value,
    # so all values tie and row 0 is merely one of the per-fold values.
    if best_params_per_fold and any(best_params_per_fold):
        params_df = pd.DataFrame(best_params_per_fold)
        mean_params = params_df.mode().iloc[0].to_dict()  # most common best params
    else:
        mean_params = {}

    nested_results[name] = {
        'outer_scores': outer_scores,
        'mean_r2': mean_r2,
        'mean_rmse': mean_rmse,
        'mean_mae': mean_mae,
        'mean_max_error': mean_maxerr,
        'best_params_per_fold': best_params_per_fold,
        'mean_best_params': mean_params
    }

    print(f"\nMean Outer R²: {mean_r2:.4f}")
    print(f"Mean Outer RMSE: {mean_rmse:.4f}")
    print(f"Mean Outer MAE: {mean_mae:.4f}")
    print(f"Mean Outer Max Error: {mean_maxerr:.4f}")
    if mean_params:
        print(f"Most common/best parameters across folds: {mean_params}")

# === Summary of all models ===
print("\n===== Nested CV Summary =====")
for name, res in nested_results.items():
    print(f"\n{name}:")
    print(f"  Mean R² = {res['mean_r2']:.4f}")
    print(f"  Mean RMSE = {res['mean_rmse']:.4f}")
    print(f"  Mean MAE = {res['mean_mae']:.4f}")
    # Reported so this summary lists the same four metrics that are collected
    # per fold and printed in the per-model section.
    print(f"  Mean Max Error = {res['mean_max_error']:.4f}")
    if res['mean_best_params']:
        print(f"  Representative Best Params: {res['mean_best_params']}")


FEATURE EXTRACTION - DIMENSION 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

# Fit each model once on the full Dimension 1 data and list the vocabulary
# terms carrying the largest positive and negative coefficients.
X = X_sparse
y = ydim1

# How many words to list on each side (most positive / most negative).
TOP_N = 20

# Column j of X corresponds to feature_names[j].
feature_names = np.array(vectorizer.get_feature_names_out())

# Hyperparameters are fixed literals here.
# NOTE(review): presumably copied from a nested-CV run -- confirm they still
# match the latest search output before interpreting the word lists.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.05589),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y)
    coefs = model.coef_

    # Sort coefficients ascending: most negative first, most positive last.
    sorted_idx = np.argsort(coefs)
    top_neg_idx = sorted_idx[:TOP_N]    # TOP_N most negative
    top_pos_idx = sorted_idx[-TOP_N:]   # TOP_N most positive

    print(f"\n=== {name} ===")
    print("Top positive words (increase Dim 1 score):")
    # Reverse so the largest coefficient is printed first.
    for i in top_pos_idx[::-1]:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")

    print("Top negative words (decrease Dim 1 score):")
    for i in top_neg_idx:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")





FEATURE EXTRACTION - DIMENSION 2

In [None]:
####FEATURE EXTRACTION -- DIMENSION 2

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

# Fit each model once on the full Dimension 2 data and list the vocabulary
# terms carrying the largest positive and negative coefficients.
X = X_sparse
y = y1

# How many words to list on each side (most positive / most negative).
TOP_N = 20

# Column j of X corresponds to feature_names[j].
feature_names = np.array(vectorizer.get_feature_names_out())

# NOTE(review): these literals are identical to the ones used in the
# Dimension 1 feature-extraction cell -- confirm they are the values intended
# for the Dimension 2 target.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.05589),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y)
    coefs = model.coef_

    # Sort coefficients ascending: most negative first, most positive last.
    sorted_idx = np.argsort(coefs)
    top_neg_idx = sorted_idx[:TOP_N]    # TOP_N most negative
    top_pos_idx = sorted_idx[-TOP_N:]   # TOP_N most positive

    print(f"\n=== {name} ===")
    # The models in this cell are fitted on the Dimension 2 target, so the
    # headings must say "Dim 2" (they previously said "Dim 1").
    print("Top positive words (increase Dim 2 score):")
    # Reverse so the largest coefficient is printed first.
    for i in top_pos_idx[::-1]:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")

    print("Top negative words (decrease Dim 2 score):")
    for i in top_neg_idx:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")



