11/16/25: This is an update of the notebook previously labeled "Unreduced_models_NestedCV_withMAXERROR."  Includes the following:

1. Text feature extraction
2. 2nd test of Dimension 2 with geographic area

In [3]:
##import and format data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')

#Upload complete training dataset
all114 = pd.read_csv('training_data_114_final.csv', dtype="string")
all2 = all114.astype({'nominate_dim1':'float', 'nominate_dim2': 'float'})
next114 = all2.dropna()

###variable of NOMINATE values
ydim1 = next114.nominate_dim1
y1 = next114.nominate_dim2

#Full training set text data
final114 = next114['speech']

# OR Upload LEMMATIZED dataset -- DO NOT run this and the complete training dataset upload.  Choose one.
#all114 = pd.read_csv('lemmatized_output.csv', dtype="string")
#docs = all114['speech']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#upload and run the custom stopword list
from congress_stopwords import congress



In [4]:
####LEMMATIZE
doc = final114

import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Function to lemmatize a single text entry
def lemmatize_text(text):
    text = str(text)
    if len(text) > 1_000_000:
        return "TEXT TOO LONG"
    doc = nlp(text)  # Ensure text is string
    return " ".join([token.lemma_ for token in doc])

# Apply to the whole column
docs = doc.apply(lemmatize_text)

# Preview result
print(docs.head())

0    Mr. Speaker . nearly 160 million Americans rec...
1    let I thank my colleague for yielding . and le...
2    but neither the United States nor any state sh...
3    Mr. Speaker . the Energy and Commerce Committe...
4    Mr. Speaker . I rise today to ask my colleague...
Name: speech, dtype: object


In [5]:
# Step 4: Convert text to TF-IDF matrix and make it dense


vectorizer = TfidfVectorizer(stop_words=congress, min_df=5, max_df=0.5)
X_sparse = vectorizer.fit_transform(docs)


print(f"TF-IDF shape before PCA: {X_sparse.shape}")

TF-IDF shape before PCA: (438, 14539)


DIMENSION 1

In [7]:
####CODE REMOVES RF AND DISPLAYS HYPERPARAMETERS

from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import loguniform, uniform
import numpy as np
import pandas as pd

# === Example data ===
# Replace with your actual data
X = X_sparse
y = ydim1

# === Define models and parameter search spaces ===
param_distributions = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}  # No hyperparameters to tune
    },
    'Ridge Regression': {
        'model': Ridge(),
        'params': {'alpha': loguniform(1e-3, 1e3)}
    },
    'Lasso Regression': {
        'model': Lasso(max_iter=5000),
        'params': {'alpha': loguniform(1e-4, 10)}
    },
    'SGD Regressor (squared_epsilon_insensitive)': {
        'model': SGDRegressor(
            loss='squared_epsilon_insensitive',
            max_iter=2000,
            random_state=42
        ),
        'params': {
            'alpha': loguniform(1e-5, 1e-1),
            'penalty': ['l1', 'l2', 'elasticnet'],
            'epsilon': uniform(0.001, 0.2),
            'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
        }
    }
}

# === Cross-validation strategies ===
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)

# === Storage for results ===
nested_results = {}

# === Nested Cross-Validation ===
for name, cfg in param_distributions.items():
    print(f"\n===== {name} =====")
    outer_scores = []
    best_params_per_fold = []

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
        print(f"\n--- Outer Fold {fold + 1} ---")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if cfg['params']:
            search = RandomizedSearchCV(
                estimator=cfg['model'],
                param_distributions=cfg['params'],
                n_iter=30,
                scoring='r2',
                cv=inner_cv,
                n_jobs=-1,
                random_state=42,
                verbose=0
            )
            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            best_params = search.best_params_
            print(f"Best params (fold {fold + 1}): {best_params}")
        else:
            best_model = cfg['model']
            best_model.fit(X_train, y_train)
            best_params = {}
            print(f"No hyperparameters to tune for {name}.")

        best_params_per_fold.append(best_params)

        # Evaluate on outer test set
        y_pred = best_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        max_err = max_error(y_test, y_pred)

        outer_scores.append({
            'fold': fold + 1,
            'r2': r2,
            'rmse': rmse,
            'mae': mae,
            'max_error': max_err
        })

        print(f"Fold {fold + 1} R²: {r2:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f} | MAX ERROR: {max_err:.4f}")

    # Aggregate outer CV results
    mean_r2 = np.mean([r['r2'] for r in outer_scores])
    mean_rmse = np.mean([r['rmse'] for r in outer_scores])
    mean_mae = np.mean([r['mae'] for r in outer_scores])
    mean_maxerr = np.mean([r['max_error'] for r in outer_scores])

    # Summarize best params (if any)
    if best_params_per_fold and any(best_params_per_fold):
        params_df = pd.DataFrame(best_params_per_fold)
        mean_params = params_df.mode().iloc[0].to_dict()  # most common best params
    else:
        mean_params = {}

    nested_results[name] = {
        'outer_scores': outer_scores,
        'mean_r2': mean_r2,
        'mean_rmse': mean_rmse,
        'mean_mae': mean_mae,
        'mean_max_error': mean_maxerr,
        'best_params_per_fold': best_params_per_fold,
        'mean_best_params': mean_params
    }

    print(f"\nMean Outer R²: {mean_r2:.4f}")
    print(f"Mean Outer RMSE: {mean_rmse:.4f}")
    print(f"Mean Outer MAE: {mean_mae:.4f}")
    print(f"Mean Outer Max Error: {mean_maxerr:.4f}")
    if mean_params:
        print(f"Most common/best parameters across folds: {mean_params}")

# === Summary of all models ===
print("\n===== Nested CV Summary =====")
for name, res in nested_results.items():
    print(f"\n{name}:")
    print(f"  Mean R² = {res['mean_r2']:.4f}")
    print(f"  Mean RMSE = {res['mean_rmse']:.4f}")
    print(f"  Mean MAE = {res['mean_mae']:.4f}")
    print(f"  Mean Max Error = {res['mean_max_err']:.4f}")
    if res['mean_best_params']:
        print(f"  Representative Best Params: {res['mean_best_params']}")



===== Linear Regression =====

--- Outer Fold 1 ---
No hyperparameters to tune for Linear Regression.
Fold 1 R²: 0.5239 | RMSE: 0.0981 | MAE: 0.2446 | MAX ERROR: 0.7923

--- Outer Fold 2 ---
No hyperparameters to tune for Linear Regression.
Fold 2 R²: 0.1287 | RMSE: 0.1682 | MAE: 0.3111 | MAX ERROR: 1.3200

--- Outer Fold 3 ---
No hyperparameters to tune for Linear Regression.
Fold 3 R²: 0.4194 | RMSE: 0.1299 | MAE: 0.2850 | MAX ERROR: 1.3200

--- Outer Fold 4 ---
No hyperparameters to tune for Linear Regression.
Fold 4 R²: 0.5372 | RMSE: 0.0910 | MAE: 0.2598 | MAX ERROR: 0.6887

--- Outer Fold 5 ---
No hyperparameters to tune for Linear Regression.
Fold 5 R²: 0.4696 | RMSE: 0.1036 | MAE: 0.2645 | MAX ERROR: 1.0330

Mean Outer R²: 0.4157
Mean Outer RMSE: 0.1182
Mean Outer MAE: 0.2730
Mean Outer Max Error: 1.0308

===== Ridge Regression =====

--- Outer Fold 1 ---
Best params (fold 1): {'alpha': np.float64(0.055895242052179224)}
Fold 1 R²: 0.5313 | RMSE: 0.0966 | MAE: 0.2474 | MAX ERRO

KeyError: 'mean_max_err'

DIMENSION 2

In [9]:
####REMOVES RF AND PRESENTS HYPERPARAMETERS

####CODE REMOVES RF AND DISPLAYS HYPERPARAMETERS

from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error
from scipy.stats import loguniform, uniform
import numpy as np
import pandas as pd

# === Example data ===
# Replace with your actual data
X = X_sparse
y = y1

# === Define models and parameter search spaces ===
param_distributions = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}  # No hyperparameters to tune
    },
    'Ridge Regression': {
        'model': Ridge(),
        'params': {'alpha': loguniform(1e-3, 1e3)}
    },
    'Lasso Regression': {
        'model': Lasso(max_iter=5000),
        'params': {'alpha': loguniform(1e-4, 10)}
    },
    'SGD Regressor (squared_epsilon_insensitive)': {
        'model': SGDRegressor(
            loss='squared_epsilon_insensitive',
            max_iter=2000,
            random_state=42
        ),
        'params': {
            'alpha': loguniform(1e-5, 1e-1),
            'penalty': ['l1', 'l2', 'elasticnet'],
            'epsilon': uniform(0.001, 0.2),
            'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
        }
    }
}

# === Cross-validation strategies ===
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)

# === Storage for results ===
nested_results = {}

# === Nested Cross-Validation ===
for name, cfg in param_distributions.items():
    print(f"\n===== {name} =====")
    outer_scores = []
    best_params_per_fold = []

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
        print(f"\n--- Outer Fold {fold + 1} ---")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if cfg['params']:
            search = RandomizedSearchCV(
                estimator=cfg['model'],
                param_distributions=cfg['params'],
                n_iter=30,
                scoring='r2',
                cv=inner_cv,
                n_jobs=-1,
                random_state=42,
                verbose=0
            )
            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            best_params = search.best_params_
            print(f"Best params (fold {fold + 1}): {best_params}")
        else:
            best_model = cfg['model']
            best_model.fit(X_train, y_train)
            best_params = {}
            print(f"No hyperparameters to tune for {name}.")

        best_params_per_fold.append(best_params)

        # Evaluate on outer test set
        y_pred = best_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        max_err = max_error(y_test, y_pred)

        outer_scores.append({
            'fold': fold + 1,
            'r2': r2,
            'rmse': rmse,
            'mae': mae,
            'max_error': max_err
        })

        print(f"Fold {fold + 1} R²: {r2:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f} | Max Err: {max_err:.4f}")

    # Aggregate outer CV results
    mean_r2 = np.mean([r['r2'] for r in outer_scores])
    mean_rmse = np.mean([r['rmse'] for r in outer_scores])
    mean_mae = np.mean([r['mae'] for r in outer_scores])
    mean_maxerr = np.mean([r['max_error'] for r in outer_scores])

    # Summarize best params (if any)
    if best_params_per_fold and any(best_params_per_fold):
        params_df = pd.DataFrame(best_params_per_fold)
        mean_params = params_df.mode().iloc[0].to_dict()  # most common best params
    else:
        mean_params = {}

    nested_results[name] = {
        'outer_scores': outer_scores,
        'mean_r2': mean_r2,
        'mean_rmse': mean_rmse,
        'mean_mae': mean_mae,
        'mean_max_error': mean_maxerr,
        'best_params_per_fold': best_params_per_fold,
        'mean_best_params': mean_params
    }

    print(f"\nMean Outer R²: {mean_r2:.4f}")
    print(f"Mean Outer RMSE: {mean_rmse:.4f}")
    print(f"Mean Outer MAE: {mean_mae:.4f}")
    print(f"Mean Outer Max Error: {mean_maxerr:.4f}")
    if mean_params:
        print(f"Most common/best parameters across folds: {mean_params}")

# === Summary of all models ===
print("\n===== Nested CV Summary =====")
for name, res in nested_results.items():
    print(f"\n{name}:")
    print(f"  Mean R² = {res['mean_r2']:.4f}")
    print(f"  Mean RMSE = {res['mean_rmse']:.4f}")
    print(f"  Mean MAE = {res['mean_mae']:.4f}")
    if res['mean_best_params']:
        print(f"  Representative Best Params: {res['mean_best_params']}")



===== Linear Regression =====

--- Outer Fold 1 ---
No hyperparameters to tune for Linear Regression.
Fold 1 R²: -0.2981 | RMSE: 0.0702 | MAE: 0.2165 | Max Err: 0.6202

--- Outer Fold 2 ---
No hyperparameters to tune for Linear Regression.
Fold 2 R²: -0.5018 | RMSE: 0.1008 | MAE: 0.2536 | Max Err: 0.7640

--- Outer Fold 3 ---
No hyperparameters to tune for Linear Regression.
Fold 3 R²: 0.0296 | RMSE: 0.0802 | MAE: 0.2291 | Max Err: 0.7769

--- Outer Fold 4 ---
No hyperparameters to tune for Linear Regression.
Fold 4 R²: -0.1813 | RMSE: 0.0720 | MAE: 0.2109 | Max Err: 0.7522

--- Outer Fold 5 ---
No hyperparameters to tune for Linear Regression.
Fold 5 R²: -0.2712 | RMSE: 0.0890 | MAE: 0.2406 | Max Err: 0.6842

Mean Outer R²: -0.2446
Mean Outer RMSE: 0.0824
Mean Outer MAE: 0.2301
Mean Outer Max Error: 0.7195

===== Ridge Regression =====

--- Outer Fold 1 ---
Best params (fold 1): {'alpha': np.float64(0.3905441275210791)}
Fold 1 R²: 0.1412 | RMSE: 0.0465 | MAE: 0.1709 | Max Err: 0.5325

FEATURE EXTRACTION - DIMENSION 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

X = X_sparse
y = ydim1


feature_names = np.array(vectorizer.get_feature_names_out())

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.05589),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y)
    coefs = model.coef_

    # Sort coefficients
    sorted_idx = np.argsort(coefs)
    top_neg_idx = sorted_idx[:20]   # 5 most negative
    top_pos_idx = sorted_idx[-20:]  # 5 most positive

    print(f"\n=== {name} ===")
    print("Top positive words (increase Dim 1 score):")
    for i in top_pos_idx[::-1]:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")

    print("Top negative words (decrease Dim 1 score):")
    for i in top_neg_idx:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")





Training Linear Regression...

=== Linear Regression ===
Top positive words (increase Dim 1 score):
  statehood       0.6347
  lodge           0.5522
  taiwan          0.5386
  sergeant        0.5326
  331             0.5240
  veterans        0.5228
  unborn          0.4785
  bureaucrat      0.4769
  epa             0.4538
  missile         0.4364
  obamacare       0.4123
  trillion        0.4017
  illegal         0.3994
  dad             0.3978
  enemy           0.3955
  alien           0.3941
  patent          0.3890
  coach           0.3809
  liberty         0.3624
  fathers         0.3542
Top negative words (decrease Dim 1 score):
  pollution       -0.5035
  borrower        -0.4888
  aca             -0.4828
  rail            -0.4731
  print           -0.4430
  bureau          -0.4146
  passenger       -0.3964
  wall            -0.3887
  los             -0.3883
  climate         -0.3859
  loophole        -0.3849
  wage            -0.3533
  houston         -0.3527
  fishing         -

FEATURE EXTRACTION - DIMENSION 2

In [None]:
####FEATURE EXTRACTION -- DIMENSION 2

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, SGDRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error

X = X_sparse
y = y1

feature_names = np.array(vectorizer.get_feature_names_out())

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.05589),
    'Lasso Regression': Lasso(max_iter=5000, alpha=0.000498),
    'SGD Regressor (squared_epsilon_insensitive)': SGDRegressor(
        loss='squared_epsilon_insensitive',
        max_iter=2000,
        random_state=42,
        alpha=0.000366,
        penalty='l2',
        epsilon=0.0374,
        learning_rate='adaptive'
    )
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X, y1)
    coefs = model.coef_

    # Sort coefficients
    sorted_idx = np.argsort(coefs)
    top_neg_idx = sorted_idx[:20]   # 5 most negative
    top_pos_idx = sorted_idx[-20:]  # 5 most positive

    print(f"\n=== {name} ===")
    print("Top positive words (increase Dim 1 score):")
    for i in top_pos_idx[::-1]:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")

    print("Top negative words (decrease Dim 1 score):")
    for i in top_neg_idx:
        print(f"  {feature_names[i]:<15} {coefs[i]:.4f}")





Training Linear Regression...

=== Linear Regression ===
Top positive words (increase Dim 1 score):
  print           0.6145
  rural           0.5713
  farm            0.5500
  housing         0.4744
  irs             0.4644
  orleans         0.4501
  intelligence    0.4356
  cftc            0.4302
  bashar          0.4189
  memorandum      0.4108
  alassad         0.4072
  tweet           0.4065
  omaha           0.4028
  veterans        0.3937
  valley          0.3932
  houston         0.3901
  classified      0.3872
  331             0.3799
  november        0.3781
  cyber           0.3685
Top negative words (decrease Dim 1 score):
  carbon          -0.4651
  northern        -0.3982
  progressive     -0.3786
  marijuana       -0.3105
  gmo             -0.3074
  marriage        -0.3070
  hemp            -0.3009
  subsidy         -0.2922
  chemical        -0.2721
  conservation    -0.2702
  contractor      -0.2686
  osc             -0.2683
  corporate       -0.2620
  pollution       -