In [2]:
##import and format data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')

#Upload complete training dataset
all114 = pd.read_csv('training_data_114_final.csv', dtype="string")
all2 = all114.astype({'nominate_dim1':'float', 'nominate_dim2': 'float'})
next114 = all2.dropna()

###variable of NOMINATE values
y = next114.nominate_dim1
y1 = next114.nominate_dim2

# OR Upload LEMMATIZED dataset -- DO NOT run this and the complete training dataset upload.  Choose one.
all114 = pd.read_csv('lemmatized_output.csv', dtype="string")
docs = all114['speech']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
#upload and run the custom stopword list
from congress_stopwords import congress



In [4]:
# Step 4: Convert text to TF-IDF matrix and make it dense


vectorizer = TfidfVectorizer(stop_words=congress, min_df=5, max_df=0.5)
X_sparse = vectorizer.fit_transform(docs)


print(f"TF-IDF shape before PCA: {X_sparse.shape}")

TF-IDF shape before PCA: (438, 14539)


DIMENSION 1

In [5]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error, make_scorer

# --- Your data ---
# Replace these with your actual data
X = X_sparse
y = y

# --- Define pipeline: SVD + Random Forest ---
pipeline = Pipeline([
    ('svd', TruncatedSVD(random_state=42)),
    ('rf', RandomForestRegressor(random_state=42))
])

# --- Define hyperparameter space ---
param_grid = {
    'svd__n_components': [100, 150, 200, 250, 300],
    'rf__n_estimators': [100, 200, 400, 800, 1000],
    'rf__max_depth': [10, 20, 40],
    'rf__min_samples_split': [2, 5],
}

# --- Define multiple scoring metrics ---
scoring = {
    'r2': make_scorer(r2_score),
    'rmse': make_scorer(lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False)),
    'mae': make_scorer(mean_absolute_error),
    'max_error': make_scorer(max_error)
}

# --- Define outer and inner CV loops ---
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)

# --- Storage for results ---
outer_results = []

# --- Outer loop ---
for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
    print(f"\n=== Outer Fold {fold+1} ===")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # --- Inner loop: hyperparameter tuning ---
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_grid,
        n_iter=30,  # number of random combinations
        scoring='r2',
        cv=inner_cv,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    random_search.fit(X_train, y_train)

    # --- Get best model from inner CV ---
    best_model = random_search.best_estimator_
    print("Best params:", random_search.best_params_)

    # --- Evaluate on outer test set ---
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    max_err = max_error(y_test, y_pred)

    outer_results.append({'fold': fold+1, 'r2': r2, 'rmse': rmse, 'mae': mae, 'max_error': max_err})

# --- Summary of outer results ---
print("\n=== Nested Cross-Validation Results ===")
for res in outer_results:
    print(f"Fold {res['fold']}: R2={res['r2']:.4f}, RMSE={res['rmse']:.4f}, MAE={res['mae']:.4f}, MaxErr={res['max_error']:.4f}")

mean_r2 = np.mean([r['r2'] for r in outer_results])




=== Outer Fold 1 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 1000, 'rf__min_samples_split': 5, 'rf__max_depth': 10}

=== Outer Fold 2 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 200, 'rf__min_samples_split': 5, 'rf__max_depth': 10}

=== Outer Fold 3 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 1000, 'rf__min_samples_split': 5, 'rf__max_depth': 10}

=== Outer Fold 4 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 400, 'rf__min_samples_split': 2, 'rf__max_depth': 40}

=== Outer Fold 5 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 400, 'rf__min_samples_split': 2, 'rf__max_depth': 40}

=== Nes

DIMENSION 2

In [5]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, max_error, make_scorer

# --- Your data ---
X = X_sparse
y = y1

# --- Define pipeline: SVD + Random Forest ---
pipeline = Pipeline([
    ('svd', TruncatedSVD(random_state=42)),
    ('rf', RandomForestRegressor(random_state=42))
])

# --- Define hyperparameter space ---
param_grid = {
    'svd__n_components': [100, 150, 200, 250, 300],
    'rf__n_estimators': [100, 200, 400, 800, 1000],
    'rf__max_depth': [10, 20, 40],
    'rf__min_samples_split': [2, 5],
}

# --- Define multiple scoring metrics ---
scoring = {
    'r2': make_scorer(r2_score),
    'rmse': make_scorer(lambda y_true, y_pred: mean_squared_error(y_true, y_pred)),
    'mae': make_scorer(mean_absolute_error),
    'max_error': make_scorer(max_error)
}

# --- Define outer and inner CV loops ---
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)

# --- Storage for results ---
outer_results = []

# --- Outer loop ---
for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
    print(f"\n=== Outer Fold {fold+1} ===")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # --- Inner loop: hyperparameter tuning ---
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_grid,
        n_iter=30,  # number of random combinations
        scoring='r2',
        cv=inner_cv,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    random_search.fit(X_train, y_train)

    # --- Get best model from inner CV ---
    best_model = random_search.best_estimator_
    print("Best params:", random_search.best_params_)

    # --- Evaluate on outer test set ---
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    max_err = max_error(y_test, y_pred)

    outer_results.append({'fold': fold+1, 'r2': r2, 'rmse': rmse, 'mae': mae, 'max_error': max_err})

# --- Summary of outer results ---
print("\n=== Nested Cross-Validation Results ===")
for res in outer_results:
    print(f"Fold {res['fold']}: R2={res['r2']:.4f}, RMSE={res['rmse']:.4f}, MAE={res['mae']:.4f}, MaxErr={res['max_error']:.4f}")

mean_r2 = np.mean([r['r2'] for r in outer_results])
print(f"\nMean Outer R² (Generalization Estimate): {mean_r2:.4f}")
print(f"\nMean RMSE (Generalization Estimate): {mean_rmse:.4f}")
print(f"\nMean MAE: {mean_mae:.4f}")
print(f"\nMean Maximum Error: {mean_err:.4f}")


=== Outer Fold 1 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 200, 'rf__min_samples_split': 2, 'rf__max_depth': 20}

=== Outer Fold 2 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 200, 'rf__min_samples_split': 5, 'rf__max_depth': 10}

=== Outer Fold 3 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 1000, 'rf__min_samples_split': 5, 'rf__max_depth': 40}

=== Outer Fold 4 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 100, 'rf__n_estimators': 400, 'rf__min_samples_split': 2, 'rf__max_depth': 40}

=== Outer Fold 5 ===
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best params: {'svd__n_components': 300, 'rf__n_estimators': 800, 'rf__min_samples_split': 2, 'rf__max_depth': 10}

=== Nest

NameError: name 'mean_rmse' is not defined