3/8/25: Updated from the last stable/functional version ("dim1_BOW_train-test-combined"). Two major additions: (1) a different cross-validation method that does not require using **negative** mean absolute error, and (2) an updated and expanded "newsroom" test dataset, added in the hope of resolving the negative R2 issue.

7/14/25: Implemented HK request for word2vec

In [1]:
!pip install gensim



In [2]:
##import and format data
import pandas as pd
import numpy as np

#Upload complete training dataset
# Every column is read as text first, then the speech and the two NOMINATE
# columns are cast to their working dtypes.
all114 = pd.read_csv(
    'training_data_114_final.csv',
    dtype='string',
)
all2 = all114.astype(
    {
        'speech': 'string',
        'nominate_dim1': 'float',
        'nominate_dim2': 'float',
    }
)
# Keep only the rows that have no missing value in any column.
final114 = all2.dropna()


In [3]:
#upload the custom stopword list
from congress_stopwords import congress


In [4]:
#Character count of every speech: a check that no speech data were truncated
#during preprocessing and that the proper number of documents are present.
final114['speech'].str.len()

Unnamed: 0,speech
0,17784
1,25356
2,113985
3,109704
4,45340
...,...
433,90749
434,31889
435,15301
436,24527


In [10]:
import numpy as np
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

# 1. Tokenize speeches
# simple_preprocess turns each speech into a list of lowercase tokens.
tokenized_sentences = [simple_preprocess(speech) for speech in final114.speech]

# 2. Train Word2Vec model
# Tried min_count of 5 and 10 -- the higher value increased max error and
# did not improve R2.
# Training is pinned to an explicit seed and a single worker thread so that
# repeated runs give the same vectors; with several workers, thread scheduling
# changes the result from run to run, which makes hyperparameter comparisons
# like the one above unreliable.
# NOTE: reproducibility across fresh interpreter launches additionally
# requires a fixed PYTHONHASHSEED (see the gensim Word2Vec documentation).
w2v_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,    # dimensionality of word vectors
    window=5,           # context window size
    min_count=5,        # ignore words seen fewer than 5 times
    workers=1,          # single thread: required for deterministic training
    sg=1,               # 1 = skip-gram; 0 = CBOW
    seed=42             # fixed RNG seed for the initial vectors and sampling
)

# 3. Define a function to average word vectors in a document
def document_vector(doc, model):
    """Return the element-wise mean of the vectors of ``doc``'s known tokens.

    doc   -- iterable of tokens for one document.
    model -- object exposing ``wv`` (supports ``in`` and ``[]`` by token)
             and ``vector_size``.

    Tokens missing from ``model.wv`` are skipped. If no token is known, a
    zero vector of length ``model.vector_size`` is returned instead.
    """
    keyed_vectors = model.wv
    known_vectors = [keyed_vectors[token] for token in doc if token in keyed_vectors]
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# 4. Transform all speeches into averaged word vectors
# (one row per speech, built from that speech's token list)
X = np.array(
    [document_vector(tokens, w2v_model) for tokens in tokenized_sentences]
)

# 5. Target variables
# y holds the nominate_dim1 column, y1 the nominate_dim2 column.
y = final114['nominate_dim1']
y1 = final114['nominate_dim2']


In [11]:
#confirming the document count: X should have one row per speech
X.shape[0]

438

In [None]:
#peek at the averaged vectors of two sample documents (rows 100 and 101)
X[100:102, :]

In [8]:
from sklearn.model_selection import RepeatedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, max_error


In [None]:
#TRAINING AND CROSS-VALIDATION -- DIMENSION 1
# Regressors to evaluate, keyed by the name used in the printed report.
# The two tree ensembles are currently switched off.
models = {}
models['Linear Regression'] = LinearRegression()
#models['Random Forest Regressor'] = RandomForestRegressor(n_estimators=600)
#models['Gradient Boosting Regressor'] = GradientBoostingRegressor(n_estimators=1600)

# Cross-validator: 4 folds, a single repeat, fixed random_state.
rkf = RepeatedKFold(
    n_splits=4,
    n_repeats=1,
    random_state=42,
)

# Function to calculate all metrics
def calculate_metrics(model, X, y, cv):
    """Fit ``model`` on every training fold of ``cv`` and score the held-out fold.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit
        in place on each fold.
    X : feature rows, as a NumPy array or a pandas object.
    y : targets aligned row-for-row with ``X``, as a NumPy array or a pandas
        object.
    cv : splitter whose ``split(X)`` yields ``(train_index, test_index)``
        pairs of positional row indices.

    Returns
    -------
    tuple of four lists ``(r2_scores, rmse_scores, mae_scores, max_errors)``,
    each holding one value per fold, in fold order.
    """
    def take_rows(data, positions):
        # The fold indices are row positions. Plain [] on a pandas object
        # looks the integers up as index *labels*, which raises KeyError or
        # picks the wrong rows once the index has gaps (e.g. after dropna()).
        # .iloc always selects by position.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return data[positions]

    r2_scores = []
    rmse_scores = []
    mae_scores = []
    max_errors = []

    for train_index, test_index in cv.split(X):
        X_train, X_test = take_rows(X, train_index), take_rows(X, test_index)
        y_train, y_test = take_rows(y, train_index), take_rows(y, test_index)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics on the held-out fold only
        r2_scores.append(r2_score(y_test, y_pred))
        # RMSE is the square root of the mean squared error
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        max_errors.append(max_error(y_test, y_pred))

    return r2_scores, rmse_scores, mae_scores, max_errors

# Cross-validate every configured model and report the per-fold averages.
for model_name, model in models.items():
    fold_metrics = calculate_metrics(model, X, y, rkf)
    r2_scores, rmse_scores, mae_scores, max_errors = fold_metrics

    # To inspect individual folds, print the *_scores lists themselves.
    averaged = (
        ("R2", r2_scores),
        ("RMSE", rmse_scores),
        ("MAE", mae_scores),
    )
    for label, scores in averaged:
        print(f"{model_name} - Mean {label}: {np.mean(scores)}")
    # The last line carries a trailing newline to separate models in the output.
    print(f"{model_name} - Mean Max Error: {np.mean(max_errors)}\n")
