In [None]:
##import and format data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')

# Load the complete training dataset. Every column is read as text first; the two
# NOMINATE columns are then cast to float so they can be used as regression targets.
all114 = pd.read_csv('training_data_114_final.csv', dtype="string")
all2 = all114.astype({'nominate_dim1':'float', 'nominate_dim2': 'float'})
next114 = all2.dropna()  # rows with a missing value in any column are dropped

### Regression targets: first and second NOMINATE dimensions
y = next114.nominate_dim1
y1 = next114.nominate_dim2

# Load the pre-lemmatized speeches that are fed to the vectorizer.
# NOTE(review): the earlier comment said to run only one of the two uploads, but
# y / y1 are defined only by the upload above, so both are executed here.
# `all114` is rebound to the lemmatized frame from this point on.
all114 = pd.read_csv('lemmatized_output.csv', dtype="string")
docs = all114['speech']

# The documents and the targets come from two different files and are paired purely
# by position later on, so surface a row-count mismatch now rather than after the
# expensive TF-IDF / SVD steps.
import warnings
if len(docs) != len(y):
    warnings.warn(
        f"lemmatized_output.csv has {len(docs)} rows but the training targets have "
        f"{len(y)} rows; speeches and NOMINATE scores will not line up."
    )


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#upload and run the custom stopword list
from congress_stopwords import congress



In [None]:
####LEMMATIZE
# Raw training speeches to lemmatize. This cell previously read `final114`, a name
# that is never assigned in this notebook, so it raised NameError on a fresh kernel.
# NOTE(review): assumes the raw training CSV exposes the same 'speech' column that
# lemmatized_output.csv does -- confirm. Taking it from next114 keeps the rows
# aligned with y / y1.
doc = next114['speech']

import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Return `text` with each token replaced by its spaCy lemma, joined by spaces.

    The value is coerced to ``str`` first. Inputs longer than 1,000,000 characters
    are not parsed; the placeholder ``"TEXT TOO LONG"`` is returned instead.
    """
    text = str(text)
    if len(text) > 1_000_000:
        return "TEXT TOO LONG"
    parsed = nlp(text)
    return " ".join(token.lemma_ for token in parsed)


# Apply to the whole column
docs = doc.apply(lemmatize_text)

# Preview result
print(docs.head())

In [None]:
# Step 4: Build the TF-IDF document-term matrix from `docs`.
# A term is kept only if it appears in at least 5 documents and in no more than
# 50% of them; the imported `congress` stop words are removed as well.
# The matrix is left sparse -- no dense copy is created in this cell.
vectorizer = TfidfVectorizer(
    stop_words=congress,
    min_df=5,
    max_df=0.5,
)
X_sparse = vectorizer.fit_transform(docs)

print(f"TF-IDF shape before PCA: {X_sparse.shape}")

In [None]:
from sklearn.decomposition import TruncatedSVD

#SVD = 100
# Reduce the sparse TF-IDF matrix to 100 components with truncated SVD.
# random_state is pinned: the default solver is randomized, so without a seed the
# fitted components (and every model trained or reloaded against them) change from
# one kernel session to the next.
n_components = 100
svd100 = TruncatedSVD(n_components=n_components, random_state=42)
X100 = svd100.fit_transform(X_sparse)

print(f"PCA shape after reduction: {X100.shape}") # second dimension should equal n_components

In [None]:
from sklearn.decomposition import TruncatedSVD

#SVD = 250
# Reduce the sparse TF-IDF matrix to 250 components with truncated SVD.
# random_state is pinned: the default solver is randomized, so without a seed the
# fitted components (and every model trained or reloaded against them) change from
# one kernel session to the next.
n_components = 250
svd250 = TruncatedSVD(n_components=n_components, random_state=42)
X250 = svd250.fit_transform(X_sparse)

print(f"PCA shape after reduction: {X250.shape}") # second dimension should equal n_components

VALIDATION

In [None]:
###116th CONGRESSIONAL RECORD speeches
# Read the comma-separated validation file with every column as text, cast the
# 'score' and 'dim2' columns to float, then discard rows with any missing value.
all116 = pd.read_csv('116incCR1.txt', sep=',', dtype="string")
all2 = all116.astype({column: 'float' for column in ('score', 'dim2')})
cr116_1 = all2.dropna()


In [None]:
# Text column of the cleaned validation frame; lemmatized in the next cell.
val_speech = cr116_1['text']


In [None]:
# Validation speeches to lemmatize.
doc = val_speech

import spacy

# spaCy English pipeline used for lemmatization
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Lemmatize one text entry with the module-level `nlp` pipeline.

    The entry is converted to ``str``; anything longer than 1,000,000 characters
    is replaced by the placeholder ``"TEXT TOO LONG"`` instead of being parsed.
    Otherwise the lemmas of all tokens are returned joined by single spaces.
    """
    raw = str(text)
    if len(raw) <= 1_000_000:
        lemmas = [tok.lemma_ for tok in nlp(raw)]
        return " ".join(lemmas)
    return "TEXT TOO LONG"


# Lemmatize every validation speech
val_speeches = doc.apply(lemmatize_text)

# Quick look at the first few results
print(val_speeches.head())

In [None]:
# Number of lemmatized validation speeches (displayed as the cell output).
len(val_speeches)

In [None]:
# Vectorize the lemmatized validation speeches with the already-fitted `vectorizer`
# (transform only -- nothing is re-fitted on validation data).
X_val = vectorizer.transform(val_speeches)

# Validation targets: 'score' is compared with dimension-1 predictions and
# 'dim2' with dimension-2 predictions in the evaluation cells below.
y_val = cr116_1['score']
y_val1 = cr116_1['dim2']

In [None]:
# Shape of the validation TF-IDF matrix (displayed as the cell output).
X_val.shape

In [None]:
#SVD = 250
# Project the validation TF-IDF matrix with the already-fitted `svd250`
# (transform only; the SVD is not re-fitted here).
X_val_svd = svd250.transform(X_val)

print(f"shape after reduction: {X_val_svd.shape}") # second dimension should equal n_components

In [None]:
#SVD = 100
# Project the validation TF-IDF matrix with the already-fitted `svd100`
# (transform only; the SVD is not re-fitted here).
X_val_svd100 = svd100.transform(X_val)

print(f"shape after reduction: {X_val_svd100.shape}") # second dimension should equal n_components

In [None]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error, max_error

###VALIDATION DATA 1 --DIMENSION 1-- 116 CR -- SVD 250
# Fit a random forest on the 250-component training features against the first
# NOMINATE dimension, then evaluate it on the validation features.
# random_state is pinned so the forest is deterministic for a given feature matrix;
# otherwise the metrics below and the model exported in the next cell change on
# every run.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
model.fit(X250, y)

y_val_pred = model.predict(X_val_svd)

# Goodness of fit and error magnitudes on the validation set
r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae_val = mean_absolute_error(y_val, y_val_pred)
max_err_val = max_error(y_val, y_val_pred)

# Print the evaluation metrics for the validation data
print("Validation Set Evaluation:")
print(f"R-squared: {r2_val}")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"Max Error: {max_err_val}")



In [None]:
###export trained model for SVD = 250
# Serializes whatever estimator is currently bound to `model` to 'svd250_full.pkl'
# in the working directory. The candidate-prediction cell further down reloads this
# file, so run this cell right after the SVD-250 / dimension-1 fit.
import joblib

joblib.dump(model,'svd250_full.pkl')

In [None]:
#### CR116 - SVD 100
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error, max_error

###VALIDATION DATA 1 --DIMENSION 1-- 116 CR -- SVD 100
# Fit a random forest on the 100-component training features against the first
# NOMINATE dimension, then evaluate it on the validation features.
# random_state is pinned so the forest is deterministic for a given feature matrix
# and the metrics below can be reproduced.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
model.fit(X100, y)

y_val_pred = model.predict(X_val_svd100)

# Goodness of fit and error magnitudes on the validation set
r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae_val = mean_absolute_error(y_val, y_val_pred)
max_err_val = max_error(y_val, y_val_pred)

# Print the evaluation metrics for the validation data
print("Validation Set Evaluation:")
print(f"R-squared: {r2_val}")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"Max Error: {max_err_val}")



PREDICTION OF CANDIDATE VALUES

In [None]:
# Read the comma-separated candidate file with every column as text and keep only
# fully populated rows. Note that `all116` is rebound here; the validation data
# remains available through `cr116_1`.
all116 = pd.read_csv('116cand.csv', sep=',', dtype="string")
cand116 = all116.dropna()
cand116.shape

In [None]:
# Text column of the cleaned candidate frame; lemmatized in the next cell.
cand_speech = cand116['text']

In [None]:
# Candidate texts to lemmatize.
doc = cand_speech

import spacy

# spaCy English pipeline used for lemmatization
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Lemmatize one text entry with the module-level `nlp` pipeline.

    The entry is converted to ``str``; anything longer than 1,000,000 characters
    is replaced by the placeholder ``"TEXT TOO LONG"`` instead of being parsed.
    Otherwise the lemmas of all tokens are returned joined by single spaces.
    """
    raw = str(text)
    if len(raw) <= 1_000_000:
        lemmas = [tok.lemma_ for tok in nlp(raw)]
        return " ".join(lemmas)
    return "TEXT TOO LONG"


# Lemmatize every candidate text
cand_speeches = doc.apply(lemmatize_text)

# Quick look at the first few results
print(cand_speeches.head())

In [None]:
# Vectorize the lemmatized candidate texts with the already-fitted `vectorizer`
# (transform only).
cand_test = vectorizer.transform(cand_speeches)


In [None]:
#SVD = 250
# Project the candidate TF-IDF matrix with the already-fitted `svd250`
# (transform only; the SVD is not re-fitted here).
X_cand_svd250 = svd250.transform(cand_test)

print(f"shape after reduction: {X_cand_svd250.shape}") # second dimension should equal n_components

In [None]:
###TEST -- DIMENSION 1 -- CANDIDATE PREDICTIONS
# Predict first-dimension scores for the candidates with the exported SVD-250 model.
#
# This cell used to build a RandomForestRegressor without assigning it and then
# re-fit whatever estimator `model` happened to hold on (X250, y); that fit was
# thrown away immediately because `model` is replaced by the pickle below. The
# dead construct-and-fit is removed; the predictions come from the same file as
# before.
import joblib

# NOTE: joblib/pickle files execute code on load -- only load files you created.
# NOTE(review): the pickle is only meaningful if it was trained on features from the
# same fitted `svd250` that produced X_cand_svd250 -- confirm when reusing an old file.
model = joblib.load('svd250_full.pkl')
y_cand_pred = model.predict(X_cand_svd250)

In [None]:
# Show the candidate predictions currently held in `y_cand_pred` (the name is
# reassigned by each candidate-prediction cell, so this reflects the last one run).
print(y_cand_pred)

In [None]:
#SVD = 100
# Project the candidate TF-IDF matrix with the already-fitted `svd100`
# (transform only; the SVD is not re-fitted here).
X_cand_svd100 = svd100.transform(cand_test)

print(f"shape after reduction: {X_cand_svd100.shape}") # second dimension should equal n_components

In [None]:
###TEST -- DIMENSION 1 -- CANDIDATE PREDICTIONS
# Predict first-dimension scores for the candidates with the SVD-100 model.
#
# This cell used to build a RandomForestRegressor without assigning it, re-fit
# whatever `model` happened to hold, and then load 'svd100_full.pkl' -- a file that
# no cell in this notebook writes, so a clean run failed with FileNotFoundError.
# Now: reuse the pickle when it exists (previous behaviour), otherwise train the
# intended model on (X100, y) and save it under the expected name.
import os
import joblib

model_path = 'svd100_full.pkl'
if os.path.exists(model_path):
    # NOTE: joblib/pickle files execute code on load -- only load files you created.
    # NOTE(review): an existing pickle is only meaningful if it was trained on
    # features from the same fitted `svd100` -- confirm when reusing an old file.
    model = joblib.load(model_path)
else:
    model = RandomForestRegressor(
        n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
    )
    model.fit(X100, y)
    joblib.dump(model, model_path)

y_cand_pred = model.predict(X_cand_svd100)

In [None]:
# Show the candidate predictions currently held in `y_cand_pred` (the name is
# reassigned by each candidate-prediction cell, so this reflects the last one run).
print(y_cand_pred)

DIMENSION 2

In [None]:
###VALIDATION DATA 1 --DIMENSION 2-- 116 CR -- SVD250
# Fit a random forest on the 250-component training features against the second
# NOMINATE dimension, then evaluate it on the validation features.
# random_state is pinned so the forest is deterministic for a given feature matrix
# and the metrics below can be reproduced.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
model.fit(X250, y1)

y_val_pred1 = model.predict(X_val_svd)

# Goodness of fit and error magnitudes on the validation set
r2_val = r2_score(y_val1, y_val_pred1)
rmse_val = np.sqrt(mean_squared_error(y_val1, y_val_pred1))
mae_val = mean_absolute_error(y_val1, y_val_pred1)
max_err_val = max_error(y_val1, y_val_pred1)

# Print the evaluation metrics for the validation data
print("Validation Set Evaluation:")
print(f"R-squared: {r2_val}")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"Max Error: {max_err_val}")



In [None]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error, max_error

###VALIDATION DATA 1 --DIMENSION 2-- 116 CR -- SVD100
# Fit a random forest on the 100-component training features against the second
# NOMINATE dimension, then evaluate it on the validation features.
# Note this cell uses 800 trees and min_samples_split=2, unlike the other fits.
# random_state is pinned so the forest is deterministic for a given feature matrix
# and the metrics below can be reproduced.
model = RandomForestRegressor(
    n_estimators=800,
    max_depth=10,
    min_samples_split=2,
    random_state=42,
)
model.fit(X100, y1)

y_val_pred1 = model.predict(X_val_svd100)

# Goodness of fit and error magnitudes on the validation set
r2_val = r2_score(y_val1, y_val_pred1)
rmse_val = np.sqrt(mean_squared_error(y_val1, y_val_pred1))
mae_val = mean_absolute_error(y_val1, y_val_pred1)
max_err_val = max_error(y_val1, y_val_pred1)

# Print the evaluation metrics for the validation data
print("Validation Set Evaluation:")
print(f"R-squared: {r2_val}")
print(f"RMSE: {rmse_val}")
print(f"MAE: {mae_val}")
print(f"Max Error: {max_err_val}")



In [None]:
###TEST -- DIMENSION 2 -- CANDIDATE PREDICTIONS
# Train the second-dimension model on the 250-component features and predict the
# candidates.
#
# Two fixes relative to the earlier version of this cell:
#  * the RandomForestRegressor was constructed but never assigned, so the fit below
#    silently re-trained the estimator left over from the previous cell (with that
#    cell's hyperparameters) instead of the one written here;
#  * the model was dumped to 'svd250_full.pkl', overwriting the dimension-1 export
#    that the dimension-1 candidate cell loads. It now goes to its own file.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
)
model.fit(X250,y1)

import joblib
joblib.dump(model,'svd250_dim2_full.pkl')


y_cand_pred = model.predict(X_cand_svd250)

In [None]:
###TEST -- DIMENSION 2 -- CANDIDATE PREDICTIONS -- SVD 100
# Train the second-dimension model on the 100-component features and predict the
# candidates.
#
# The RandomForestRegressor was previously constructed but never assigned, so the
# fit silently re-trained the estimator left over from the previous cell (with that
# cell's hyperparameters). It is now bound to `model` before fitting.
# The model is intentionally not exported here: the disabled dump targeted
# 'svd100_full.pkl', the file name the dimension-1 SVD-100 cell loads.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=10, min_samples_split=5, random_state=42
)
model.fit(X100,y1)

y_cand_pred = model.predict(X_cand_svd100)

In [None]:
# Show the candidate predictions currently held in `y_cand_pred`. The name is
# reassigned by each candidate-prediction cell, so after the two dimension-2 cells
# above this prints the SVD-100 result; the SVD-250 result is not printed separately.
print(y_cand_pred)