In [1]:
import pandas as pd
import random
import numpy as np
from sklearn import preprocessing

In [None]:
# Location of the Excel workbook, resolved relative to the working directory
cox_workbook = 'Cox Data Mem Combined.xlsx'
cox = pd.read_excel(cox_workbook)

In [None]:
# Import the fastText bindings and load the pretrained Common Crawl English
# model from its local binary file.
import fasttext

fasttext_bin = 'cc.en.300.bin'
fasttext_model = fasttext.load_model(fasttext_bin)

In [2]:
# Build one two-column frame per role: the word plus that role's memorability score

cue_columns = ['Word', 'Cue Memorability']
target_columns = ['Word', 'Target Memorability']

cue_data = cox[cue_columns]
target_data = cox[target_columns]

In [6]:
# Pull every column out of its frame as a plain Python list, grouped by role

# Cue side: words and their memorability scores
cues = cue_data["Word"].to_list()
cue_mem = cue_data["Cue Memorability"].to_list()

# Target side: words and their memorability scores
targets = target_data['Word'].to_list()
target_mem = target_data["Target Memorability"].to_list()

In [15]:
# Look up the fastText vector of every lower-cased cue word and collect the
# vectors into a single NumPy array (one entry per cue, in list order)
cue_embeddings = np.array(
    [fasttext_model.get_word_vector(word.lower()) for word in cues]
)

In [16]:
# Look up the fastText vector of every lower-cased target word and collect the
# vectors into a single NumPy array (one entry per target, in list order)
target_embeddings = np.array(
    [fasttext_model.get_word_vector(word.lower()) for word in targets]
)

GET BEST HYPERPARAMETERS USING RANDOMIZED SEARCH CROSS VALIDATION BY MAXIMIZING SPEARMAN CORRELATION COEFFICIENT

In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.svm import SVR
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer # To create a custom scorer
import warnings

In [None]:
# Calculate the Spearman correlation coefficient and maximize it

def spearman_correlation(y_true, y_pred):
    """Return the Spearman rank correlation between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true, y_pred : array-like
        The two sequences handed to ``scipy.stats.spearmanr``.

    Returns
    -------
    float
        The correlation coefficient, or ``0.0`` when the coefficient is NaN
        (for example when one of the inputs is constant).
    """
    with warnings.catch_warnings():
        # RuntimeWarnings raised while computing the coefficient (e.g. for a
        # zero standard deviation) are silenced only inside this context.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        rho = spearmanr(y_true, y_pred)[0]

    # A NaN coefficient is mapped to a neutral score of 0.0 rather than propagated.
    return 0.0 if np.isnan(rho) else rho

# Wrap spearman_correlation as a scikit-learn scorer so it can be passed as the
# `scoring` argument of the RandomizedSearchCV objects created further down.
# greater_is_better=True because we want to MAXIMIZE the correlation
spearman_scorer = make_scorer(spearman_correlation, greater_is_better=True)

In [None]:
# Candidate SVR hyperparameter values that the randomized searches sample from.
# Add other SVR params here if needed.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.001, 0.01, 0.1, 1, 'scale', 'auto'],
    kernel=['rbf', 'linear', 'poly'],
    epsilon=[0.01, 0.1, 0.2],
    degree=[2, 3, 4],  # Only relevant for polynomial kernel
)

# 5-fold splitter handed to both hyperparameter searches.
# Shuffling guards against patterns in the row order; the fixed seed keeps the
# fold assignment identical from run to run.
cv_strategy = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Get best hyperparameters for cue embeddings

# Randomized search: 50 hyperparameter combinations are sampled from
# param_grid and each one is scored with 5-fold CV using the Spearman scorer.
# random_state pins the sampled combinations; without it every run draws a
# different set of 50 candidates, so best_params_ / best_score_ are not
# reproducible.
random_search_ft = RandomizedSearchCV(
    estimator=SVR(),
    param_distributions=param_grid,
    scoring=spearman_scorer,
    n_iter=50,
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
random_search_ft.fit(cue_embeddings, cue_mem)


# Get the best parameters and the best score
print("Best parameters found: ", random_search_ft.best_params_)
print("Best cross-validation score (Highest corr): ", random_search_ft.best_score_)

# The best model is automatically refit on the whole training data
# (RandomizedSearchCV's default refit=True)
best_svr_model_cue_ft = random_search_ft.best_estimator_

In [None]:
# Get best hyperparameters for target embeddings

# Randomized search: 50 hyperparameter combinations are sampled from
# param_grid and each one is scored with 5-fold CV using the Spearman scorer.
# random_state pins the sampled combinations; without it every run draws a
# different set of 50 candidates, so best_params_ / best_score_ are not
# reproducible.
random_search_ft_t = RandomizedSearchCV(
    estimator=SVR(),
    param_distributions=param_grid,
    scoring=spearman_scorer,
    n_iter=50,
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
# Fit the randomized search to the target data.
# This runs one (non-nested) 5-fold cross-validation per sampled combination.
random_search_ft_t.fit(target_embeddings, target_mem)

# Get the best parameters and the best score
print("Best parameters found: ", random_search_ft_t.best_params_)
print("Best cross-validation score (Highest Corr): ", random_search_ft_t.best_score_)

# The best model is automatically refit on the whole training data
# (RandomizedSearchCV's default refit=True)
best_svr_model_target_ft = random_search_ft_t.best_estimator_

CUE MODEL

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, pearsonr
from sklearn.model_selection import KFold
from scipy.stats import spearmanr

# Initialize K-Fold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store performance metrics for each fold
mse_scores = []
spearman_scores = []

# Support Vector Regression Model
# NOTE(review): these hyperparameters are hard-coded rather than read from
# random_search_ft.best_params_ — confirm they match the search result.
# `degree` is only used by the polynomial kernel, so it has no effect with 'rbf'.
svm_model = SVR(C=0.1, degree=2, epsilon=0.2, kernel='rbf', gamma='scale')

# Convert the score list to an array once, so it can be indexed with the fold
# index arrays without rebuilding it inside the loop.
cue_mem_array = np.asarray(cue_mem)

# Perform k-fold cross validation
for fold, (train_idx, val_idx) in enumerate(kf.split(cue_embeddings)):
    # Split data into training and validation sets for this fold
    X_train_fold = cue_embeddings[train_idx]
    y_train_fold = cue_mem_array[train_idx]
    X_val_fold = cue_embeddings[val_idx]
    y_val_fold = cue_mem_array[val_idx]

    # Train the model (each fit() call replaces the previous fold's fit)
    svm_model.fit(X_train_fold, y_train_fold)

    # Make predictions on validation set
    y_val_pred = svm_model.predict(X_val_fold)

    # Calculate metrics; only the correlation coefficient is needed, not the p-value
    mse = mean_squared_error(y_val_fold, y_val_pred)
    r_value = spearmanr(y_val_fold, y_val_pred)[0]

    mse_scores.append(mse)
    spearman_scores.append(r_value)

    print(f"Fold {fold + 1}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Spearman Correlation: {r_value:.4f}")
    print("-" * 40)

# Calculate and print average performance
print("Average Performance:")
print(f"Mean MSE: {np.mean(mse_scores):.4f} (+/- {np.std(mse_scores):.4f})")
print(f"Mean Spearman Correlation: {np.mean(spearman_scores):.4f} (+/- {np.std(spearman_scores):.4f})")

# Train final model on all data for subsequent use
svm_model.fit(cue_embeddings, cue_mem)

Fold 1:
Mean Squared Error: 0.0198
Spearman Correlation: 0.6843
----------------------------------------
Fold 2:
Mean Squared Error: 0.0162
Spearman Correlation: 0.6483
----------------------------------------
Fold 3:
Mean Squared Error: 0.0161
Spearman Correlation: 0.7014
----------------------------------------
Fold 4:
Mean Squared Error: 0.0145
Spearman Correlation: 0.6728
----------------------------------------
Fold 5:
Mean Squared Error: 0.0140
Spearman Correlation: 0.6989
----------------------------------------
Average Performance:
Mean MSE: 0.0161 (+/- 0.0020)
Mean Spearman Correlation: 0.6811 (+/- 0.0194)


TARGET MODEL

In [37]:
# Initialize K-Fold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store performance metrics for each fold
mse_scores_t = []
spearman_scores_t = []

# Support Vector Regression Model
# NOTE(review): these hyperparameters are hard-coded rather than read from
# random_search_ft_t.best_params_ — confirm they match the search result.
# `degree` is only used by the polynomial kernel, so it has no effect with 'rbf'.
svm_model_t = SVR(C=0.1, degree=2, epsilon=0.2, kernel='rbf', gamma='scale')

# Convert the score list to an array once, so it can be indexed with the fold
# index arrays without rebuilding it inside the loop.
target_mem_array = np.asarray(target_mem)

# Perform k-fold cross validation
for fold, (train_idx, val_idx) in enumerate(kf.split(target_embeddings)):
    # Split data into training and validation sets for this fold
    X_train_fold = target_embeddings[train_idx]
    y_train_fold = target_mem_array[train_idx]
    X_val_fold = target_embeddings[val_idx]
    y_val_fold = target_mem_array[val_idx]

    # Train the model (each fit() call replaces the previous fold's fit)
    svm_model_t.fit(X_train_fold, y_train_fold)

    # Make predictions on validation set
    y_val_pred = svm_model_t.predict(X_val_fold)

    # Calculate metrics; only the correlation coefficient is needed, not the p-value
    mse = mean_squared_error(y_val_fold, y_val_pred)
    r_value = spearmanr(y_val_fold, y_val_pred)[0]

    mse_scores_t.append(mse)
    spearman_scores_t.append(r_value)

    print(f"Fold {fold + 1}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Spearman Correlation: {r_value:.4f}")
    print("-" * 40)

# Calculate and print average performance
print("Average Performance:")
print(f"Mean MSE: {np.mean(mse_scores_t):.4f} (+/- {np.std(mse_scores_t):.4f})")
print(f"Mean Spearman Correlation: {np.mean(spearman_scores_t):.4f} (+/- {np.std(spearman_scores_t):.4f})")

# Train final model on all data for subsequent use
svm_model_t.fit(target_embeddings, target_mem)

Fold 1:
Mean Squared Error: 0.0136
Spearman Correlation: 0.4085
----------------------------------------
Fold 2:
Mean Squared Error: 0.0112
Spearman Correlation: 0.2637
----------------------------------------
Fold 3:
Mean Squared Error: 0.0096
Spearman Correlation: 0.2919
----------------------------------------
Fold 4:
Mean Squared Error: 0.0122
Spearman Correlation: 0.2885
----------------------------------------
Fold 5:
Mean Squared Error: 0.0143
Spearman Correlation: 0.2178
----------------------------------------
Average Performance:
Mean MSE: 0.0122 (+/- 0.0017)
Mean Spearman Correlation: 0.2941 (+/- 0.0631)


SAVE MODELS

In [None]:
import pickle

# Serialize each fitted SVR to its own pickle file (cue model first, then target)
for pkl_path, fitted_svr in (
    ('ft_svm_model_cue.pkl', svm_model),
    ('ft_svm_model_target.pkl', svm_model_t),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_svr, f)