In [None]:
import os
import pandas as pd
import numpy as np
import torch
import settings
import json
import pyro
import pyro.contrib.gp as gp
import pyro.distributions as dist
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from sentence_transformers import SentenceTransformer
from scipy.stats import pearsonr
from scipy.spatial import distance
from sklearn.model_selection import KFold
from modules.nnets import BasicFFNet
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from scipy.stats import pearsonr
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Every data/model path in this notebook is relative, so the working directory
# must be the project root named by PROJECT_WORKING_DIRECTORY. Fail early with
# a readable message instead of the TypeError that expanduser(None) would raise.
project_dir = os.getenv('PROJECT_WORKING_DIRECTORY')
if not project_dir:
    raise RuntimeError(
        "PROJECT_WORKING_DIRECTORY is not set; define it in the environment "
        "or in the .env file before running this notebook."
    )
os.chdir(os.path.expanduser(project_dir))

In [None]:
# Input files, relative to the project working directory.
STS_TEST_PATH = 'data/stsbenchmark/test.csv'
INTERVIEW_XLSX_PATH = 'data/full_data.xlsx'

sts_test = pd.read_csv(STS_TEST_PATH)
# sheet_name=1 selects the workbook's second sheet (integer positions are zero-indexed).
interview_data = pd.read_excel(INTERVIEW_XLSX_PATH, sheet_name=1)

In [None]:
# Identifier of the pretrained sentence-embedding model shared by all feature
# extraction and the cosine-similarity baseline in this notebook.
ENCODER_NAME = "sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1"
encoder = SentenceTransformer(ENCODER_NAME)

def gen_features(encoder,s1,s2):
    """Build pairwise sentence features ``[u, v, |u - v|]``.

    Parameters
    ----------
    encoder : object exposing ``encode(sentences)``
        Used to embed both inputs.
    s1, s2 : sequence of str, or a single str
        First and second sentence of each pair; position ``i`` of ``s1`` is
        paired with position ``i`` of ``s2``.

    Returns
    -------
    numpy.ndarray
        One row per pair, holding the embedding of ``s1``, the embedding of
        ``s2`` and their element-wise absolute difference, concatenated along
        the feature axis (axis 1).
    """
    # A single string is assumed to encode to a 1-D vector; promote it to one
    # row so the axis=1 concatenation also works for an individual pair.
    u = np.atleast_2d(encoder.encode(s1))
    v = np.atleast_2d(encoder.encode(s2))
    return np.concatenate([u, v, np.abs(u - v)], axis=1)

# Pairwise feature matrices for the STS benchmark sentence pairs ...
sts_features = gen_features(
    encoder,
    sts_test['sentence1'].values,
    sts_test['sentence2'].values,
)
Xsts = torch.tensor(sts_features)

# ... and for the interview (user text vs. designer text) pairs.
interview_features = gen_features(
    encoder,
    interview_data['user_text'].values,
    interview_data['designer_text'].values,
)
Xinterview = torch.tensor(interview_features)

# Targets are rescaled by fixed divisors and cast to float32.
# NOTE(review): presumably 5 and 2 are the maxima of the respective rating
# scales, which would map both targets onto [0, 1] — confirm against the data.
sts_scaled = sts_test['score'].values / 5.
ysts = torch.tensor(sts_scaled).float()
interview_scaled = interview_data['avg_EA'].values / 2.
yinterview = torch.tensor(interview_scaled).float()

In [None]:
# --- Sparse variational GP -------------------------------------------------
# Re-create the model structure, then restore the trained parameters from disk.
kernel = gp.kernels.RBF(input_dim=Xsts.shape[1])
likelihood = gp.likelihoods.Gaussian()
# 30 randomly picked feature rows act as the initial inducing points.
# NOTE(review): presumably these are placeholders that the checkpoint below
# overwrites (so only their count/shape matters) — confirm the checkpoint stores Xu.
Xu = (Xsts.clone())[np.random.choice(Xsts.shape[0], 30)]
vsgp = gp.models.VariationalSparseGP(Xsts, ysts, Xu=Xu, kernel = kernel,
                                        likelihood = likelihood, whiten=True)
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
vsgp.load_state_dict(torch.load("models/stsb-sparsegp2/best_model.pt",
                      map_location=torch.device('cpu')
                      ))

# --- Feed-forward network --------------------------------------------------
ffnet = BasicFFNet(input_dim = Xsts.shape[1], hidden_layers=64, dropout=0)
ffnet.load_state_dict(torch.load("models/stsb-ffnet/best_model.pt",
                      map_location=torch.device('cpu')))
# The network is only used for prediction in this notebook, so switch it to
# evaluation mode: any train/eval-dependent layers (dropout, batch norm) then
# behave deterministically.
ffnet.eval()

def cosine_sim(encoder,s1,s2):
    """Cosine similarity between the embeddings of paired sentences.

    Both inputs are embedded with ``encoder.encode``; for each index ``i`` in
    ``range(len(s1))`` the similarity is ``1 - cosine distance`` between the
    ``i``-th embedding of ``s1`` and the ``i``-th embedding of ``s2``.

    Returns a plain Python list with one similarity per pair.
    """
    emb_first = encoder.encode(s1)
    emb_second = encoder.encode(s2)

    similarities = []
    for idx in range(len(s1)):
        pair_distance = distance.cosine(emb_first[idx], emb_second[idx])
        similarities.append(1 - pair_distance)
    return similarities

In [None]:
# Score the STS test pairs with each of the three approaches.

# 1) Sparse GP: keep element [0] of the model output
#    (presumably the predictive mean — confirm against the pyro GP API).
sts_gp_output = vsgp(Xsts)
ypred1 = sts_gp_output[0].detach().numpy()

# 2) Feed-forward net: flatten the output to a 1-D array.
sts_ffnet_output = ffnet(Xsts)
ypred2 = sts_ffnet_output.detach().numpy().flatten()

# 3) Baseline: raw cosine similarity between the two sentence embeddings.
ypred3 = cosine_sim(
    encoder,
    sts_test['sentence1'].values,
    sts_test['sentence2'].values,
)

In [None]:
def evals(true, predicted):
    """Print and return the Pearson correlation and RMSE of predictions.

    Parameters
    ----------
    true, predicted : array-like
        Target and predicted values. Both are flattened to 1-D first, so a
        column vector of shape ``(n, 1)`` is treated like ``(n,)`` instead of
        broadcasting into an ``(n, n)`` difference matrix.

    Returns
    -------
    tuple
        ``(pearson, rmse)``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    true = np.asarray(true).ravel()
    predicted = np.asarray(predicted).ravel()
    if true.shape != predicted.shape:
        raise ValueError(
            "true and predicted must have the same number of elements, got "
            f"{true.size} and {predicted.size}"
        )

    rmse = np.sqrt(np.mean((true - predicted)**2))
    pearson = pearsonr(true, predicted)[0]
    print("Pearson:", pearson)
    print("RMSE:", rmse)
    return pearson, rmse

# Compare each set of STS predictions against the same scaled targets.
sts_targets = ysts.numpy()
evals(sts_targets, ypred1)  # sparse GP
evals(sts_targets, ypred2)  # feed-forward net
evals(sts_targets, ypred3)  # cosine-similarity baseline

In [None]:
# 10 shuffled folds over the interview rows; the fixed seed keeps them reproducible.
splits = KFold(n_splits=10, shuffle = True, random_state = 123)
# KFold.split returns a one-shot generator. Materialise it so the evaluation
# cell that iterates over `folds` can be re-run without silently looping over
# nothing (which would leave an empty result list).
folds = list(splits.split(interview_data))

In [None]:
# Evaluate the already-trained feed-forward net on the validation part of each
# fold. No fitting happens here, so the training indices go unused.
results = []
for train, val in folds:
    fold_targets = yinterview[val].numpy()
    y_pred = ffnet(Xinterview[val])
    fold_predictions = y_pred.detach().numpy().flatten()
    pearson, rmse = evals(fold_targets, fold_predictions)
    results.append(dict(pearson=pearson, rmse=rmse))

# Collapse the per-fold metrics into summary statistics.
fold_metrics = pd.DataFrame(results)
results = fold_metrics.describe()

In [None]:
# Summary statistics (DataFrame.describe) of the per-fold Pearson and RMSE values.
results

In [None]:
# NOTE(review): `y_pred` is whatever the last iteration of the fold loop
# assigned, i.e. the ffnet output for the final validation fold only.
y_pred

In [None]:
# Score the interview pairs with the same three approaches. This reuses (and
# therefore overwrites) the ypred1/ypred2/ypred3 names.

# 1) Sparse GP: keep element [0] of the model output
#    (presumably the predictive mean — confirm against the pyro GP API).
interview_gp_output = vsgp(Xinterview)
ypred1 = interview_gp_output[0].detach().numpy()

# 2) Feed-forward net: flatten the output to a 1-D array.
interview_ffnet_output = ffnet(Xinterview)
ypred2 = interview_ffnet_output.detach().numpy().flatten()

# 3) Baseline: raw cosine similarity between the two text embeddings.
ypred3 = cosine_sim(
    encoder,
    interview_data['user_text'].values,
    interview_data['designer_text'].values,
)

In [None]:
def evals(true, predicted):
    """Print and return the Pearson correlation and RMSE of predictions.

    NOTE(review): this notebook also defines ``evals`` in an earlier cell;
    the two definitions should be consolidated into one.

    Parameters
    ----------
    true, predicted : array-like
        Target and predicted values. Both are flattened to 1-D first, so a
        column vector of shape ``(n, 1)`` is treated like ``(n,)`` instead of
        broadcasting into an ``(n, n)`` difference matrix.

    Returns
    -------
    tuple
        ``(pearson, rmse)``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    true = np.asarray(true).ravel()
    predicted = np.asarray(predicted).ravel()
    if true.shape != predicted.shape:
        raise ValueError(
            "true and predicted must have the same number of elements, got "
            f"{true.size} and {predicted.size}"
        )

    rmse = np.sqrt(np.mean((true - predicted)**2))
    pearson = pearsonr(true, predicted)[0]
    print("Pearson:", pearson)
    print("RMSE:", rmse)
    return pearson, rmse

# Compare each set of interview predictions against the same scaled targets.
interview_targets = yinterview.numpy()
evals(interview_targets, ypred1)  # sparse GP
evals(interview_targets, ypred2)  # feed-forward net
evals(interview_targets, ypred3)  # cosine-similarity baseline