In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import Module
from torch.utils.data import DataLoader, random_split
import os
import logging
import settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import modules.coralnet as coralnet
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from coral_pytorch.losses import coral_loss
from coral_pytorch.dataset import levels_from_labelbatch
from coral_pytorch.dataset import proba_to_label
from datasets import load_dataset

# Switch the process working directory to the one configured in `settings`,
# so the relative 'data/...' and 'models/...' paths used later resolve against it.
os.chdir(settings.PROJECT_WORKING_DIRECTORY)
# Model identifier handed to SentenceTransformer when the encoder is built below.
BASE_MODEL = 'sentence-transformers/all-distilroberta-v1'

def make_sentence_pairs(u, v, how = 'concat'):
    """Combine two batches of sentence embeddings into one feature array.

    Parameters
    ----------
    u, v : numpy.ndarray
        Embedding batches of identical shape. The two ``concat*`` modes join
        along axis 1, so the inputs must be at least 2-D for those modes.
    how : str, default 'concat'
        Combination rule:

        * ``'concat'``   -- ``[u, v]`` joined along axis 1
        * ``'l1'``       -- element-wise ``|u - v|``
        * ``'prod'``     -- element-wise ``u * v``
        * ``'concatl1'`` -- ``[u, v, |u - v|]`` joined along axis 1

    Returns
    -------
    numpy.ndarray
        The combined features.

    Raises
    ------
    ValueError
        If ``how`` is not one of the modes listed above. (Previously an
        unknown mode fell through and silently returned ``None``.)
    """
    if how == 'concat':
        return np.concatenate([u, v], axis=1)
    if how == 'l1':
        return np.abs(u - v)
    if how == 'prod':
        return u * v
    if how == 'concatl1':
        return np.concatenate([u, v, np.abs(u - v)], axis=1)
    raise ValueError(
        f"unknown how={how!r}; expected 'concat', 'l1', 'prod' or 'concatl1'"
    )

# Sentence encoder built from BASE_MODEL; later cells call its .encode() on the interview texts.
transformer = SentenceTransformer(BASE_MODEL)

In [None]:
from scipy import stats

# Load the interview sheet; sheet_name=1 selects the second sheet (0-based position).
interview_data = pd.read_excel('data/full_data.xlsx', sheet_name=1)

# A missing avg_EA cannot be mapped onto a rank. Fail here with a clear message
# instead of a cryptic size-0 `.item()` error further down.
if interview_data['avg_EA'].isna().any():
    raise ValueError("avg_EA contains missing values; cannot derive discrete_label")

# `lab` holds the sorted distinct avg_EA values; `label_codes` gives, for every
# row, the position of its avg_EA inside `lab` (one vectorised pass instead of a
# per-row scan).
lab, label_codes = np.unique(interview_data['avg_EA'].to_numpy(), return_inverse=True)
interview_data['discrete_label'] = label_codes.astype(np.int64)

# Missing mode_EA entries are filled with 1 before the integer cast.
# NOTE(review): 1 is a magic default here -- confirm it is the intended fallback class.
interview_data['mode_EA'] = interview_data['mode_EA'].fillna(1.).astype(np.int64)

In [None]:
# Per-designer overview: mean avg_EA and the number of non-null `time` entries.
interview_data.groupby('designer').agg(
    avg_EA=('avg_EA', 'mean'),
    time=('time', 'count'),
)

In [None]:
# SECURITY: torch.load unpickles arbitrary Python objects -- only load checkpoints
# that come from a trusted source.
# map_location places every stored tensor on the CPU regardless of where it was saved.
model = torch.load('models/nli-coral.pt', map_location=torch.device('cpu'))
# The notebook only runs inference, so put the network in eval mode: train-time
# layers (dropout, batch norm) would otherwise make predictions non-deterministic.
# Re-assigning (eval() returns the module) keeps the cell from echoing the model repr.
model = model.eval()

In [None]:
# Embed the raw (uncleaned) texts. The columns are passed as plain lists rather
# than as Series, so the encoder's internal positional lookups never go through
# the DataFrame's index labels (which only works when the index is 0..n-1).
user_embeddings = transformer.encode(interview_data['user_text'].tolist())
designer_embeddings = transformer.encode(interview_data['designer_text'].tolist())

In [None]:
def _strip_phrases(text, phrases):
    """Lower-case a Series of strings and delete each literal phrase, in the given order."""
    cleaned = text.str.lower()
    for phrase in phrases:
        # regex=False: every phrase is a literal substring. Being explicit pins the
        # behaviour across pandas versions, whose default for `regex` has changed.
        cleaned = cleaned.str.replace(phrase, '', regex=False)
    return cleaned


# Order matters for the designer text: 's/he was' and 'she was' must be removed
# before 'he was', otherwise stray 's/' / 's' fragments would be left behind.
# NOTE(review): matching is plain substring based (no word boundaries), so e.g.
# 'the wash' also loses 'he was' -- kept as-is to leave the model inputs unchanged.
u = _strip_phrases(interview_data['user_text'], [':', 'i was'])
d = _strip_phrases(
    interview_data['designer_text'],
    [':', 'i was', 's/he was', 'she was', 'he was'],
)

# Embeddings of the cleaned texts; these feed the sentence-pair features below.
s1 = transformer.encode(u.values)
s2 = transformer.encode(d.values)

In [None]:
# Model input: the 'concatl1' pairing of the two cleaned-text embedding batches
# ([s1, s2, |s1 - s2|] along axis 1), copied into a torch tensor.
sentence_pairs = torch.tensor(make_sentence_pairs(s1, s2, how='concatl1'))

In [None]:
# Pure inference: no_grad() skips building the autograd graph for the forward pass.
with torch.no_grad():
    logit, probas = model(sentence_pairs)

# proba_to_label turns the model's probabilities into integer rank labels; cast
# straight to int64 (the former detour through float was redundant).
predicted_labels = proba_to_label(probas).numpy().astype(np.int64)

# Ground truth for the evaluation below.
# NOTE(review): confirm mode_EA uses the same label encoding (same starting index)
# as the ranks produced by proba_to_label.
true_labels = interview_data['mode_EA']

In [None]:
from sklearn.metrics import classification_report

# Text summary of precision / recall / F1 per class; printed so the line breaks
# in the returned string render as a table.
report = classification_report(y_true=true_labels, y_pred=predicted_labels)
print(report)

In [None]:
# Number of rows per ground-truth label, for context when reading the per-class report.
true_labels.value_counts()