In [5]:
import numpy as np
import random
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from tqdm.notebook import tqdm

seed = 1735

# Seed every random number generator this notebook draws from (NumPy, the
# stdlib `random` module, and PyTorch) with the same value so that a fresh
# "Restart & Run All" reproduces the same samples.
for _seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    _seed_fn(seed)

# The CUDA generators are seeded separately, and only when a GPU is visible.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [None]:

def get_bert_embeddings(texts, model_name='bert-base-uncased', max_length=512):
    """Embed each text as the mean of BERT's last-layer token vectors.

    The tokenizer and model are loaded on every call, and each text is
    tokenized and pushed through the model on its own (batch size 1).

    Parameters
    ----------
    texts : iterable of str
        Documents to embed.
    model_name : str, optional
        Checkpoint name passed to both ``BertTokenizer.from_pretrained`` and
        ``BertModel.from_pretrained``.
    max_length : int, optional
        Upper bound on the number of tokens per text. Longer texts are
        truncated instead of crashing the forward pass; the default of 512
        is the position-embedding limit of ``bert-base-uncased``.

    Returns
    -------
    numpy.ndarray
        One row per input text and one column per hidden unit of the model;
        shape ``(0, hidden_size)`` when ``texts`` is empty.
    """
    # Load pre-trained model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.eval()  # inference only: make sure dropout is switched off

    def get_sentence_embedding(text):
        # Without truncation, a text that tokenizes to more than the model's
        # maximum number of positions makes the forward pass raise.
        inputs = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=max_length,
        )
        with torch.no_grad():
            outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        # Average over the token axis, then drop the singleton batch axis.
        return torch.mean(last_hidden_states, dim=1).numpy().flatten()

    # Generate embeddings for texts
    embeddings = [get_sentence_embedding(text) for text in tqdm(texts)]
    if not embeddings:
        # np.array([]) would be 1-D; keep the result 2-D for callers.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)


def get_embeddings(filepath, num):
    """Sample ``num`` rows of a CSV and return paired embeddings for them.

    Parameters
    ----------
    filepath : str
        CSV file containing an ``id`` column, a ``review`` text column and
        1536 further columns (presumably precomputed OpenAI embeddings --
        verify against the data set description).
    num : int
        Number of rows to draw without replacement, using NumPy's global
        random state.

    Returns
    -------
    tuple of numpy.ndarray
        ``(bert, openai)``: ``bert`` has shape ``(num, 768)`` and is computed
        from the sampled reviews by ``get_bert_embeddings``; ``openai`` has
        shape ``(num, 1536)`` and holds the remaining CSV columns for the
        same rows, in the same order.

    Raises
    ------
    AssertionError
        If either array does not have the expected shape.
    """
    test_df = pd.read_csv(filepath)

    # Draw the row positions once and slice once, so the review texts and
    # the precomputed columns are guaranteed to refer to the same rows.
    idxs = np.random.choice(len(test_df), size=num, replace=False)
    sample = test_df.iloc[idxs]

    bert = get_bert_embeddings(sample['review'].tolist())
    assert bert.shape == (num, 768), f"unexpected BERT embedding shape {bert.shape}"

    openai = sample.drop(columns=['id', 'review']).to_numpy()
    assert openai.shape == (num, 1536), f"unexpected precomputed embedding shape {openai.shape}"

    return bert, openai


# Build the paired (BERT, precomputed) embedding matrices from 1600 randomly
# chosen rows of the split 1 test set, then display their shapes.
bert, openai = get_embeddings(
    filepath='./F24_Proj3_data/split_1/test.csv',
    num=1600,
)
(bert.shape, openai.shape)


In [15]:
# Fit an affine map from the BERT embeddings to the precomputed embeddings by
# ordinary least squares. The leading column of ones makes row 0 of `x` the
# intercept; the remaining rows are the per-feature coefficients.
X = np.c_[np.ones(bert.shape[0]), bert]

# rcond=None selects the machine-precision-based cutoff explicitly, which
# silences the FutureWarning emitted by NumPy < 2.0 and gives the same result
# across NumPy versions. Indexing [0] keeps only the solution and avoids
# assigning to `_`, which IPython uses for the previous cell output.
x = np.linalg.lstsq(X, openai, rcond=None)[0]

In [10]:
# Disabled: train the split 1 model with mymain.main() and write its intercept, coefficients and feature names to ./trained_lr_model.npz

# import mymain
# import os

# cwd = os.getcwd()
# os.chdir('./F24_Proj3_data/split_1')
# model = mymain.main()
# os.chdir(cwd)

# model_file = './trained_lr_model.npz'
# np.savez_compressed(model_file, intercept=model.intercept_, coef=model.coef_, features=model.feature_names_in_)

# m = np.load(model_file, allow_pickle=True)
# pass
