In [66]:
from dotenv import load_dotenv

load_dotenv()

True

In [65]:
import os

import duckdb
import matplotlib.pyplot as plt
import numpy as np
import openai
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm

In [32]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

device

device(type='mps')

In [82]:
duckdb.sql("SELECT * FROM 'dataset/embedded_reviews.parquet' LIMIT 10").to_df()

Unnamed: 0,review,sentiment,embedding,labels,split
0,One of the other reviewers has mentioned that ...,positive,"[-0.010531049221754074, 0.07523203641176224, -...",1,train
1,A wonderful little production. <br /><br />The...,positive,"[0.016954615712165833, 0.04743356257677078, -0...",1,train
2,I thought this was a wonderful way to spend ti...,positive,"[-0.02557339146733284, 0.022649677470326424, -...",1,train
3,Basically there's a family where a little boy ...,negative,"[0.004888955038040876, 0.13877353072166443, -0...",0,train
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[-0.03958750516176224, -0.007721670437604189, ...",1,train
5,"Probably my all-time favorite movie, a story o...",positive,"[0.014508235268294811, 0.04683531075716019, -0...",1,train
6,I sure would like to see a resurrection of a u...,positive,"[0.04898536205291748, 0.02820231206715107, -0....",1,train
7,"This show was an amazing, fresh & innovative i...",negative,"[-0.00034545271773822606, 0.058210309594869614...",0,train
8,Encouraged by the positive comments about this...,negative,"[-0.02741415984928608, 0.09173857420682907, -0...",0,train
9,If you like original gut wrenching laughter yo...,positive,"[0.017252212390303612, 0.0524253286421299, -0....",1,train


In [27]:
train_df = duckdb.sql(
    "SELECT embedding, labels FROM 'dataset/embedded_reviews.parquet' WHERE split = 'train'"
).to_df()
val_df = duckdb.sql(
    "SELECT embedding, labels FROM 'dataset/embedded_reviews.parquet' WHERE split = 'val'"
).to_df()

len(train_df), len(val_df)

(39891, 10109)

In [56]:
class IMDBDataset(Dataset):
    def __init__(self, items):
        self.items = items

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        row = self.items.iloc[idx]
        return torch.tensor(
            row["embedding"],
            dtype=torch.float32,
        ), torch.tensor(
            row["labels"],
            dtype=torch.long,
        )


train_dataset = IMDBDataset(train_df)
val_dataset = IMDBDataset(val_df)

train_dataset[0], val_dataset[0]

((tensor([-0.0105,  0.0752, -0.0917,  ...,  0.0197, -0.0065, -0.0305]),
  tensor(1)),
 (tensor([-0.0001,  0.0551, -0.0257,  ...,  0.0023,  0.0300, -0.0256]),
  tensor(0)))

In [57]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SentimentClassifier, self).__init__()
        self.layer_1 = nn.Linear(input_dim, hidden_dim)
        self.layer_2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer_3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.relu(self.layer_1(x))
        x = self.dropout(x)
        x = self.relu(self.layer_2(x))
        x = self.dropout(x)
        x = self.layer_3(x)
        return x

In [58]:
HIDDEN_DIM = 256
BATCH_SIZE = 32
LEARNING_RATE = 0.001
NUM_EPOCHS = 10

input_dim = 1536
output_dim = 2
num_epochs = 10

model = SentimentClassifier(input_dim, HIDDEN_DIM, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [59]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [64]:
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for embeddings, labels in pbar:
        embeddings, labels = embeddings.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        pbar.set_postfix_str(f"Loss: {loss.item()}")

        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    # Validation
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        pbar = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
        for embeddings, labels in pbar:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)

            loss = criterion(outputs, labels)

            val_loss += loss.item()
            pbar.set_postfix_str(f"Loss: {loss.item()}")
            _, predicted = torch.max(outputs.data, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch + 1}/{num_epochs}]")
    print(
        f"Train Loss: {train_loss / len(train_loader):.4f}, Train Acc: {100 * train_correct / train_total:.2f}%"
    )
    print(
        f"Val Loss: {val_loss / len(val_loader):.4f}, Val Acc: {100 * val_correct / val_total:.2f}%"
    )
    print("--------------------")

Epoch 1/10: 100%|██████████| 1247/1247 [00:10<00:00, 117.62it/s, Loss: 0.1267903745174408]  
Epoch 1/10: 100%|██████████| 316/316 [00:01<00:00, 215.46it/s, Loss: 0.10470446199178696] 


Epoch [1/10]
Train Loss: 0.1386, Train Acc: 94.84%
Val Loss: 0.1510, Val Acc: 94.48%
--------------------


Epoch 2/10: 100%|██████████| 1247/1247 [00:10<00:00, 119.20it/s, Loss: 0.22692397236824036] 
Epoch 2/10: 100%|██████████| 316/316 [00:01<00:00, 243.17it/s, Loss: 0.10231629014015198] 


Epoch [2/10]
Train Loss: 0.1293, Train Acc: 95.29%
Val Loss: 0.1432, Val Acc: 94.73%
--------------------


Epoch 3/10: 100%|██████████| 1247/1247 [00:10<00:00, 118.86it/s, Loss: 0.19485260546207428] 
Epoch 3/10: 100%|██████████| 316/316 [00:01<00:00, 245.25it/s, Loss: 0.10211461037397385] 


Epoch [3/10]
Train Loss: 0.1201, Train Acc: 95.61%
Val Loss: 0.1406, Val Acc: 94.93%
--------------------


Epoch 4/10: 100%|██████████| 1247/1247 [00:10<00:00, 118.17it/s, Loss: 0.04125737398862839] 
Epoch 4/10: 100%|██████████| 316/316 [00:01<00:00, 234.70it/s, Loss: 0.14750191569328308] 


Epoch [4/10]
Train Loss: 0.1091, Train Acc: 96.02%
Val Loss: 0.1516, Val Acc: 94.93%
--------------------


Epoch 5/10: 100%|██████████| 1247/1247 [00:10<00:00, 120.54it/s, Loss: 0.025885265320539474] 
Epoch 5/10: 100%|██████████| 316/316 [00:01<00:00, 237.15it/s, Loss: 0.10867998749017715]  


Epoch [5/10]
Train Loss: 0.0970, Train Acc: 96.47%
Val Loss: 0.1507, Val Acc: 95.05%
--------------------


Epoch 6/10: 100%|██████████| 1247/1247 [00:10<00:00, 116.27it/s, Loss: 0.13978135585784912] 
Epoch 6/10: 100%|██████████| 316/316 [00:01<00:00, 242.69it/s, Loss: 0.18212035298347473] 


Epoch [6/10]
Train Loss: 0.0884, Train Acc: 96.87%
Val Loss: 0.1596, Val Acc: 94.59%
--------------------


Epoch 7/10: 100%|██████████| 1247/1247 [00:10<00:00, 117.88it/s, Loss: 0.07013380527496338]  
Epoch 7/10: 100%|██████████| 316/316 [00:01<00:00, 243.17it/s, Loss: 0.14806626737117767]  


Epoch [7/10]
Train Loss: 0.0754, Train Acc: 97.28%
Val Loss: 0.1585, Val Acc: 95.14%
--------------------


Epoch 8/10: 100%|██████████| 1247/1247 [00:10<00:00, 119.50it/s, Loss: 0.32823675870895386]  
Epoch 8/10: 100%|██████████| 316/316 [00:01<00:00, 234.99it/s, Loss: 0.09305138140916824] 


Epoch [8/10]
Train Loss: 0.0673, Train Acc: 97.59%
Val Loss: 0.1621, Val Acc: 95.02%
--------------------


Epoch 9/10: 100%|██████████| 1247/1247 [00:10<00:00, 119.47it/s, Loss: 0.006871419493108988] 
Epoch 9/10: 100%|██████████| 316/316 [00:01<00:00, 234.08it/s, Loss: 0.09961969405412674]  


Epoch [9/10]
Train Loss: 0.0581, Train Acc: 97.85%
Val Loss: 0.2018, Val Acc: 94.38%
--------------------


Epoch 10/10: 100%|██████████| 1247/1247 [00:10<00:00, 116.12it/s, Loss: 0.0027116890996694565]
Epoch 10/10: 100%|██████████| 316/316 [00:01<00:00, 234.44it/s, Loss: 0.22444158792495728]  

Epoch [10/10]
Train Loss: 0.0536, Train Acc: 98.06%
Val Loss: 0.2010, Val Acc: 94.70%
--------------------





In [None]:
openai = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def get_embedding(text):
    return openai.embeddings.create(input=text, model="text-embedding-3-small").data[0].embedding


len(get_embedding("Hello, world!"))

1536

In [86]:
# positive = 1
# negative = 0

ex_1 = torch.tensor(
    get_embedding("This was a really fun movie. I had a great time."),
    dtype=torch.float32,
    device=device,
)

In [93]:
torch.set_printoptions(sci_mode=False)

logits = model(torch.stack([ex_1]))
pred = torch.softmax(logits, dim=1)
pred

tensor([[    0.0005,     0.9995]], device='mps:0', grad_fn=<SoftmaxBackward0>)

In [83]:
test_sentences = [
    "The radiant pulse of hope dances through the quiet corridors of the mind",
    "A vibrant tapestry of dreams weaves together whispers of joy and resilience.",
    "In the gentle hum of the cosmos, optimism blooms like a celestial flower.",
    "The shimmering aura of gratitude paints every moment with soft luminescence.",
    "A symphony of vibrant energy ignites the spirit, echoing the promise of new beginnings.",
    "The serene rhythm of a heartbeat reverberates with the melody of endless possibility.",
    "Embraced by the warm glow of inspiration, the soul finds solace in its own light.",
    "Each fleeting moment bursts forth with the abstract magic of pure, unfiltered joy.",
    "The delicate interplay of sunlight and shadow reveals an ever-evolving portrait of positive transformation.",
    "In the boundless landscape of the heart, every thought radiates with the energy of a thousand smiles.",
]

test_embeddings = torch.tensor(
    [get_embedding(sentence) for sentence in test_sentences],
    dtype=torch.float32,
    device=device,
)

model.eval()
out = model(test_embeddings)
probs = torch.argmax(out, dim=1)
probs

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='mps:0')

In [84]:
def qr_decomposition(mat):
    Q, R = np.linalg.qr(mat.T)
    return Q, R


def projection_matrix(mat):
    P = mat @ mat.T
    return P


test_embeddings.cpu().numpy()

array([[ 0.03569634, -0.01072521, -0.01411251, ..., -0.02124288,
        -0.00842748, -0.01765546],
       [ 0.05399539,  0.01783927,  0.00116939, ..., -0.01873825,
        -0.01785332, -0.03553807],
       [ 0.03057423,  0.01606516,  0.01289535, ..., -0.01048198,
         0.04109222,  0.00085369],
       ...,
       [ 0.0013048 ,  0.0088174 , -0.03082495, ..., -0.02925625,
        -0.0014151 ,  0.01362154],
       [ 0.02377794, -0.01580558,  0.01782302, ..., -0.01243855,
        -0.01554123, -0.00312007],
       [ 0.01591483, -0.01888   ,  0.00251744, ..., -0.02223878,
         0.00678972,  0.02113668]], shape=(10, 1536), dtype=float32)

In [85]:
Q, R = qr_decomposition(test_embeddings.cpu().numpy())
q_proj = projection_matrix(Q)

q_proj.shape

(1536, 1536)

In [94]:
ex_1 = ex_1.cpu().numpy()

ex_1_proj = q_proj @ ex_1
ex_1_decomp = ex_1 - ex_1_proj
ex_1_decomp.shape

(1536,)

In [97]:
pred = model(torch.tensor([ex_1_decomp], dtype=torch.float32, device=device))
pred.softmax(dim=1)

  pred = model(torch.tensor([ex_1_decomp], dtype=torch.float32, device=device))


tensor([[    0.0001,     0.9999]], device='mps:0', grad_fn=<SoftmaxBackward0>)

In [108]:
positive_val = duckdb.sql(
    "SELECT embedding FROM 'dataset/embedded_reviews.parquet' WHERE labels = 1"
).to_df()
positive_val.head(), len(positive_val)

(                                           embedding
 0  [-0.010531049221754074, 0.07523203641176224, -...
 1  [0.016954615712165833, 0.04743356257677078, -0...
 2  [-0.02557339146733284, 0.022649677470326424, -...
 3  [-0.03958750516176224, -0.007721670437604189, ...
 4  [0.014508235268294811, 0.04683531075716019, -0...,
 25000)

In [111]:
# get the qr decomposition of the positive reviews
positive_embeddings = torch.tensor(
    positive_val["embedding"].tolist(), dtype=torch.float32, device=device
)
Q, R = qr_decomposition(positive_embeddings.cpu().numpy())
q_proj = projection_matrix(Q)

ex_1 = torch.tensor(
    get_embedding("This was a fun movie, i had a great time"),
    dtype=torch.float32,
    device=device,
)
ex_1 = ex_1.cpu().numpy()

ex_1_proj = q_proj @ ex_1
ex_1_decomp = ex_1 - ex_1_proj

pred = model(torch.tensor([ex_1, ex_1_decomp], dtype=torch.float32, device=device))
pred.softmax(dim=1)

tensor([[    0.0004,     0.9996],
        [    0.1166,     0.8834]], device='mps:0', grad_fn=<SoftmaxBackward0>)