In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn.functional as F
from taker import Model
id = 1 


In [2]:
# Load texts
id = 1
loaded_texts  = torch.load(f'generated_texts-{id}.pth')
prompts = [i[0] for i in loaded_texts]
completions = [i[1] for i in loaded_texts]

In [None]:
# Generate activations if there are none

m = Model("mistralai/Mistral-7B-v0.1", dtype="int4") 
m.to("cuda")
m.do_activations["mlp_pre_out"] = False
m.do_activations["attn_pre_out"] = False




activations = []




with torch.no_grad():
    
    for text in [i for i in prompts]:
        activation = m.get_text_activations(text=text)
        input = activation[0]
        atten_out = activation[1]
        ff_out = activation[2]
        out = activation[3]
        
        
        
        
        activations.append((atten_out[:, -5: -1, :], ff_out[:, -5: -1, :], out))


torch.save(activations, f"text_activations-{id}.pth")

In [14]:
# Load activations 

activations = torch.load(f"text_activations-{id}.pth")

In [8]:
# Inspect text


a = 0
for i in loaded_texts:
    print("TEXTS: ", a )
    print(i[0] + "//" +  i[1])
    
    a = a + 1
    

TEXTS:  0
The Casa de la Cultura is located in the center of Fuengirola and is a building where of all kinds cultural activities are taken place.
On the ground //ﬂoor of the building there is the tourist oﬃce, where information is given about the city and the Costa del Sol. On the ﬁrst ﬂoor there is a library, the city museum and the Casa de la Juventud. On the second ﬂoor there are a theater, a room for exhibitions, a chamber for meetings and another for conferences. It is a big building with a lot of activities that you can go to. In this building there are also 2 restaurants.

Opening hours:
Tuesday to Friday: 10:30 – 15:00
Saturday: 10:30 – 14:00
Closed on Sundays and Mondays.

Address:
Plaza de la Constitución, 1 (next to the police station)

Telephone number: 952 473 584

Website: www.culturayturismo.fuengirola.es
TEXTS:  1
Au revoir, Nexus 5! Google is no longer officially selling its 2013 champion
The end of an era has come - Google has officially stopped selling the Ne//exus 5

In [9]:
# has date

import re

def contains_date_time(texts):
    # Regex pattern to match dates, times, and durations
    pattern = r"\b(?:\d{4}|\d{1,2}:\d{2}|\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)|\b\d{1,2}(?:st|nd|rd|th)?\s(?:January|February|March|April|May|June|July|August|September|October|November|December)|\d+\syears)\b"

    # List to store the results
    result = []

    for text in texts:
        # Search for the pattern in the text
        if re.search(pattern, text):
            result.append(True)
        else:
            result.append(False)


    return result


has_date = contains_date_time([i[1] for i in loaded_texts])

In [65]:
# Models
class ImprovedTwoLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ImprovedTwoLayerNN, self).__init__()
        # Increase depth and capacity
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.LeakyReLU()
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(0.5)  # Adjust dropout rate as needed
        
        # Additional layer
        self.layer2 = nn.Linear(hidden_size, hidden_size * 2)
        self.relu2 = nn.LeakyReLU()
        self.batch_norm2 = nn.BatchNorm1d(hidden_size * 2)
        self.dropout2 = nn.Dropout(0.5)  # Adjust dropout rate as needed
        
        # Output layer
        self.layer3 = nn.Linear(hidden_size * 2, output_size)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        
        x = self.layer3(x)
        return x
    
class TwoLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size,  output_size):
        super(TwoLayerNN, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x


# Takes in batch, seq_len, features
class CustomTransformerModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(CustomTransformerModel, self).__init__()
        self.embedding = nn.Linear(input_size, 4096) # Example embedding size
        self.transformer_block = nn.TransformerEncoderLayer(d_model=4096, nhead=8)
        self.output_linear = nn.Linear(4096, output_size)
        self.sigmoid = nn.Sigmoid()
        self.output_size = output_size

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2) # Transformer expects [seq_len, batch, features]
        x = self.transformer_block(x)
        x = x.permute(1, 0, 2) # Revert permutation
        x = self.output_linear(x)
        x = x[:, -1, -1]
        x = self.sigmoid(x)
        return x

In [4]:
# train


def train(data_in, data_out, model, criterion, num_epochs, save=False, learning_rate=0.001):
    
    
    # Create a complete dataset
    full_dataset = TensorDataset(data_in.to("cuda"), data_out.to("cuda"))

    # Define the sizes for your training and validation sets
    total_size = len(full_dataset)
    train_size = int(0.8 * total_size)
    val_size = total_size - train_size

    # Split the dataset
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

    # Create DataLoaders for both training and validation sets
    train_dataloader = DataLoader(train_dataset, batch_size=400, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=500)  # No need to shuffle the validation data


    model.to("cuda")
     
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)



    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_dataloader:
            inputs, targets = inputs.to("cuda"), targets.to("cuda")
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
         
            running_loss += loss.item() * inputs.size(0)
    
        epoch_loss = running_loss / len(train_dataloader.dataset)
        # Validation phase
        model.eval()  # Set the model to evaluation mode
        running_loss = 0.0
        with torch.no_grad():  # No gradients need to be calculated
            for inputs, targets in val_dataloader:
                inputs, targets = inputs.to("cuda").float(), targets.to("cuda").float()
                outputs = model(inputs)
                
                loss = criterion(outputs, targets)
                running_loss += loss.item() * inputs.size(0)
        val_loss = running_loss / len(val_dataloader.dataset)

        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.8f}, Validation Loss: {val_loss:.8f}')
        
    if(save):
        # Ensure the model is in evaluation mode
        model.eval()

        predictions = []
        with torch.no_grad():  # No gradients needed for inference
            for inputs, _ in full_dataset:  # Assuming your dataset returns inputs and targets
                inputs = inputs.to('cuda').float().unsqueeze(0)
                
                # Get the model output
                outputs = model(inputs)
                

                predictions.append(outputs.cpu())

        # Concatenate all batches of predictions
        all_predictions = torch.stack(predictions, dim=0)

        # Save the tensor to a file
        torch.save(all_predictions, 'model_predictions.pth')



In [66]:
# activations 
# list 
#   atten out
#   ff out
#   output 

data_out = (torch.tensor(has_date)* 1).float()

# use last layer on four last tokens
data_in = torch.stack([i[1][-1, :, :] for i in activations]).float()    


criterion = nn.CrossEntropyLoss() 


model = CustomTransformerModel(input_size=4096, output_size=2).to("cuda")

In [63]:
data_in

tensor([[[  0.7773,   6.2734,   0.5830,  ...,   1.3037,  -1.9346,   1.3467],
         [ -2.0098,  -0.2891,  -3.8477,  ...,  -2.1738,   0.0838,   4.3867],
         [  1.1191,   1.2549,   3.3477,  ...,  -0.8857,  -0.8481,   0.9170],
         [  2.6602,  -1.6709,   3.2285,  ...,   0.9634,   0.8730,   2.1328]],

        [[  6.4375,   1.4395,   2.4395,  ...,  -3.9746,   1.0781,  -8.5625],
         [ -0.1475,  -1.6270,  -3.5098,  ...,  -9.5000,   2.4473,  -2.8086],
         [ -3.4727,  -1.8193,   0.9170,  ...,  -4.6250,  -2.0703,  -2.3867],
         [  0.0161,  -1.9463,   0.3103,  ...,  -3.3926,  -0.2717,  -5.7773]],

        [[  0.4756,   4.1289,   3.8398,  ...,   2.7793,   6.8516,  -1.1924],
         [ -5.7070,   4.3398,   1.9834,  ...,  -1.5527,   7.6680,   5.0039],
         [ -4.6562,   6.6992,   0.1177,  ...,   0.0263,  -3.3672,  -3.0430],
         [ -0.6230,   8.7500,   0.9990,  ...,  -4.9336,   2.6875,  -0.1174]],

        ...,

        [[ -2.5039,  -3.5957,   2.9297,  ...,  -3.5215, 

In [22]:

train(data_in, data_out, model, criterion, num_epochs=10, save=True, learning_rate=0.0001)

Epoch [1/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [2/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [3/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [4/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [5/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [6/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [7/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [8/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [9/10], Training Loss: 0.00000000, Validation Loss: 0.00000000
Epoch [10/10], Training Loss: 0.00000000, Validation Loss: 0.00000000


In [89]:
model(data_in[0:5]).unsqueeze(1).to("cuda").device 





device(type='cuda', index=0)

In [91]:
data_out[0:5].to("cuda").type(torch.LongTensor).to("cuda")

tensor([1, 1, 0, 0, 0], device='cuda:0')

In [None]:
criterion(model(data_in[0:5]).unsqueeze(1).to("cuda") , data_out[0:5].to("cuda").type(torch.LongTensor)  )


In [82]:
model(data_in[0:5]).unsqueeze(1).to("cuda")

tensor([[0.5513],
        [0.3506],
        [0.4965],
        [0.2890],
        [0.5728]], device='cuda:0', grad_fn=<UnsqueezeBackward0>)

In [54]:
# Takes in batch, seq_len, features
class CustomTransformerModel2(nn.Module):
    def __init__(self, input_size, output_size):
        super(CustomTransformerModel2, self).__init__()
        self.embedding = nn.Linear(input_size, 4096) # Example embedding size
        self.transformer_block = nn.TransformerEncoderLayer(d_model=4096, nhead=8)
        self.output_linear = nn.Linear(4096, output_size)
        self.sigmoid = nn.Sigmoid()
        self.output_size = output_size

    def forward(self, x):
        x = self.embedding(x)
        print(x.shape)
        x = x.permute(1, 0, 2) # Transformer expects [seq_len, batch, features]
        print(x.shape)
        x = self.transformer_block(x)
        print(x.shape)
        x = x.permute(1, 0, 2) # Revert permutation
        print(x.shape)

        x = self.output_linear(x)
        print(x.shape)
        x = self.sigmoid(x)
        return x[:, -1, -1]

In [57]:
mx = CustomTransformerModel2(10, 2)

a = mx.forward(torch.rand(3, 2, 10, 10))



torch.Size([3, 2, 10, 4096])


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 4 is not equal to len(dims) = 3

In [56]:
a

tensor([0.6200, 0.6040], grad_fn=<SelectBackward0>)

In [74]:
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)


In [80]:
target.dtype

torch.int64