### Not checked yet

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Local checkpoint directory; the tokenizer and the model weights are both
# read from it.
model_name = "./finbert_finetuned"
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)



In [None]:
import pandas as pd

# Read data.csv and keep only the two columns used downstream; the file must
# therefore provide both a 'text' and a 'classification' column.
df = pd.read_csv('data.csv')[['text', 'classification']]

# Transform classification labels
def transform_classification_label(label):
    """Collapse a raw label to a binary class id.

    Returns 1 when ``label`` compares equal to 1.0 and 0 for every other
    value (including NaN and non-numeric labels).
    """
    if label == 1.0:
        return 1
    return 0

# Binarise the label column element by element: 1.0 -> 1, anything else -> 0.
df['classification'] = df['classification'].map(transform_classification_label)


### Experiment with different splitting techniques to see which one produces meaningful sentences

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Character-count splitter: with no length_function supplied, chunk_size and
# chunk_overlap are measured in characters of the string, not in tokens.
# Separators are tried in order, from paragraph breaks down to single spaces.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " "],
    chunk_overlap=50,
    chunk_size=512,
)

def split_text(text):
    """Split ``text`` with the module-level ``text_splitter`` and return the chunks."""
    return text_splitter.split_text(text)

# Store, for every row, the list of chunks produced from its 'text' value.
df['chunks'] = df['text'].map(split_text)


In [None]:
import torch
from tqdm import tqdm

def get_cls_embeddings(texts, tokenizer, model, device):
    """Return the final-layer first-token ([CLS]) vector for each text.

    Args:
        texts: List of strings to embed. An empty list or tuple is allowed
            and yields an array with zero rows.
        tokenizer: Tokenizer called with ``return_tensors='pt'``, padding and
            truncation to 512 tokens; must return a mapping of tensors.
        model: Transformer model that accepts the tokenizer output as keyword
            arguments together with ``output_hidden_states=True``.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        A NumPy array with one row per input text.
    """
    if isinstance(texts, (list, tuple)) and not texts:
        # Skip the forward pass for an empty batch; there is nothing to
        # tokenize. The width comes from the model config so callers can
        # still extend/stack the result.
        return torch.empty((0, model.config.hidden_size), dtype=torch.float32).numpy()

    encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**encoded, output_hidden_states=True)

    # Base encoders expose `last_hidden_state`; models with a task head such
    # as AutoModelForMaskedLM return logits plus (on request) the per-layer
    # hidden states only, so fall back to the last entry of `hidden_states`.
    final_layer = getattr(outputs, "last_hidden_state", None)
    if final_layer is None:
        final_layer = outputs.hidden_states[-1]

    # Position 0 of every sequence is the [CLS] token.
    return final_layer[:, 0, :].cpu().numpy()

# Embed every chunk of every document. Each chunk vector becomes its own
# sample and inherits the label of the document it came from.
cls_embeddings_list = []
labels_list = []

for chunks, label in tqdm(zip(df['chunks'], df['classification']), total=len(df)):
    chunk_vectors = get_cls_embeddings(chunks, tokenizer, model, device)
    cls_embeddings_list += list(chunk_vectors)
    labels_list += [label] * len(chunk_vectors)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

# Turn the accumulated per-chunk vectors and labels into arrays.
X = np.asarray(cls_embeddings_list)
y = np.asarray(labels_list)

# Hold out 20% of the samples for testing, with a fixed seed for repeatability.
# NOTE(review): the split is per chunk, so chunks of one document can end up
# in both partitions — confirm this is intended.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

# Define a custom dataset
class TextDataset(Dataset):
    """Map-style dataset pairing indexable features with integer labels.

    ``data`` and ``labels`` are stored as given; each item is converted to
    tensors on access.
    """

    def __init__(self, data, labels):
        self.labels = labels
        self.data = data

    def __len__(self):
        """Number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` as a float32 tensor and a long tensor."""
        features = torch.tensor(self.data[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

# Wrap both partitions as datasets, then batch them in groups of 32. Only the
# training loader shuffles; the test loader keeps the sample order.
train_dataset, test_dataset = TextDataset(X_train, y_train), TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)


In [None]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """LSTM followed by a linear layer that emits unnormalised class scores.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Not applied in forward(): the model returns raw scores, which is
        # what nn.CrossEntropyLoss expects. Kept so the attribute stays
        # available to existing callers.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class scores of shape ``(batch, output_dim)``.

        ``x`` may be ``(batch, input_dim)`` — each vector is then treated as
        a sequence of length one — or an already sequenced
        ``(batch, seq_len, input_dim)`` tensor.
        """
        if x.dim() == 2:
            # The LSTM is batch_first, so axis 1 is the sequence axis; insert
            # a length-1 sequence dimension (the batch axis already exists).
            x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        # Classify from the output at the final time step.
        return self.fc(lstm_out[:, -1, :])

# Classifier hyperparameters. The input width is read from the embedding
# matrix; the two outputs are the positive and negative class.
num_layers = 2
hidden_dim = 128
output_dim = 2  # Positive or Negative
input_dim = X.shape[1]

# Build the classifier and place it on the same device as the encoder.
lstm_model = LSTMClassifier(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
).to(device)


In [None]:
import torch.optim as optim

# Cross-entropy over the classifier's raw scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=lstm_model.parameters(), lr=1e-3)

# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    """Train ``model`` and print the mean batch loss after every epoch.

    Args:
        model: Module to train; put into training mode before the first epoch.
        train_loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` yields no batches in an epoch.
    """
    # Move batches to wherever the model's weights live instead of relying on
    # a module-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)

            optimizer.zero_grad()  # Clear gradients left over from the previous step
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Averaging over zero batches would otherwise divide by zero.
            raise ValueError("train_loader produced no batches")

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / num_batches:.4f}")

# Fit the LSTM classifier for ten epochs on the training batches.
train_model(
    model=lstm_model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluation function
def evaluate_model(model, test_loader, criterion):
    """Evaluate ``model`` and print mean loss, accuracy and a per-class report.

    Args:
        model: Module to evaluate; switched to eval mode.
        test_loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        criterion: Loss callable taking ``(outputs, labels)``.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    # Move batches to wherever the model's weights live instead of relying on
    # a module-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)

            outputs = model(inputs)
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

            # Predicted class = index of the highest score in each row.
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if num_batches == 0:
        # Averaging over zero batches would otherwise divide by zero.
        raise ValueError("test_loader produced no batches")

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds)

    print(f"Loss: {total_loss / num_batches:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{report}")

# Report loss, accuracy and per-class metrics on the held-out batches.
evaluate_model(model=lstm_model, test_loader=test_loader, criterion=criterion)
