In [None]:
# ml.ipynb

import pandas as pd
import re
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import AdamW
from collections import Counter
from nltk.tokenize import word_tokenize
import spacy

# Load spaCy model for entity extraction.
# NOTE(review): `nlp` is not referenced by any function visible in this file,
# and this load runs at import time; confirm it is still needed before keeping
# the dependency on the 'tr_core_news_trf' pipeline.
nlp = spacy.load('tr_core_news_trf')

# Text normalisation used before tokenisation.
# One translation table does two jobs in a single pass: it folds the
# Turkish-specific lowercase letters to their ASCII look-alikes and deletes
# every ASCII punctuation character.
_TURKISH_FOLD_AND_STRIP = str.maketrans('çğışöü', 'cgisou', string.punctuation)


def preprocess_text(text):
    """Normalise raw text to lowercase, ASCII-folded, punctuation-free form.

    Processing steps:
      1. lowercase (with explicit handling of the Turkish dotted capital I),
      2. remove every run of digits,
      3. fold ç/ğ/ı/ş/ö/ü to c/g/i/s/o/u and drop ASCII punctuation,
      4. strip leading/trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The normalised string (possibly empty).
    """
    # str.lower() maps 'İ' (U+0130) to 'i' followed by U+0307 (combining dot
    # above). That mark is not ASCII punctuation, so it would survive the
    # cleanup and "İstanbul" would never match "istanbul". Fold it up front.
    text = text.replace('İ', 'i').lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(_TURKISH_FOLD_AND_STRIP)
    return text.strip()

# Build vocabulary from training texts
def build_vocabulary(texts):
    """Count how often each token occurs across all given texts.

    Args:
        texts: An iterable of strings; each one is split with
            ``word_tokenize``.

    Returns:
        A ``Counter`` mapping token -> number of occurrences.
    """
    return Counter(token for text in texts for token in word_tokenize(text))

# Tokenize and encode text to sequences
def text_to_sequence(text, word2idx, max_len=512):
    """Encode a text as a list of vocabulary indices cut or padded to ``max_len``.

    Args:
        text: The string to tokenise with ``word_tokenize``.
        word2idx: Mapping from token to index; tokens missing from it are
            encoded with the index stored under ``"<UNK>"``, and padding uses
            the index stored under ``"<PAD>"``.
        max_len: Length of the returned sequence.

    Returns:
        A list of indices, right-padded when the text is shorter than
        ``max_len`` and sliced to ``max_len`` otherwise.
    """
    ids = [word2idx.get(tok, word2idx["<UNK>"]) for tok in word_tokenize(text)]
    shortfall = max_len - len(ids)
    if shortfall > 0:
        # Only short sequences need padding on the right.
        ids.extend([word2idx["<PAD>"]] * shortfall)
    return ids[:max_len]

# Define Transformer block
class TransformerBlock(nn.Module):
    """Post-norm Transformer encoder block (self-attention + feed-forward).

    Input and output tensors are batch-first: ``(batch, seq_len, embed_dim)``.

    Args:
        embed_dim: Size of the token representation.
        num_heads: Number of attention heads (must divide ``embed_dim``).
        feedforward_dim: Hidden size of the position-wise feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, feedforward_dim):
        super().__init__()
        # nn.MultiheadAttention defaults to (seq, batch, embed). The BERT model
        # in this file feeds (batch, seq, embed), so without batch_first=True
        # attention would mix different samples of the batch instead of the
        # tokens of one sequence. The flag does not change parameter names, so
        # existing state_dicts still load.
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.feedforward = nn.Sequential(
            nn.Linear(embed_dim, feedforward_dim),
            nn.ReLU(),
            nn.Linear(feedforward_dim, embed_dim),
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, key_padding_mask=None):
        """Apply self-attention and the feed-forward net with residual links.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_dim)``.
            key_padding_mask: Optional bool tensor ``(batch, seq_len)``; ``True``
                marks positions that must be ignored as attention keys
                (e.g. padding). ``None`` attends to every position.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.attention(
            x, x, x, key_padding_mask=key_padding_mask, need_weights=False
        )
        x = self.layernorm1(x + attn_output)
        x = self.layernorm2(x + self.feedforward(x))
        return x

# Define BERT-like model
class BERT(nn.Module):
    """Small BERT-like encoder with a mean-pooled classification head.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Embedding / hidden size.
        num_heads: Attention heads per Transformer block.
        feedforward_dim: Hidden size of each block's feed-forward net.
        num_layers: Number of stacked ``TransformerBlock`` layers.
        max_len: Longest sequence the learned position embeddings support.
        num_classes: Size of the output layer. Defaults to 3
            (negative, neutral, positive sentiment).
        pad_idx: Token id treated as padding and excluded from the pooled
            average. Defaults to 0. Pass ``None`` to average over every
            position (the previous behaviour).
    """

    def __init__(self, vocab_size, embed_dim, num_heads, feedforward_dim, num_layers, max_len,
                 num_classes=3, pad_idx=0):
        super().__init__()
        self.max_len = max_len
        self.pad_idx = pad_idx
        self.token_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.position_embeddings = nn.Embedding(max_len, embed_dim)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, feedforward_dim) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Long tensor of token ids with shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch, num_classes)``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            # Fail early with a clear message instead of an embedding index error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.max_len}"
            )

        if self.pad_idx is None:
            keep = torch.ones_like(x, dtype=torch.bool)
        else:
            keep = x != self.pad_idx

        # Build position ids directly on the input's device (no CPU round trip).
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand_as(x)
        hidden = self.token_embeddings(x) + self.position_embeddings(positions)
        for layer in self.layers:
            hidden = layer(hidden)

        # Masked average pooling: padding positions must not dilute the
        # sentence representation. clamp() guards all-padding rows against 0/0.
        weights = keep.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
        return self.fc(pooled)

# Training function
def train(model, dataloader, optimizer, device, epochs=3):
    """Optimise ``model`` with cross-entropy loss for a number of epochs.

    Args:
        model: Module that maps a batch of inputs to class logits.
        dataloader: Iterable of batches; element 0 holds the inputs and
            element 1 the integer class labels.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over ``dataloader``.

    The loss is printed every 100th step (including step 0) of each epoch.
    """
    model.train()
    for epoch in range(epochs):
        for step, batch in enumerate(dataloader):
            features, targets = (batch[i].to(device) for i in (0, 1))

            optimizer.zero_grad()
            loss = F.cross_entropy(model(features), targets)
            loss.backward()
            optimizer.step()

            if not step % 100:
                print(f'Epoch {epoch}, Step {step}, Loss {loss.item()}')

# Evaluation function
def evaluate(model, dataloader, device):
    """Compute the weighted F1 score of ``model`` over ``dataloader``.

    Args:
        model: Module that maps a batch of inputs to class logits.
        dataloader: Iterable of batches; element 0 holds the inputs and
            element 1 the integer class labels.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The weighted-average F1 score as computed by ``f1_score``.
    """
    model.eval()
    predicted, expected = [], []

    # Inference only: gradients are not needed.
    with torch.no_grad():
        for batch in dataloader:
            features = batch[0].to(device)
            targets = batch[1].to(device)
            # The class with the highest logit is the prediction.
            guesses = model(features).argmax(dim=1)
            predicted += guesses.cpu().tolist()
            expected += targets.cpu().tolist()

    return f1_score(expected, predicted, average='weighted')

# Load and preprocess data
def load_and_preprocess_data(filepath):
    """Read a CSV file and add a normalised copy of its ``text`` column.

    Args:
        filepath: Path of the CSV file; it must contain a ``text`` column.

    Returns:
        The loaded DataFrame with an extra ``clean_text`` column holding
        ``preprocess_text`` applied to every ``text`` value.
    """
    return pd.read_csv(filepath).assign(
        clean_text=lambda frame: frame['text'].apply(preprocess_text)
    )

# Main function and its data-loading helper
def _make_dataloader(frame, word2idx, max_len, sampler_cls, batch_size=8):
    """Encode ``frame`` and wrap it in a DataLoader using ``sampler_cls``."""
    encoded = [text_to_sequence(text, word2idx, max_len) for text in frame['clean_text']]
    # Tensors stay on the CPU: train()/evaluate() move each batch to the
    # device, so the whole dataset never has to fit in accelerator memory.
    dataset = TensorDataset(
        torch.tensor(encoded, dtype=torch.long),
        torch.tensor(list(frame['label']), dtype=torch.long),
    )
    return DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=batch_size)


def main():
    """Train the sentiment classifier on ``data.csv`` and report its F1 score.

    Pipeline: load and clean the CSV, remap labels -1/0/1 to 0/1/2, split
    80/20, build the vocabulary from the training split only, train the model,
    save its weights to ``bert_sentiment_model.pth`` and print the weighted F1
    score on the held-out split.

    Raises:
        ValueError: If the ``label`` column contains a value other than
            -1, 0 or 1.
    """
    # Model hyper-parameters. max_len is shared by the encoder and the model
    # so the two can never drift apart.
    embed_dim = 128
    num_heads = 8
    feedforward_dim = 512
    num_layers = 6
    max_len = 512
    batch_size = 8

    # Load and preprocess data
    data = load_and_preprocess_data('data.csv')

    # Remap labels to 0, 1, 2 for cross-entropy. Series.map() turns unknown
    # values into NaN, which would only surface later as a confusing tensor
    # conversion problem, so reject them here.
    label_mapping = {-1: 0, 0: 1, 1: 2}
    mapped = data['label'].map(label_mapping)
    unmapped = mapped.isna()
    if unmapped.any():
        unexpected = sorted(data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"Unexpected label values in data.csv: {unexpected}; "
            f"expected one of {sorted(label_mapping)}"
        )
    data['label'] = mapped.astype('int64')

    # Split dataset into training and testing sets
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    # Build vocabulary from the training split; words seen once map to <UNK>.
    vocab = build_vocabulary(train_data['clean_text'])
    vocab_list = ["<PAD>", "<UNK>"] + [word for word, count in vocab.items() if count > 1]
    word2idx = {word: idx for idx, word in enumerate(vocab_list)}

    # Encode both splits without writing new columns onto the split frames
    # (assigning to them triggers pandas' SettingWithCopyWarning).
    train_dataloader = _make_dataloader(train_data, word2idx, max_len, RandomSampler, batch_size)
    test_dataloader = _make_dataloader(test_data, word2idx, max_len, SequentialSampler, batch_size)

    # Initialize model and set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERT(len(word2idx), embed_dim, num_heads, feedforward_dim, num_layers, max_len).to(device)

    # transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
    # weight_decay are set explicitly to the old implementation's defaults so
    # the optimisation hyper-parameters stay the same.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
    train(model, train_dataloader, optimizer, device)

    # Save the model
    # NOTE(review): only the weights are saved; word2idx is not persisted, so
    # inference later requires rebuilding the identical vocabulary.
    torch.save(model.state_dict(), 'bert_sentiment_model.pth')
    print("Model saved as 'bert_sentiment_model.pth'")

    # Evaluate model
    f1 = evaluate(model, test_dataloader, device)
    print(f'Weighted F1 Score: {f1}')

# Run the full training/evaluation pipeline only when this file is executed
# directly (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


2024-05-31 17:46:41.989304: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-31 17:46:47.482026: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-05-31 17:46:47.482061: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-05-31 17:46:59.184943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-

Epoch 0, Step 0, Loss 0.7886448502540588
Epoch 1, Step 0, Loss 1.0135451555252075
Epoch 2, Step 0, Loss 0.7972089052200317
Model saved as 'bert_sentiment_model.pth'
Weighted F1 Score: 0.569216757741348
