In [1]:
import re
import string
import pandas as pd
import numpy as np
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from nltk.tokenize import word_tokenize
from collections import Counter
import spacy
import matplotlib.pyplot as plt

# Download NLTK data
nltk.download('punkt')

# Load spaCy model for entity extraction
nlp = spacy.load('tr_core_news_trf')

# Preprocess text function
def preprocess_text(text):
    """
    Function to preprocess text by converting to lowercase, removing punctuation, and replacing Turkish characters.
    """
    try:
        text = text.lower()
        text = re.sub(r'[çğışöü]', lambda x: {'ç': 'c', 'ğ': 'g', 'ı': 'i', 'ş': 's', 'ö': 'o', 'ü': 'u'}[x.group()], text)
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.strip()
        return text
    except Exception as e:
        print(f"Error in preprocess_text: {e}")
        return ""

# Build vocabulary from training texts
def build_vocabulary(texts):
    """
    Function to build vocabulary from a list of texts.
    """
    vocab = Counter()
    for text in texts:
        tokens = word_tokenize(text)
        vocab.update(tokens)
    return vocab

# Tokenize and encode text to sequences
def text_to_sequence(text, word2idx, max_len=512):
    """
    Function to convert text to a sequence of integers based on a provided word-to-index mapping.
    Pads or truncates sequences to a specified maximum length.
    """
    tokens = word_tokenize(text)
    seq = [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]
    if len(seq) < max_len:
        seq += [word2idx["<PAD>"]] * (max_len - len(seq))
    return seq[:max_len]

# Define Transformer block
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, feedforward_dim):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.feedforward = nn.Sequential(
            nn.Linear(embed_dim, feedforward_dim),
            nn.ReLU(),
            nn.Linear(feedforward_dim, embed_dim)
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        x = self.layernorm1(x + attn_output)
        feedforward_output = self.feedforward(x)
        x = self.layernorm2(x + feedforward_output)
        return x

# Define BERT-like model
class BERT(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, feedforward_dim, num_layers, max_len):
        super(BERT, self).__init__()
        self.token_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.position_embeddings = nn.Embedding(max_len, embed_dim)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, feedforward_dim) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, 3)  # For sentiment classification (negative, neutral, positive)

    def forward(self, x):
        seq_len = x.size(1)
        positions = torch.arange(0, seq_len).unsqueeze(0).expand_as(x).to(x.device)
        x = self.token_embeddings(x) + self.position_embeddings(positions)
        for layer in self.layers:
            x = layer(x)
        x = x.mean(dim=1)  # Average pooling
        x = self.fc(x)
        return x

# Training function
def train(model, dataloader, optimizer, device, epochs=10):
    """
    Function to train the model with specified number of epochs and return training loss history.
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    train_loss_history = []

    for epoch in range(epochs):
        total_loss = 0
        for step, batch in enumerate(dataloader):
            batch_input_ids, batch_labels = batch
            batch_input_ids, batch_labels = batch_input_ids.to(device), batch_labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(batch_input_ids)
            loss = loss_fn(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            if step % 10 == 0:
                print(f"Epoch {epoch}, Step {step}, Loss {loss.item()}")
        
        avg_loss = total_loss / len(dataloader)
        train_loss_history.append(avg_loss)
        print(f"Epoch {epoch} - Average Loss: {avg_loss}")
    
    return train_loss_history

# Evaluation function with additional metrics
def evaluate(model, dataloader, device):
    """
    Function to evaluate the model and return F1 score, accuracy, precision, and recall.
    """
    model.eval()
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            batch_input_ids, batch_labels = batch
            batch_input_ids, batch_labels = batch_input_ids.to(device), batch_labels.to(device)
            outputs = model(batch_input_ids)
            _, predicted = torch.max(outputs, 1)
            preds.extend(predicted.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())
    
    f1 = f1_score(true_labels, preds, average='weighted')
    accuracy = accuracy_score(true_labels, preds)
    precision = precision_score(true_labels, preds, average='weighted', zero_division=0)
    recall = recall_score(true_labels, preds, average='weighted')
    
    return f1, accuracy, precision, recall

# Print evaluation metrics
def print_evaluation(f1, accuracy, precision, recall):
    """
    Function to print evaluation metrics.
    """
    print(f"Weighted F1 Score: {f1}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

# Plot training loss history
def plot_training_history(train_loss_history):
    """
    Function to plot the training loss history.
    """
    plt.plot(train_loss_history)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training Loss History')
    plt.show()

# Predict entities and sentiments
def predict_entities_and_sentiments(text, model, word2idx, device):
    """
    Function to predict entities and sentiments from a given text using a pre-trained model.
    """
    model.eval()
    clean_text = preprocess_text(text)
    input_ids = torch.tensor([text_to_sequence(clean_text, word2idx)], dtype=torch.long).to(device)
    with torch.no_grad():
        outputs = model(input_ids)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()

    entities = [ent.text for ent in nlp(text).ents]
    sentiments = ["olumsuz" if p == 0 else "nötr" if p == 1 else "olumlu" for p in preds]
    
    result = {
        "entity_list": entities,
        "results": [{"entity": ent, "sentiment": sentiments[0]} for ent in entities]
    }
    
    return result

# Custom evaluation function
def custom_evaluation(predicted_entities, predicted_sentiments, true_entities, true_sentiments):
    """
    Function to evaluate the predicted entities and sentiments against true entities and sentiments.
    """
    entity_score = 0.65 if set(predicted_entities) == set(true_entities) else 0
    sentiment_score = 0.35 if set(predicted_sentiments) == set(true_sentiments) else 0
    total_score = entity_score + sentiment_score
    return total_score

# Save model
def save_model(model, path='model.pth'):
    """
    Function to save the trained model to a file.
    """
    try:
        torch.save(model, path)
        print(f"Model saved to {path}")
    except Exception as e:
        print(f"Error saving model: {e}")

# Load model
def load_model(path='model.pth'):
    """
    Function to load a trained model from a file.
    """
    try:
        model = torch.load(path)
        print(f"Model loaded from {path}")
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load and preprocess data
def load_and_preprocess_data(filepath):
    """
    Function to load and preprocess data from a CSV file.
    """
    try:
        data = pd.read_csv(filepath)
        data['clean_text'] = data['text'].apply(preprocess_text)
        return data
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame()

# Main function
def main():
    # Load and preprocess data
    data = load_and_preprocess_data('data.csv')
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    # Build vocabulary
    vocab = build_vocabulary(train_data['clean_text'])
    vocab_list = ["<PAD>", "<UNK>"] + sorted(vocab, key=vocab.get, reverse=True)
    word2idx = {word: idx for idx, word in enumerate(vocab_list)}

    # Convert text to sequences
    train_data['input_ids'] = train_data['clean_text'].apply(lambda x: text_to_sequence(x, word2idx))
    test_data['input_ids'] = test_data['clean_text'].apply(lambda x: text_to_sequence(x, word2idx))

    # Initialize model and set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocab_size = len(word2idx)
    embed_dim = 128
    num_heads = 8
    feedforward_dim = 512
    num_layers = 6
    max_len = 512
    model = BERT(vocab_size, embed_dim, num_heads, feedforward_dim, num_layers, max_len).to(device)

    # Convert data to PyTorch tensors
    input_ids = torch.tensor(list(train_data['input_ids']), dtype=torch.long).to(device)
    labels = torch.tensor(list(train_data['label']), dtype=torch.long).to(device)

    # Create DataLoader for training
    train_dataset = TensorDataset(input_ids, labels)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=8)

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Train model
    train_loss_history = train(model, train_dataloader, optimizer, device, epochs=10)

    # Prepare test data for evaluation
    test_input_ids = torch.tensor(list(test_data['input_ids']), dtype=torch.long).to(device)
    test_labels = torch.tensor(list(test_data['label']), dtype=torch.long).to(device)
    test_dataset = TensorDataset(test_input_ids, test_labels)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=8)

    # Evaluate model
    f1, accuracy, precision, recall = evaluate(model, test_dataloader, device)
    print_evaluation(f1, accuracy, precision, recall)
    
    # Plot training history
    plot_training_history(train_loss_history)

    # Save model
    save_model(model)

    # Example prediction
    example_text = """Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır
    @Twitch
    @Kick_Turkey
    gibi canlı yayın platformlarında 360p yayın izlerken donmalar
    yaşıyoruz. Başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip
    alamadığım hizmeti neden ödeyeyim ?
    @Turkcell"""

    example_result = predict_entities_and_sentiments(example_text, model, word2idx, device)
    print(f"Örnek Girdi: {example_text}")
    print(f"Örnek Çıktı: {example_result}")

    # Example evaluation
    true_entities = ["SuperOnline", "Twitch", "Kick_Turkey", "Başka hiç bir operatörler", "Turkcell"]
    true_sentiments = ["olumsuz", "nötr", "nötr", "olumlu", "olumsuz"]
    predicted_entities = example_result["entity_list"]
    predicted_sentiments = [res["sentiment"] for res in example_result["results"]]

    custom_score = custom_evaluation(predicted_entities, predicted_sentiments, true_entities, true_sentiments)
    print(f"Custom Evaluation Score: {custom_score}")

if __name__ == "__main__":
    main()


2024-05-31 23:13:10.399972: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-31 23:13:15.993161: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-05-31 23:13:15.993249: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-05-31 23:13:26.695213: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-

Epoch 0, Step 0, Loss 0.8787716627120972


IndexError: Target -1 is out of bounds.