In [1]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from tqdm.auto import tqdm
import string

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import unicodedata
import re
import torch.nn as nn
import torch.optim as optim
import math
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random

In [2]:
seed = 42

random.seed(seed)  # Python rastgele sayı üreticisine seed ayarla
np.random.seed(seed)  # NumPy rastgele sayı üreticisine seed ayarla
torch.manual_seed(seed)  # CPU için seed ayarla
torch.cuda.manual_seed(seed)  # GPU için seed ayarla (CUDA kullanıyorsanız)
torch.cuda.manual_seed_all(seed)  # Birden fazla GPU kullanıyorsanız
torch.backends.cudnn.deterministic = True  # Deterministik işlemler için
torch.backends.cudnn.benchmark = False  # Performans iyileştirmelerini devre dışı bırak


In [3]:
data = pd.read_csv("/data/data.csv", encoding="utf-8-sig")
data

Unnamed: 0,review,sirket,sentiment
0,okan üniversitesi hastanesi'nde çalışanların s...,okan üniversitesi hastanesi,olumsuz
1,ekici kaşar peynirinin peynir i̇le alakası yok...,ekici,olumsuz
2,i̇lhan duman hukuk bürosu borç mesajı geliyor....,"i̇lhan duman hukuk bürosu,vodafone","olumsuz,olumsuz"
3,samsung kamera flaş sorunu. alalı 1 gün oldu s...,samsung,olumsuz
4,diadermine lift and botology gündüz bakım krem...,diadermine,olumsuz
...,...,...,...
29026,tefal i̇ade sürecinin zorluğu. siparişimden bi...,tefal,olumsuz
29027,saat & saat sattığı ürünün arkasında değil. mi...,"saat & saat,michael kors","olumsuz,nötr"
29028,"yalı spor kargo yok, geri dönüş yok, telefon y...",yalı spor,olumsuz
29029,parex'in sapı kırıldı ve parçası yok. kayseri ...,"parex,kayseri gross market","olumsuz,nötr"


In [4]:
tokenizer = Tokenizer.from_file("/data/tokenizer.json")

In [5]:
class CustomDataset(Dataset):
    def __init__(self, texts, input_ids, attention_masks, token_type_ids, labels):
        self.texts = texts
        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_masks = attention_masks
        self.labels = labels


    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item ):
        text = self.texts[item]
        input_id = torch.LongTensor(self.input_ids[item])
        token_type_id = torch.LongTensor(self.token_type_ids[item])
        attention_mask = torch.LongTensor(self.attention_masks[item])
        label = torch.LongTensor(self.labels[item])


        return {
            'text': text,
            'input_ids': input_id,
            'token_type_ids': token_type_id,
            'attention_mask': attention_mask,
            'labels': label,
        }

In [6]:
def tag_and_sent_sentence(sentence, company_names, sentiments):

    words = tokenizer.encode(sentence).tokens
    tags = ["O"] * len(words)
    company_names = company_names.split(",")
    sentiments = sentiments.split(",")
    try:
        for company,sentiment in zip(company_names,sentiments):
            company_words = company.split()
            for i in range(len(words) - len(company_words) + 1):
                words = [unicodedata.normalize('NFC', word) for word in words]
                company_words = [unicodedata.normalize('NFC', word) for word in company_words]
                if words[i:i + len(company_words)] == company_words:
                    tags[i] = "B-COMPANY"+f"_{sentiment}"
                    for j in range(1, len(company_words)):
                        tags[i + j] = "I-COMPANY"
    except:print(company_names,sentiments)
    

    return list(zip(words, tags))

In [7]:
tagged_sentences = [tag_and_sent_sentence(sentence, sirketler, sentiments) for sentence, sirketler, sentiments in tqdm(data[["review","sirket","sentiment"]].values)]

  0%|          | 0/29031 [00:00<?, ?it/s]

In [8]:
tag2id = {"O": 0, "B-COMPANY_olumsuz": 1, "B-COMPANY_nötr": 2, "B-COMPANY_olumlu": 3, "I-COMPANY": 4}
id2tag = {value: key for key, value in tag2id.items()}

labels = [[tag for word, tag in sentence] for sentence in tagged_sentences]
labels = [[tag2id[tag] for tag in doc] for doc in labels]

In [9]:
from collections import Counter
Counter([label for labelx in labels for label in labelx])

Counter({0: 3649291, 1: 30282, 4: 15302, 3: 13784, 2: 7309})

In [10]:
encodings = [tokenizer.encode(text) for text in tqdm(data["review"].values)]

  0%|          | 0/29031 [00:00<?, ?it/s]

In [11]:
texts = [encoded.tokens for encoded in encodings]
input_ids = [encoded.ids for encoded in encodings]
attention_masks = [encoded.attention_mask for encoded in encodings]
token_type_ids = [encoded.type_ids for encoded in encodings]

In [12]:
class FeedForwardSubLayer(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForwardSubLayer, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        return self.norm2(x + self.dropout(ff_output))


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Set the number of attention heads
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0 #dimension, headlere tam bölünüyormu kontrol et.
        
        self.head_dim = d_model // num_heads
        # Set up the linear transformations
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)
        self.output_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        # sequenceyi başlıklar arasında böl
        x = x.view(batch_size, -1, self.num_heads, self.head_dim)
        return x.permute(0, 2, 1, 3) 

    def compute_attention(self, query, key, mask=None):
        # Compute dot-product attention scores
        scores = torch.matmul(query, key.permute(0,1,3,2))
        mask = mask.unsqueeze(1).unsqueeze(1)


        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))
        # Normalize attention scores into attention weights
        attention_weights = F.softmax(scores, dim=-1)
        return attention_weights

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        query = self.split_heads(self.query_linear(query), batch_size)
        key = self.split_heads(self.key_linear(key), batch_size)
        value = self.split_heads(self.value_linear(value), batch_size)

        attention_weights = self.compute_attention(query, key, mask)

        output = torch.matmul(attention_weights, value)
        output = output.view(batch_size, self.num_heads, -1, self.head_dim).permute(0, 2, 1, 3).contiguous().view(
            batch_size, -1, self.d_model)
        return self.output_linear(output)

class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_length):
        super(PositionalEncoder, self).__init__()
        self.d_model = d_model
        self.max_length = max_length

        pe = torch.zeros(max_length, d_model)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model))

        # Calculate and assign position encodings to the matrix
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x
    
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)
        self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

    def forward(self, x, mask):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x

In [13]:
num_classes = 5 #tahmin edilcel class sayısı, 0-1-2-3-4
vocab_size = 100000 #sözlüğümüzdeki kelime sayısı
batch_size = 32 #tensorler kaçlık batchler halinde olacak
d_model = 512 # modelin dimensionu
num_heads = 8 #kaç tane attention olacak, multihead attention sayısı, burada 8 farklı context vektör tanımladık.
num_layers = 6 #6 tane encoder katmanı yapıyoruz, sırasıyla birbirlerine çıktılarını veriyolar.
d_ff = 2048 #linear layer 
sequence_length = 128 #metin uzunluğu
dropout = 0.1 #unutma katsayısı

In [14]:
dataset = CustomDataset(texts, input_ids, attention_masks, token_type_ids, labels)
dataset_val,dataset_train  = torch.utils.data.random_split(dataset, [0.05,0.95])

dataloader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)

In [15]:
encoder = TransformerEncoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)

In [16]:
def save_model(model, optimizer, epoch, path="model.pth"):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, path)

In [17]:
def train_model(model, train_dataloader, val_dataloader, num_epochs, learning_rate, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss(reduction='none')  # Compute loss for each token separately
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for dataset in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{num_epochs}"):
            batch_input_ids = dataset['input_ids'].to(device)
            batch_att_mask = dataset['attention_mask'].to(device)
            batch_target = dataset['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(batch_input_ids, batch_att_mask)  # Assume no mask for simplicity

            logits = outputs.view(-1, outputs.size(-1))  # Flatten the outputs
            batch_target = batch_target.view(-1)
            batch_att_mask = batch_att_mask.view(-1)

            loss = criterion(logits, batch_target)
            loss = (loss * batch_att_mask).sum() / batch_att_mask.sum()  # Apply attention mask and average

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")
        
        
        # Validation step (if val_dataloader is provided)
        if val_dataloader is not None:
            model.eval()
            val_loss = 0
            correct = 0
            total = 0
            with torch.no_grad():
                for dataset_val in tqdm(val_dataloader, desc=f"Validation Epoch {epoch + 1}/{num_epochs}"):
                    val_input_ids = dataset_val['input_ids'].to(device)
                    val_att_mask = dataset_val['attention_mask'].to(device)
                    val_target = dataset_val['labels'].to(device)
                    
                    accuracy_mask = (val_target != 0).long()
                    accuracy_mask = accuracy_mask.view(-1)
                    
                    
                    outputs = model(val_input_ids, val_att_mask)

                    logits = outputs.view(-1, outputs.size(-1))
                    val_target = val_target.view(-1)
                    val_att_mask = val_att_mask.view(-1)

                    loss = criterion(logits, val_target)
                    loss = (loss * val_att_mask).sum() / val_att_mask.sum()  # Apply attention mask and average

                    val_loss += loss.item()
                    _, predicted = torch.max(logits, 1)
                    total += accuracy_mask.sum().item()  # Only count the masked tokens
                    correct += ((predicted == val_target) * accuracy_mask).sum().item()  # Only consider masked tokens for accuracy

            avg_val_loss = val_loss / len(val_dataloader)
            accuracy = correct / total
            print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

    print("Training complete!")
    save_model(model, optimizer, num_epochs)
    return model,optimizer, num_epochs

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [19]:
final_model, optimizer,epoch = train_model(model=encoder,
                              train_dataloader=dataloader,
                              val_dataloader=val_loader,
                              num_epochs=10,
                              learning_rate=1e-4,
                              device=device)

Training Epoch 1/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 1, Loss: 0.1201


Validation Epoch 1/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0487, Accuracy: 0.8347


Training Epoch 2/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 2, Loss: 0.0379


Validation Epoch 2/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0377, Accuracy: 0.8908


Training Epoch 3/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 3, Loss: 0.0255


Validation Epoch 3/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0364, Accuracy: 0.8807


Training Epoch 4/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 4, Loss: 0.0187


Validation Epoch 4/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0412, Accuracy: 0.8644


Training Epoch 5/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 5, Loss: 0.0153


Validation Epoch 5/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0384, Accuracy: 0.8917


Training Epoch 6/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 6, Loss: 0.0123


Validation Epoch 6/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0402, Accuracy: 0.9027


Training Epoch 7/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 7, Loss: 0.0110


Validation Epoch 7/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0420, Accuracy: 0.8923


Training Epoch 8/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 8, Loss: 0.0094


Validation Epoch 8/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0476, Accuracy: 0.8846


Training Epoch 9/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 9, Loss: 0.0087


Validation Epoch 9/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0460, Accuracy: 0.8908


Training Epoch 10/10:   0%|          | 0/862 [00:00<?, ?it/s]

Epoch 10, Loss: 0.0078


Validation Epoch 10/10:   0%|          | 0/46 [00:00<?, ?it/s]

Validation Loss: 0.0444, Accuracy: 0.8947
Training complete!


In [26]:
def predict(model, dataloader, device, padding_idx=0):
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        for dataset in tqdm(dataloader, desc="Predicting"):
            batch_input_ids = dataset['input_ids'].to(device)
            batch_att_mask = dataset['attention_mask'].to(device)

            outputs = model(batch_input_ids, batch_att_mask)
            logits = outputs.view(-1, outputs.size(-1))  # Flatten the outputs
            _, predicted = torch.max(logits, 1)

            predictions.append(predicted)
        return predictions

In [27]:
example="""Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_Turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz.  Başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim ? @Turkcell """

In [28]:
encodings_prdict = tokenizer.encode(example)

predict_texts = [encodings_prdict.tokens]
predict_input_ids = [encodings_prdict.ids]
predict_attention_masks = [encodings_prdict.attention_mask]
predict_token_type_ids = [encodings_prdict.type_ids]
prediction_labels = [encodings_prdict.type_ids]

predict_data = CustomDataset(predict_texts, predict_input_ids, predict_attention_masks, predict_token_type_ids, prediction_labels)
predict_loader = DataLoader(predict_data, batch_size=1, shuffle=False)

predict_list = predict(model=final_model,dataloader=predict_loader,device=device)

Predicting:   0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
for token,prediction,mask in zip(predict_loader.dataset[0]["text"],predict_list[0].tolist(),predict_attention_masks[0]):
    if mask !=0:
        print(token,id2tag.get(prediction))

[CLS] O
fiber O
100 O
##mb O
superonline B-COMPANY_olumsuz
kullanıcısıyım O
yaklaşık O
2 O
haftadır O
@ O
twitch B-COMPANY_olumsuz
@ O
kick B-COMPANY_olumsuz
_ O
turkey O
gibi O
canlı O
yayın O
platformlarında O
360 O
##p O
yayın O
izlerken O
donmalar O
yaşıyoruz O
. O
başka O
hiç O
bir O
operatörler O
bu O
sorunu O
yaşamazken O
ben O
parasını O
verip O
alamadığım O
hizmeti O
neden O
ödeyeyim O
? O
@ O
turkcell B-COMPANY_olumsuz
[SEP] O
[PAD] O
