Import libraries

In [1]:
pip install torch pytorch-crf unidecode

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import os
import string
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AdamW, BertModel, AutoConfig, AutoTokenizer
from torch.optim import Adam
import torch.nn as nn
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
import time
import datetime
from sklearn.model_selection import train_test_split
from torchcrf import CRF

In [3]:
if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

We will use the GPU: Tesla P100-PCIE-16GB


Read data

In [4]:
# nlp_train=pd.read_csv("train.csv", index_col=[0])
# nlp_test=pd.read_csv("test.csv",index_col=[0],encoding="windows-1252") 

nlp_train=pd.read_csv("/kaggle/input/nlp-project-train/train.csv", index_col=[0])
nlp_test=pd.read_csv("/kaggle/input/nlp-project-train/test.csv",index_col=[0],encoding="windows-1252")

Functions to manipulate data

In [5]:
def convert_to_ascii(text):
    return unidecode(text)

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
        text = text.replace('  ', ' ')
    return text.strip()

In [6]:
# Apply remove punctuations functions to Sentence column
nlp_train['Sentence'] = nlp_train['Sentence'].apply(remove_punctuations)

# applying the conversion functions
nlp_train["Label"] = nlp_train["Sentence"]
nlp_train["Sentence"] = nlp_train["Sentence"].apply(convert_to_ascii)

In [7]:
nlp_test

Unnamed: 0_level_0,Sentence
ID,Unnamed: 1_level_1
0,tr ekonomi ve politika haberleri turkiye nin ...
1,uye girisi
2,son guncelleme 12:12
3,Imrali Mit gorusmesi ihtiyac duyuldukca oluyor
4,Suriye deki silahli selefi muhalifler yeni ku...
...,...
1152,Yuregir Adana ilimize ait sirin bir ilcedir
1153,yuze guluculugun at oynattigi bir aydinlar ort...
1154,zavalli adami oracikta astilar ve hic kimse se...
1155,zengin cocuklarina ariz munasebetsizlikler fak...


In [8]:
sentences_train, labels_train = nlp_train.Sentence.values, nlp_train.Label.values

Tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=251003.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=60.0, style=ProgressStyle(description_w…




Functions to segment data

In [10]:
def segment_text(sentence, label, max_length=512, overlap=50):
    tokens = tokenizer.tokenize(sentence)
    new = []
    if len(tokens) <= max_length:
        return sentence, label
    else: 
        return None, None
    
    
def data_segments(sentences, labels, max_length=128):
    all_text = []
    all_labels = []

    for sentence, label in zip(sentences, labels):
        segment_s, segment_l = segment_text(sentence, label, max_length=max_length, overlap=50)
        if segment_s:
            all_text.append(segment_s)
            all_labels.append(segment_l)
            
    return all_text, all_labels

In [11]:
train_sentences, train_labels = data_segments(sentences_train, labels_train)

In [12]:
indices=tokenizer.batch_encode_plus(train_sentences,max_length=128,add_special_tokens=True, return_attention_mask=True,pad_to_max_length=True,truncation=True)
input_ids=indices["input_ids"]
attention_masks=indices["attention_mask"]
print(input_ids[0])
print(train_sentences[0])
print(attention_masks[0])

[2, 16751, 1066, 8725, 1992, 29252, 4456, 22063, 5484, 18740, 13526, 1027, 26905, 24419, 3575, 1028, 2031, 21070, 2194, 1996, 5538, 14330, 2033, 2002, 9474, 2293, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
sinif havuz ve acik deniz calismalariyla tum dunyada gecerli basarili bir standart olusturmustur
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [13]:
indices=tokenizer.batch_encode_plus(train_labels,max_length=128,add_special_tokens=True, return_attention_mask=True,pad_to_max_length=True,truncation=True)
output_ids=indices["input_ids"]
print(output_ids[0])
print(train_labels[0])

[2, 3825, 8725, 1992, 2416, 4456, 24513, 2525, 5292, 5953, 4165, 1996, 5538, 27202, 2293, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
sınıf havuz ve açık deniz çalışmalarıyla tüm dünyada geçerli başarılı bir standart oluşturmuştur


Prepare train and test data

In [14]:
# Use 99% for training and 1% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, output_ids, 
                                                            random_state=42, test_size=0.2)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, output_ids,
                                             random_state=42, test_size=0.2)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels, dtype=torch.long)
validation_labels = torch.tensor(validation_labels, dtype=torch.long)
train_masks = torch.tensor(train_masks, dtype=torch.long)
validation_masks = torch.tensor(validation_masks, dtype=torch.long)


batch_size = 16

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, pin_memory=True)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_loader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size, pin_memory=True)

In [15]:
def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [16]:
class BertSeq2Seq(nn.Module):
    def __init__(self, model_name, config):
        super(BertSeq2Seq, self).__init__()
        self.config = config
        self.bert_encoder = BertModel.from_pretrained(model_name, config=config)
        self.decoder = nn.GRU(input_size=config.hidden_size, 
                              hidden_size=config.hidden_size, 
                              num_layers=1, 
                              batch_first=True)
        self.out = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get outputs from the encoder
        encoder_outputs = self.bert_encoder(input_ids, attention_mask=attention_mask)
        
        # Check if the encoder outputs are in a tuple and get the last hidden state
        if isinstance(encoder_outputs, tuple):  # Older versions might return a tuple
            encoder_last_hidden_state = encoder_outputs[0]
        else:  # Newer versions return a model-specific output object
            encoder_last_hidden_state = encoder_outputs.last_hidden_state

        # Pass the last hidden state to the decoder
        decoder_outputs, _ = self.decoder(encoder_last_hidden_state)
        logits = self.out(decoder_outputs)

        outputs = (logits,)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # returns (loss, logits) if labels provided

In [17]:
config = AutoConfig.from_pretrained("dbmdz/bert-base-turkish-cased")
config.vocab_size = 32000
config.num_attention_heads = 8

# Initialize your model with the loaded configuration
model = BertSeq2Seq("dbmdz/bert-base-turkish-cased", config)

optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  betas=[0.9,0.999],
                  eps = 1e-6
                )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# total_steps = len(train_dataloader) * epochs
total_steps = 1
epochs = 1

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445018508.0, style=ProgressStyle(descri…




In [18]:
# # Function to measure time elapsed
# def format_time(elapsed):
#     return str(datetime.timedelta(seconds=int(round((elapsed)))))

# for epoch_i in range(epochs):
#     print(f'======== Epoch {epoch_i + 1} / {epochs} ========')
#     print('Training...')

#     total_loss = 0
#     model.train()
#     t0 = time.time()  # Start time for the epoch

#     for step, batch in enumerate(train_loader):
#         if step % 30 == 0 and step != 0:
#             elapsed = format_time(time.time() - t0)
#             print(f'  Batch {step:>5,} of {len(train_loader):>5,}. Elapsed: {elapsed}.')

#         b_input_ids = batch[0].to(device)
#         b_input_mask = batch[1].to(device)
#         b_labels = batch[2].to(device)

#         model.zero_grad()
#         outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
#         loss = outputs[0] if isinstance(outputs, tuple) else outputs.loss

#         total_loss += loss.item()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()
#         scheduler.step()

#     avg_train_loss = total_loss / len(train_loader)
#     print(f"  Average training loss: {avg_train_loss:.2f}")

#     # Validation
#     print("Running Validation...")
#     model.eval()
#     eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0

#     for batch in validation_loader:
#         b_input_ids = batch[0].to(device)
#         b_input_mask = batch[1].to(device)
#         b_labels = batch[2].to(device)

#         with torch.no_grad():
#             outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
#             logits = outputs[1] if isinstance(outputs, tuple) else outputs.logits

#         logits = logits.detach().cpu().numpy()
#         label_ids = b_labels.to('cpu').numpy()

#         # Example of how you might calculate accuracy for seq2seq. Adjust as necessary.
#         tmp_eval_accuracy = np.mean(np.argmax(logits, axis=-1) == label_ids)
#         eval_accuracy += tmp_eval_accuracy
#         nb_eval_steps += 1

#     print(f"  Validation Accuracy: {eval_accuracy / nb_eval_steps:.2f}")

In [19]:
class BertBiLSTMCRF(nn.Module):
    def __init__(self, bert_model, num_labels, lstm_hidden_dim, lstm_layers=1, bidirectional=True, dropout=0.1):
        super(BertBiLSTMCRF, self).__init__()
        self.num_labels = num_labels
        self.lstm_hidden_dim = lstm_hidden_dim
        self.bert = BertModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(dropout)
        self.bilstm = nn.LSTM(self.bert.config.hidden_size, lstm_hidden_dim, num_layers=lstm_layers, bidirectional=bidirectional, batch_first=True, dropout=dropout)
        self.hidden2tag = nn.Linear(lstm_hidden_dim * 2 if bidirectional else lstm_hidden_dim, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs[0])
        lstm_output, _ = self.bilstm(sequence_output)
        emissions = self.hidden2tag(lstm_output)
        if labels is not None:
            loss = -self.crf(emissions, labels, mask=attention_mask.byte())
            return loss
        else:
            predictions = self.crf.decode(emissions, mask=attention_mask.byte())
            return predictions


In [20]:
# tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
# encoded_batch = tokenizer(train_sentences, padding=True, truncation=True, return_tensors="pt")
# input_ids = encoded_batch['input_ids']
# attention_mask_train = encoded_batch['attention_mask']

In [21]:
# Assume sentences_train and labels_train are your input IDs and labels respectively
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

In [22]:
unique_labels = nlp_train['Label'].unique()
num_labels = len(unique_labels)

In [23]:
model = BertBiLSTMCRF('dbmdz/bert-base-turkish-cased', num_labels=num_labels, lstm_hidden_dim=256)
model.to(device)

  "num_layers={}".format(dropout, num_layers))


BertBiLSTMCRF(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [24]:
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 4

model.train()

for epoch in range(epochs):
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()

        # Forward pass
        loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Average loss for epoch {epoch+1}: {avg_loss}")

RuntimeError: CUDA out of memory. Tried to allocate 199.20 GiB (GPU 0; 15.89 GiB total capacity; 11.21 GiB already allocated; 3.84 GiB free; 11.33 GiB reserved in total by PyTorch)