Import libraries

In [5]:
# pip install pytorch-crf

In [6]:
import numpy as np
import pandas as pd
import os
import string
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AdamW, BertModel, AutoConfig, AutoTokenizer
from torch.optim import Adam
import torch.nn as nn
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
import time
import datetime
from sklearn.model_selection import train_test_split
from torchcrf import CRF

In [7]:
if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


Read data

In [8]:
nlp_train=pd.read_csv("train.csv", index_col=[0])
nlp_test=pd.read_csv("test.csv",index_col=[0],encoding="windows-1252") 

#nlp_train=pd.read_csv("/kaggle/input/nlp-project-train/train.csv", index_col=[0])
#nlp_test=pd.read_csv("/kaggle/input/nlp-project-train/test.csv",index_col=[0],encoding="windows-1252")

Functions to manipulate data

In [9]:
def convert_to_ascii(text):
    return unidecode(text)

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
        text = text.replace('  ', ' ')
    return text.strip()

In [10]:
nlp_train['Sentence'] = nlp_train['Sentence'].apply(remove_punctuations)
nlp_train["Label"] = nlp_train["Sentence"]
nlp_train["Sentence"] = nlp_train["Sentence"].apply(convert_to_ascii)

In [11]:
sentences_train, labels_train = nlp_train.Sentence.values, nlp_train.Label.values

Tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

Functions to segment data

In [13]:
def segment_text(sentence, label, max_length=128, overlap=50):
    tokens = tokenizer.tokenize(sentence)
    new = []
    if len(tokens) <= max_length:
        return sentence, label
    else: 
        return None, None
    
    
def data_segments(sentences, labels, max_length=32):
    all_text = []
    all_labels = []

    for sentence, label in zip(sentences, labels):
        segment_s, segment_l = segment_text(sentence, label, max_length=max_length, overlap=50)
        if segment_s:
            all_text.append(segment_s)
            all_labels.append(segment_l)
            
    return all_text, all_labels

In [14]:
train_sentences_segment, train_labels_segment = data_segments(sentences_train, labels_train)

Token indices sequence length is longer than the specified maximum sequence length for this model (6086 > 512). Running this sequence through the model will result in indexing errors


In [15]:
indices=tokenizer.batch_encode_plus(train_sentences_segment,max_length=128,add_special_tokens=True, return_attention_mask=True,pad_to_max_length=True,truncation=True)
input_ids=indices["input_ids"]
attention_masks=indices["attention_mask"]
print(input_ids[0])
print(train_sentences_segment[0])
print(attention_masks[0])



[2, 16751, 1066, 8725, 1992, 29252, 4456, 22063, 5484, 18740, 13526, 1027, 26905, 24419, 3575, 1028, 2031, 21070, 2194, 1996, 5538, 14330, 2033, 2002, 9474, 2293, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
sinif havuz ve acik deniz calismalariyla tum dunyada gecerli basarili bir standart olusturmustur
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [16]:
indices=tokenizer.batch_encode_plus(train_labels_segment,max_length=128,add_special_tokens=True, return_attention_mask=True,pad_to_max_length=True,truncation=True)
output_ids=indices["input_ids"]
print(output_ids[0])
print(train_labels_segment[0])

[2, 3825, 8725, 1992, 2416, 4456, 24513, 2525, 5292, 5953, 4165, 1996, 5538, 27202, 2293, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
sınıf havuz ve açık deniz çalışmalarıyla tüm dünyada geçerli başarılı bir standart oluşturmuştur


Prepare train and test data

In [17]:
# Use 99% for training and 1% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,output_ids, 
                                                            random_state=42, test_size=0.2)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, output_ids,
                                             random_state=42, test_size=0.2)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels, dtype=torch.long)
validation_labels = torch.tensor(validation_labels, dtype=torch.long)
train_masks = torch.tensor(train_masks, dtype=torch.long)
validation_masks = torch.tensor(validation_masks, dtype=torch.long)


In [18]:
def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

## Combined Model

In [19]:
# unique_labels = nlp_train['Label'].unique()
# num_labels = len(unique_labels)

# tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
# encoded_batch = tokenizer(train_sentences_segment, padding=True, truncation=True, return_tensors="pt")
# input_ids = encoded_batch['input_ids']
# attention_mask_train = encoded_batch['attention_mask']

# train_data = TensorDataset(train_inputs, train_masks, train_labels)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

In [20]:
# class BertBiLSTMCRF(nn.Module):
#     def __init__(self, bert_model, num_labels, lstm_hidden_dim):
#         super(BertBiLSTMCRF, self).__init__()
#         self.bert = BertModel.from_pretrained(bert_model)
#         self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
#                             hidden_size=lstm_hidden_dim,
#                             num_layers=1,
#                             bidirectional=True,
#                             batch_first=True)
#         self.dropout = nn.Dropout(0.1)
#         self.classifier = nn.Linear(lstm_hidden_dim * 2, num_labels)
#         self.crf = CRF(num_labels, batch_first=True)

#     def forward(self, input_ids, attention_mask=None, labels=None):
#         outputs = self.bert(input_ids, attention_mask=attention_mask)
#         sequence_output = outputs[0]
#         lstm_output, _ = self.lstm(sequence_output)
#         lstm_output = self.dropout(lstm_output)
#         logits = self.classifier(lstm_output)
#         if labels is not None:
#             loss = -self.crf(logits, labels, mask=attention_mask.byte())
#             return loss, logits
#         else:
#             return logits

#     def predict(self, input_ids, attention_mask=None):
#         with torch.no_grad():
#             logits = self.forward(input_ids, attention_mask)
#             return self.crf.decode(logits, mask=attention_mask.byte())
        
        
class BertBiLSTMCRF(nn.Module):
    def __init__(self, bert_model, num_labels, lstm_hidden_dim, lstm_layers=1):
        super(BertBiLSTMCRF, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
                            hidden_size=lstm_hidden_dim,
                            num_layers=lstm_layers,
                            bidirectional=True,
                            batch_first=True)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(lstm_hidden_dim * 2, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = self.dropout(lstm_output)
        logits = self.classifier(lstm_output)
        if labels is not None:
            loss = -self.crf(logits, labels, mask=attention_mask.byte())  # Calculate loss using CRF
            return loss, logits
        else:
            return logits

    def predict(self, input_ids, attention_mask=None):
        with torch.no_grad():
            logits = self.forward(input_ids, attention_mask)
            return self.crf.decode(logits, mask=attention_mask.byte())

In [21]:
def train_model(model, train_dataloader, validation_dataloader, device, epochs=4, gradient_accumulation_steps=1):
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.95)
    
    for epoch_i in range(epochs):
        print(f'======== Epoch {epoch_i + 1} / {epochs} ========')
        total_train_loss = 0
        
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            model.zero_grad()        
            loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            
            loss = loss / gradient_accumulation_steps
            loss.backward()
            
            total_train_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
            
            if step % 40 == 0 and not step == 0:
                print(f'  Batch {step:>5,} of {len(train_dataloader):>5,}. Loss: {loss.item():.2f}')
        
        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"  Average training loss: {avg_train_loss:.2f}")

        print("\nRunning Validation...")
        total_eval_accuracy = 0
        total_eval_loss = 0
        model.eval()

        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            with torch.no_grad():
                loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            
            total_eval_loss += loss.item()
            
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)
        
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print(f"  Validation accuracy: {avg_val_accuracy:.2f}")
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        print(f"  Validation Loss: {avg_val_loss:.2f}")

    print("done")

In [28]:
num_labels = len(set(train_sentences_segment))
# num_labels = len(unique_labels)
# num_labels=5
lstm_hidden_dim = 64
bert_model = 'dbmdz/bert-base-turkish-cased'
epochs = 1
batch_size = 1
gradient_accumulation_steps = 5

# model = BertBiLSTMCRF(bert_model, num_labels, lstm_hidden_dim)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.cuda.empty_cache()

model = BertBiLSTMCRF(bert_model, num_labels, lstm_hidden_dim)
model.bert = model.bert.to(device)
# Do some operations if necessary
model.lstm = model.lstm.to(device)
model.classifier = model.classifier.to(device)
model.crf = model.crf.to(device)

# model.to(device)


train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

In [29]:
train_model(model, train_loader, validation_loader, device, epochs, gradient_accumulation_steps)



  score = torch.where(mask[i].unsqueeze(1), next_score, score)
