In [1]:
!pip install pytorch-crf
!pip install unidecode



In [2]:
pip install --upgrade transformers

Requirement already up-to-date: transformers in /opt/conda/lib/python3.6/site-packages (4.18.0)
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install --upgrade torch

Requirement already up-to-date: torch in /opt/conda/lib/python3.6/site-packages (1.10.2)
Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np
import pandas as pd
import os
import string
from unidecode import unidecode
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import AdamW, BertModel, AutoConfig, AutoTokenizer
from torch.optim import Adam
import torch.nn as nn
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
import time;
import datetime
from sklearn.model_selection import train_test_split
from torchcrf import CRF

In [5]:
# Pick CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected.
if device.type == "cuda":
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

We will use the GPU: Tesla P100-PCIE-16GB


In [6]:
# Training split; the first CSV column is used as the index.
nlp_train = pd.read_csv(
    "/kaggle/input/nlp-project-train/train.csv",
    index_col=[0],
)
# Test split; same layout, decoded as windows-1252.
nlp_test = pd.read_csv(
    "/kaggle/input/nlp-project-train/test.csv",
    index_col=[0],
    encoding="windows-1252",
)

# nlp_train=pd.read_csv("train.csv", index_col=[0])
# nlp_test=pd.read_csv("test.csv",index_col=[0],encoding="windows-1252") 

In [7]:
def convert_to_ascii(sentence):
    """Return `sentence` after passing it through `unidecode`."""
    return unidecode(sentence)

def remove_punctuations(text):
    """Delete ASCII punctuation from `text` and tidy the spacing.

    Every character listed in ``string.punctuation`` is removed. Runs of two
    or more spaces (including runs that only appear once punctuation between
    spaces has been removed) are collapsed to a single space, and
    leading/trailing whitespace is stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # One pass over the text instead of one `replace` per punctuation mark.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse until stable: a single `replace` only halves a run of spaces,
    # so e.g. "a ~ ~ b" would otherwise keep a double space.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text.strip()

def split_sentences(sentences, max_length=100):
    """Break long sentences into chunks of at most `max_length` characters.

    A sentence longer than `max_length` is cut repeatedly. Each cut is placed
    right after the last '.', ',', ';' or ':' that still fits in the chunk;
    failing that, at the last space that fits; failing that, after exactly
    `max_length` characters. Chunks are whitespace-stripped and chunks that
    are empty after stripping are dropped.

    Args:
        sentences: iterable of strings.
        max_length: maximum number of characters per chunk (must be >= 1).

    Returns:
        np.ndarray containing every chunk, in input order.

    Raises:
        ValueError: if `max_length` is smaller than 1 (no cut could make
            progress).
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    punctuations = ('.', ',', ';', ':')
    results = []

    for sentence in sentences:
        while len(sentence) > max_length:
            # `cut` is the chunk length. Searching only the first `max_length`
            # characters keeps the punctuation mark inside the chunk without
            # letting the chunk grow to max_length + 1.
            cut = max(sentence.rfind(mark, 0, max_length) for mark in punctuations) + 1

            # No punctuation fits: fall back to the last space that fits.
            if cut == 0:
                cut = sentence.rfind(' ', 0, max_length) + 1

            # No space either: hard cut at the limit.
            if cut == 0:
                cut = max_length

            chunk = sentence[:cut].strip()
            # A cut on a leading space yields an empty chunk; skip it.
            if chunk:
                results.append(chunk)
            sentence = sentence[cut:].strip()

        # Whatever is left already fits in one chunk.
        if sentence:
            results.append(sentence)

    return np.array(results)


In [8]:
# Sentence column of the training frame as an array.
raw_sentences = nlp_train["Sentence"].values
# Split with max_length=200, then strip punctuation and lowercase every chunk.
sentences = [
    remove_punctuations(chunk).lower()
    for chunk in split_sentences(raw_sentences, max_length=200)
]
# ASCII-converted counterpart of each cleaned chunk.
asci_sentences = list(map(convert_to_ascii, sentences))

In [9]:
def encode_word(sentence):
    """Encode `sentence` as a same-length mask of '1' and '2' characters.

    Position i of the result is '2' when ``sentence[i]`` is one of the
    diacritic letters ı, ö, ü, ğ, ş, ç and '1' for any other character
    (spaces and digits included).

    Each character is encoded independently of the others, so digits that
    occur in the input can never overwrite markers that were already
    produced for earlier characters.

    Args:
        sentence: the string to encode.

    Returns:
        A string of '1'/'2' with ``len(result) == len(sentence)``.
    """
    diacritic_chars = "ıöüğşç"
    return ''.join('2' if char in diacritic_chars else '1' for char in sentence)

In [10]:
# One '1'/'2' mask per cleaned sentence, in the same order as `sentences`.
sentences_diacritics = [encode_word(cleaned) for cleaned in sentences]

In [11]:
# Display the encoded mask of the first sentence as this cell's output.
sentences_diacritics[0]

'121211111111111122111111112112211111211111211121111111121111111211212111111111111111112111112111'

In [12]:
def align_tokens_with_diacritics(texts, diacritics, tokenizer, label_dict):
    """Tokenize `texts` and derive one numeric label per token.

    Each token is labelled from the diacritic mask character at the token's
    first character offset; tokens with an empty offset span (start == end,
    i.e. special/padding tokens) get ``label_dict['O']``.

    Args:
        texts: list of strings to tokenize.
        diacritics: per-text mask strings, indexed with the same character
            offsets as `texts` (assumes each mask is as long as its text --
            TODO confirm for inputs whose ASCII conversion changes length).
        tokenizer: callable invoked with ``padding=True, truncation=True,
            return_offsets_mapping=True, return_tensors="pt"``; its result
            must provide 'input_ids', 'attention_mask', 'token_type_ids'
            and 'offset_mapping'.
        label_dict: maps 'O' and every mask character to an integer label.

    Returns:
        ``(input_ids, attention_masks, token_type_ids, labels)``; the first
        three are taken unchanged from the tokenizer output and `labels` is
        a list of lists of ints with one entry per token.
    """
    tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_offsets_mapping=True, return_tensors="pt")
    input_ids = tokenized_texts['input_ids']
    attention_masks = tokenized_texts['attention_mask']
    token_type_ids = tokenized_texts['token_type_ids']
    # Convert once to nested Python ints so the loop below compares and
    # slices with plain integers instead of 0-dim tensors.
    offset_mappings = tokenized_texts['offset_mapping'].tolist()

    other_label = label_dict['O']
    labels = []
    for row_index, offsets in enumerate(offset_mappings):
        row_labels = []
        for start, end in offsets:
            if start == end:
                # Empty span: special or padding token.
                row_labels.append(other_label)
                continue
            span = diacritics[row_index][start:end]
            # Label the token by the mask of its first character; an empty
            # slice (mask shorter than the text) falls back to 'O'.
            row_labels.append(label_dict[span[0]] if span else other_label)
        labels.append(row_labels)

    return input_ids, attention_masks, token_type_ids, labels


In [13]:
# Label ids: 'O' for tokens without a character span, '1'/'2' for the two mask values.
label_dict = {'O': 0, '1': 1, '2': 2}
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

# Tokenize the ASCII sentences and label every token from the diacritic masks.
input_ids, attention_masks, token_type_ids, labels = align_tokens_with_diacritics(
    texts=asci_sentences,
    diacritics=sentences_diacritics,
    tokenizer=tokenizer,
    label_dict=label_dict,
)

In [14]:
# Stack the per-token label lists into a single LongTensor.
label_tensors = torch.tensor(labels, dtype=torch.long)

# Bundle inputs and labels so each dataset item is
# (input_ids, attention_mask, token_type_ids, labels).
dataset = TensorDataset(input_ids, attention_masks, token_type_ids, label_tensors)

# Shuffled mini-batches of 16. DataLoader comes from the notebook's import
# cell, so it is not re-imported here.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [15]:
from transformers import BertForTokenClassification, AdamW

# Size the tag space from the label vocabulary rather than from the labels
# that happen to occur in the data: if one id were absent from `labels`, the
# count of distinct values would be smaller than the largest label id + 1.
num_labels = max(label_dict.values()) + 1
"""
model = BertForTokenClassification.from_pretrained(
    "dbmdz/bert-base-turkish-cased",
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False,
)

# Move model to GPU if available
model.to(device)

# Setting up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)
"""

'\nmodel = BertForTokenClassification.from_pretrained(\n    "dbmdz/bert-base-turkish-cased",\n    num_labels=num_labels,\n    output_attentions=False,\n    output_hidden_states=False,\n)\n\n# Move model to GPU if available\nmodel.to(device)\n\n# Setting up the optimizer\noptimizer = AdamW(model.parameters(), lr=2e-5)\n'

In [16]:
from tqdm import tqdm
"""
epochs = 1

model.train()
for epoch in range(epochs):
    total_loss = 0
    for step, batch in enumerate(tqdm(dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_token_types, b_labels = batch
        model.zero_grad()
        
        outputs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch}, Loss: {total_loss / len(dataloader)}")
    
"""


'\nepochs = 1\n\nmodel.train()\nfor epoch in range(epochs):\n    total_loss = 0\n    for step, batch in enumerate(tqdm(dataloader)):\n        batch = tuple(t.to(device) for t in batch)\n        b_input_ids, b_input_mask, b_token_types, b_labels = batch\n        model.zero_grad()\n        \n        outputs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask, labels=b_labels)\n        loss = outputs[0]\n        total_loss += loss.item()\n        loss.backward()\n        optimizer.step()\n\n    print(f"Epoch {epoch}, Loss: {total_loss / len(dataloader)}")\n    \n'

In [17]:
# Save the model's state dictionary
# torch.save(model.state_dict(), 'model_state_dict.pth')

# Optionally, save the entire model
# torch.save(model, 'full_model.pth')


In [18]:
"""
# Input text
input_text = "sinif havuz ve acik deniz calismalariyla tum dunyada"

# Tokenize the input
test_inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
test_inputs = {k: v.to(device) for k, v in test_inputs.items()}

# Predict
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(**test_inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Get tokens for reference
tokens = tokenizer.convert_ids_to_tokens(test_inputs['input_ids'][0])

# Define diacritics mapping
diacritics_mapping = {
    'c': {1: 'ç'},
    'g': {1: 'ğ'},
    'i': {1: 'ı'},
    'o': {1: 'ö'},
    's': {1: 'ş'},
    'u': {1: 'ü'}
}

# Function to apply diacritics based on predictions
def apply_diacritics(tokens, predictions, mapping):
    diacritized_text = []
    skip_tokens = ['[CLS]', '[SEP]', '[PAD]']  # Tokens to skip when applying diacritics

    # Accumulator for reassembling subword tokens
    current_word = ""
    for token, pred in zip(tokens, predictions[0]):
        if token in skip_tokens:
            continue

        # Check if the token is a continuation of the previous one
        if token.startswith("##"):
            current_word += token[2:]  # Append without "##" and without adding space
        else:
            # If there's accumulated word data from previous tokens, append it first
            if current_word:
                diacritized_text.append(current_word)
                current_word = ""
            # Start a new word accumulation
            current_word = token

        # Apply diacritics to the current part of the word if applicable
        if current_word in mapping and pred.item() in mapping[current_word]:
            # Replace the whole word with its diacritized version
            current_word = mapping[current_word][pred.item()]

    # Append the last accumulated word if any
    if current_word:
        diacritized_text.append(current_word)

    return " ".join(diacritized_text)

# Apply diacritics
diacritized_output = apply_diacritics(tokens, predictions, diacritics_mapping)
print(predictions)
print("Original:", input_text)
print("Diacritized:", diacritized_output)

"""

'\n# Input text\ninput_text = "sinif havuz ve acik deniz calismalariyla tum dunyada"\n\n# Tokenize the input\ntest_inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)\ntest_inputs = {k: v.to(device) for k, v in test_inputs.items()}\n\n# Predict\nmodel.eval()  # Set the model to evaluation mode\nwith torch.no_grad():\n    outputs = model(**test_inputs)\n    predictions = torch.argmax(outputs.logits, dim=-1)\n\n# Get tokens for reference\ntokens = tokenizer.convert_ids_to_tokens(test_inputs[\'input_ids\'][0])\n\n# Define diacritics mapping\ndiacritics_mapping = {\n    \'c\': {1: \'ç\'},\n    \'g\': {1: \'ğ\'},\n    \'i\': {1: \'ı\'},\n    \'o\': {1: \'ö\'},\n    \'s\': {1: \'ş\'},\n    \'u\': {1: \'ü\'}\n}\n\n# Function to apply diacritics based on predictions\ndef apply_diacritics(tokens, predictions, mapping):\n    diacritized_text = []\n    skip_tokens = [\'[CLS]\', \'[SEP]\', \'[PAD]\']  # Tokens to skip when applying diacritics\n\n    # Accumulator for

In [21]:
class BertBiLSTMCRF(nn.Module):
    """Sequence tagger: BERT encoder -> BiLSTM -> linear emissions -> CRF.

    Args:
        bert_model_name: name or path handed to ``BertModel.from_pretrained``.
        num_tags: number of tag classes scored by the linear layer and CRF.
        lstm_hidden_size: hidden size of each LSTM direction (default 256,
            giving 512 features into the linear layer).
    """

    def __init__(self, bert_model_name, num_tags, lstm_hidden_size=256):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(
            # Follow the encoder's width instead of assuming a 768-wide model.
            input_size=self.bert.config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        # Bidirectional LSTM concatenates both directions.
        self.fc = nn.Linear(2 * lstm_hidden_size, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when `labels` is given, else decoded tags.

        Args:
            input_ids: token id tensor passed to BERT.
            attention_mask: mask passed to BERT and, as booleans, to the CRF.
            labels: optional tag tensor; when provided the negated CRF
                output is returned as the loss.

        Returns:
            The negated result of ``self.crf(...)`` if `labels` is not None,
            otherwise the result of ``self.crf.decode(...)``.
        """
        bert_output = self.bert(input_ids, attention_mask=attention_mask)[0]
        lstm_output, _ = self.lstm(bert_output)
        emissions = self.fc(lstm_output)

        # Boolean mask: avoids the uint8-condition deprecation path that a
        # `.byte()` mask triggers inside torch.where.
        mask = attention_mask.bool()

        if labels is not None:
            return -self.crf(emissions, labels, mask=mask)
        return self.crf.decode(emissions, mask=mask)

In [30]:
"""
# Initialize the model and move it to the appropriate device
model = BertBiLSTMCRF('dbmdz/bert-base-turkish-cased', num_tags=num_labels)
model = model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 1
# Setup the learning rate scheduler
# total_steps = len(dataloader) * num_epochs  # Adjust according to your actual dataloader and num_epochs
total_steps = 10
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Training Loop
for epoch in range(num_epochs):
    model.train()  # Put the model into training mode
    total_loss = 0

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)  # Move batch to the correct device
        input_ids, attention_mask, labels = batch[0], batch[1], batch[3]  # Adjust indexing if necessary

        model.zero_grad()  # Clear existing gradients
        
        # Forward pass to get loss
        loss = model(input_ids, attention_mask, labels)
        print(loss.item())
        
        # Backward pass to calculate the gradients
        loss.backward()
        
        # Track total loss for logging
        total_loss += loss.item()
        
        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update model parameters
        optimizer.step()
        scheduler.step()  # Update the learning rate

    # Output the average loss for the epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")
"""

'\n# Initialize the model and move it to the appropriate device\nmodel = BertBiLSTMCRF(\'dbmdz/bert-base-turkish-cased\', num_tags=num_labels)\nmodel = model.to(device)\n\n# Define the optimizer\noptimizer = AdamW(model.parameters(), lr=5e-5)\nnum_epochs = 1\n# Setup the learning rate scheduler\n# total_steps = len(dataloader) * num_epochs  # Adjust according to your actual dataloader and num_epochs\ntotal_steps = 10\nscheduler = get_linear_schedule_with_warmup(optimizer,\n                                            num_warmup_steps=0,\n                                            num_training_steps=total_steps)\n\n# Training Loop\nfor epoch in range(num_epochs):\n    model.train()  # Put the model into training mode\n    total_loss = 0\n\n    for batch in dataloader:\n        batch = tuple(t.to(device) for t in batch)  # Move batch to the correct device\n        input_ids, attention_mask, labels = batch[0], batch[1], batch[3]  # Adjust indexing if necessary\n\n        model.zero_grad()

In [28]:
# Sample sentence written without diacritics.
input_text = "sinif havuz ve acik deniz calismalariyla tum dunyada"

# Encode it and keep only the two tensors handed to the model, moved to `device`.
test_inputs = {
    key: tensor.to(device)
    for key, tensor in tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).items()
    if key in ('input_ids', 'attention_mask')
}

# NOTE(review): `model` must already exist in the kernel; the only code in this
# notebook that builds and trains it is currently wrapped in a string literal.
# Evaluation mode, no gradient tracking. Called without labels, the model
# returns the decoded tag sequences directly, so no argmax is needed.
model.eval()
with torch.no_grad():
    predictions = model(
        input_ids=test_inputs['input_ids'],
        attention_mask=test_inputs['attention_mask'],
    )

# Token strings of the encoded sentence, for rebuilding the text afterwards.
tokens = tokenizer.convert_ids_to_tokens(test_inputs['input_ids'][0])

# Diacritic restoration table: plain letter -> {predicted label id: replacement}.
# Label id 2 is the "diacritic" class (label_dict maps '2' -> 2 and encode_word
# writes '2' for diacritic letters); label id 1 means "leave the letter as is".
diacritics_mapping = {
    'c': {2: 'ç'},
    'g': {2: 'ğ'},
    'i': {2: 'ı'},
    'o': {2: 'ö'},
    's': {2: 'ş'},
    'u': {2: 'ü'}
}

def apply_diacritics(tokens, predictions, mapping):
    """Rebuild text from word-piece tokens, restoring predicted diacritics.

    Token labels are derived from the first character of each token, so the
    prediction for a token is applied to that token's first character: when
    `mapping` has an entry for the character and the predicted label, the
    character is replaced. Pieces starting with "##" are glued onto the
    previous word; '[CLS]', '[SEP]' and '[PAD]' are ignored.

    Args:
        tokens: sequence of token strings for one sentence.
        predictions: batch of label sequences; only ``predictions[0]`` is
            used. Labels may be ints or 0-dim tensors.
        mapping: ``{plain_char: {label_id: replacement_char}}``.

    Returns:
        The reassembled words joined by single spaces.
    """
    skip_tokens = {'[CLS]', '[SEP]', '[PAD]'}
    words = []

    for token, pred in zip(tokens, predictions[0]):
        if token in skip_tokens:
            continue

        is_continuation = token.startswith("##")
        piece = token[2:] if is_continuation else token

        # Look up by the piece's first character, not by the whole word:
        # a multi-character piece can never equal a single-letter key.
        replacements = mapping.get(piece[:1])
        label = int(pred)
        if replacements and label in replacements:
            piece = replacements[label] + piece[1:]

        if is_continuation and words:
            # Continuation pieces extend the current word without a space.
            words[-1] += piece
        elif piece:
            words.append(piece)

    return " ".join(words)

# Decode the predicted tags into text, then report the raw tags, the input and
# the decoded sentence on one line each.
diacritized_output = apply_diacritics(tokens, predictions, diacritics_mapping)
print(
    f"Predictions: {predictions}",
    f"Original: {input_text}",
    f"Diacritized: {diacritized_output}",
    sep="\n",
)


Predictions: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
Original: sinif havuz ve acik deniz calismalariyla tum dunyada
Diacritized: sinif havuz ve acik deniz calismalariyla tum dunyada
