In [None]:
!pip install transformers -q
!pip install request -q

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig, Adafactor
from transformers import get_linear_schedule_with_warmup
import datetime
import random
import seaborn as sns
import numpy as np
import time
import matplotlib.pyplot as plt
import requests

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
import torch
# Release cached GPU memory held by PyTorch before the model is loaded further down.
torch.cuda.empty_cache()

In [None]:
# Hugging Face checkpoint identifier used for both the tokenizer and the classifier.
model_name = 'dbmdz/bert-base-turkish-128k-uncased'

In [None]:
# Load the tokenizer that belongs to `model_name`, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)



In [None]:
# Alternative, already-preprocessed inputs kept for reference:
#df = pd.read_csv('/content/obs_clean_data_not_turkish_char.csv')
#df = pd.read_csv('/content/obs_clean_data_turkish_char.csv')
#df = pd.read_csv('/content/obs_clean_data.csv')

# Load the pipe-separated training file and store the offensive flag as an integer.
df = pd.read_csv('/kaggle/input/train/teknofest_train_final.csv', sep='|')
df['is_offensive'] = df['is_offensive'].astype(int)

# A bare `df.head(10)` in the middle of a cell is evaluated and thrown away,
# so render it explicitly; `df.info()` prints its own summary.
display(df.head(10))
df.info()

In [None]:
# Separate out the labels: keep only the text and target columns, then encode
# the target values as integer class ids.
df = pd.DataFrame(df, columns=['text', 'target'])
le = LabelEncoder()
df.target = le.fit_transform(df.target)
# `df.info()` prints directly and returns None; wrapping it in print() emitted
# an extra "None" line.
df.info()
print(df.target.unique())
print(df.head())

In [None]:
# Send every text to the remote preprocessing endpoint and replace the column
# with the cleaned result.
# NOTE(review): this posts the whole dataset to an external host — confirm that
# is acceptable for this data.
url = "https://cryptic-oasis-68424.herokuapp.com/preprocess?tr_chars=false&acc_marks=true&punct=true&lower=true&offensive=false&norm_numbers=true&remove_numbers=false&remove_spaces=true&remove_stopwords=false&min_len=4"
texts = df.text.values.tolist()
# A timeout keeps the cell from hanging forever if the service is unreachable.
preprocess_response = requests.post(url, json={"texts": texts}, timeout=300)
# Fail loudly on an HTTP error instead of a confusing KeyError on 'result' below.
preprocess_response.raise_for_status()
processed_text = preprocess_response.json()['result']
if len(processed_text) != len(texts):
    raise ValueError(
        f"Preprocessing service returned {len(processed_text)} texts for {len(texts)} inputs"
    )
df.text = processed_text
# Number of rows whose text came back empty (summing the boolean mask counts them;
# summing the filtered frame, as before, concatenated strings and added up labels).
print((df.text == '').sum())
# .copy() makes the filtered frame independent so later column assignments do
# not trigger SettingWithCopyWarning.
df = df[df['text'] != ''].copy()
print(df.head())
print((df.text == '').sum())

In [None]:
# check GPU
# Ask PyTorch itself, since PyTorch is the framework that runs the model below.
# Querying TensorFlow only proves TensorFlow can see a GPU and may additionally
# reserve GPU memory for TensorFlow.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    raise SystemError('GPU device not found')

In [None]:
# Number of rows per target class.
df.groupby('target').size()

In [None]:
df['text'].size # total number of rows in the text column

In [None]:

# Preview the first 8 rows.
df.head(8)

In [None]:
# Re-fits a fresh LabelEncoder on the target column.
# NOTE(review): `target` was already encoded with `le` in an earlier cell, so this
# looks redundant — confirm it is still needed.
df['target'] = LabelEncoder().fit_transform(df['target'])

In [None]:
# Preview the first 8 rows after the target column was re-encoded.
df.head(8)

In [None]:
"""
INSULT    --> 0
OTHER     --> 1
PROFANITY --> 2
RACIST    --> 3
SEXIST    --> 4
"""

In [None]:
# Stratified 80/20 split: sample 80% of the rows of every target class for
# training and keep the remaining rows for testing.
# - GroupBy.sample keeps the original row index (no MultiIndex from apply) and
#   takes a random_state, so the split is reproducible.
# - The test set is selected by index. The previous concat + drop_duplicates
#   compared rows by value and therefore silently discarded any row whose
#   (text, target) pair occurred more than once in `df`.
# Relies on `df` having a unique index.
training = df.groupby('target').sample(frac=0.8, random_state=42)
test = df.drop(training.index)

In [None]:
# Displays the training split with a fresh 0..n-1 index; the result is not
# assigned, so `training` itself keeps its current index.
training.reset_index(drop=True)

In [None]:
# Displays the test split with a fresh 0..n-1 index; the result is not
# assigned, so `test` itself keeps its current index.
test.reset_index(drop=True)

In [None]:
# Report how many rows ended up in each split.
print("Training: ", len(training))
print("Test: ", len(test))

In [None]:
# Pull the texts and the encoded labels of the training split out as arrays.
training_texts = training.text.values
training_labels = training.target.values

In [None]:
# Inspect the training label array.
training_labels

In [None]:
# Inspect the training text array.
training_texts

In [None]:
# Maximum number of tokens per example (special tokens included).
max_len = 32

# Tokenize the whole training split in one call.
# - padding='max_length' replaces the deprecated pad_to_max_length=True.
# - truncation=True makes the cut at max_len explicit instead of relying on the
#   tokenizer's backward-compatibility fallback (which also logs a warning).
# - str(text) guards against non-string cells.
# With return_tensors='pt' the result already holds (num_examples, max_len)
# tensors, so no per-row torch.cat is needed.
encoded_training = tokenizer(
    [str(text) for text in training_texts],
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_training['input_ids']
attention_masks = encoded_training['attention_mask']
labels = torch.tensor(training_labels)

print('Original: ', training_texts[0])
print('Token IDs:', input_ids[0])

In [None]:
# Bundle token ids, attention masks and labels so each item is one training example.
train_dataset = TensorDataset(input_ids, attention_masks, labels)

In [None]:
# Number of examples per batch; also reused by the prediction loader.
batch_size = 32

# Draw the training batches in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

In [None]:
# Number of distinct target classes; sizes the classification head below.
num_of_cat = len(df['target'].unique())

In [None]:
# Display the class count.
num_of_cat

In [None]:
# Load the pretrained checkpoint as a sequence classifier with one output per
# target class; attention weights and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels = num_of_cat, 
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the GPU.
model.cuda()

In [None]:
epochs = 8  # in my experiments the loss gets close to 0 around the 8th epoch

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated in
# favour of it. weight_decay=0.0 keeps the default of the transformers class
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-7,
    weight_decay=0.0,
)

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * epochs
# Linear decay of the learning rate to 0 over total_steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
seed_val = 42

# Seed every random number generator used from here on.
# NOTE(review): the model is created in an earlier cell, before these seeds are
# set, so anything initialised randomly there is not covered by them.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict per epoch: epoch number, average loss and wall-clock time.
training_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress line every 10 batches (the very first batch is skipped).
        if step % 10 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (token ids, attention mask, labels); move all of it to the GPU.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()
        # Passing the labels makes the model return the loss alongside the logits.
        output = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels)
        loss = output['loss']
        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("Average training loss: {0:.2f}".format(avg_train_loss))
    print("Training epoch took: {:}".format(training_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )

print("Training completed in {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
df_stats = pd.DataFrame(data=training_stats)
# Plot against the recorded 1-based epoch number so the "Epoch" axis matches the
# epochs printed during training (the row index starts at 0), and derive the
# ticks from the data instead of hard-coding 0..8.
plt.plot(df_stats['epoch'], df_stats['Training Loss'], label="Training")
plt.title("Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.xticks(df_stats['epoch'])
plt.show()

In [None]:
test_texts = test.text.values
test_labels = test.target.values

# Tokenize the test split exactly like the training split:
# - str(text) mirrors the training cell and guards against non-string cells;
# - padding='max_length' replaces the deprecated pad_to_max_length=True;
# - truncation=True makes the cut at max_len explicit.
encoded_test = tokenizer(
    [str(text) for text in test_texts],
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_test['input_ids']
attention_masks = encoded_test['attention_mask']
labels = torch.tensor(test_labels)


# Sequential sampling keeps predictions in the same order as `test_labels`.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
print('Prediction started on test data')
# Put the model into evaluation mode before running inference.
model.eval()
predictions, true_labels = [], []

for batch in prediction_dataloader:
    # Each batch is (token ids, attention mask, labels); move all three to the GPU.
    b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # outputs[0] holds the logits; store them and the gold labels as NumPy arrays on the CPU.
    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

print('Prediction completed')

In [None]:
# Turn each batch of logits into predicted class ids (index of the largest logit per row).
prediction_set = [
    np.argmax(predictions[batch_idx], axis=1).flatten()
    for batch_idx in range(len(true_labels))
]

# Flatten the per-batch arrays into one list aligned with `test_labels`.
prediction_scores = []
for batch_predictions in prediction_set:
    prediction_scores.extend(batch_predictions)

In [None]:
# Score the predictions against the test labels: macro-averaged F1, precision
# and recall, plus plain accuracy.
f_score = f1_score(test_labels, prediction_scores, average='macro')
precision = precision_score(test_labels, prediction_scores, average='macro')
recall = recall_score(test_labels, prediction_scores, average='macro')
accr = accuracy_score(test_labels, prediction_scores)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

print("F-Score Macro: ", f_score)
print("Recall Macro: ", recall)
print("Precision Macro: ", precision)
print("Accuracy: ", accr)
# Display names in encoded-label order (0..4).
class_names =['INSULT','OTHER','PROFANITY','RACIST','SEXIST']
# Pin the label set explicitly so the matrix always has one row/column per class.
# Otherwise a class missing from both the test labels and the predictions shrinks
# the matrix and no longer matches `display_labels`.
cm = confusion_matrix(test_labels, prediction_scores,
                      labels=list(range(len(class_names))))
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_names)
disp.plot()

In [None]:
# Per-class precision/recall/F1 report as a DataFrame (one column per report entry).
report = pd.DataFrame(classification_report(test_labels, prediction_scores, output_dict=True))

In [None]:
# Replace the numeric class-id column headers ('0'..'4') with readable class names.
label_names = ['INSULT', 'OTHER', 'PROFANITY', 'RACIST', 'SEXIST']
column_names = {str(class_id): name for class_id, name in enumerate(label_names)}
report = report.rename(columns=column_names)
report

In [None]:
from transformers import BertTokenizer,BertTokenizerFast, TFBertForSequenceClassification, BertConfig, TFBertModel
# NOTE(review): from here on `model` and `tokenizer` are rebound to artifacts
# loaded from Google Drive paths, replacing the PyTorch model and tokenizer used
# above. No call that saves the fine-tuned model is visible in this notebook —
# confirm these paths hold the intended checkpoint.
model_path = "/content/drive/MyDrive/Nane&Limon/2023 DDI Yarışma dokümantasyonu/bigscience_t0_model"
tokenizer_path = "/content/drive/MyDrive/Nane&Limon/2023 DDI Yarışma dokümantasyonu/bigscience_t0_tokenizer"
# from_pt=True loads PyTorch weights into the TensorFlow model class.
model = TFBertForSequenceClassification.from_pretrained(model_path, from_pt=True) # modify labels as needed.
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)


In [None]:
from transformers import TextClassificationPipeline

# Hand-written sample sentences for a quick qualitative check of the classifier.
text = ["Selam herkese bugün güzel bir gün",
        "Aptal zihniyetinizde bir Yunan yatıyor",
        "Akşam halısahaya giderken karısından izin alanda kendine erkeğim demesin!",
        "kör olası çöpçüler aşkımı süpürmüşler",
        "sınıfımdaki deve hörgüçleri",
       "bugün de ölmedik",
       'seninle iyi anlaştık',
       'seni sevmek umitli sey ama artik umit yetmiyor bana',
       'Selam sen hariç piç',
       'bana bak kadın']

# Wrap the model and tokenizer loaded above in a text-classification pipeline.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [None]:
# Print every sample sentence next to its predicted label. A plain loop replaces
# the list comprehension that was used only for its side effects and made the
# cell echo a list of None values.
for sentence, prediction in zip(text, pipe(text)):
    print(f"{sentence} - {prediction['label']}")

In [None]:
"""
INSULT    --> 0
OTHER     --> 1
PROFANITY --> 2
RACIST    --> 3
SEXIST    --> 4
"""