In [None]:
!pip install simpletransformers

In [None]:
# -*- coding: utf-8 -*-
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from transformers import logging, BertTokenizer
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
import os
import time
from sklearn.model_selection import StratifiedKFold

In [None]:
logging.set_verbosity_error()
# set GPU
torch.device("cuda")
use_cuda = torch.cuda.is_available()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
%matplotlib inline

In [None]:
# Standalone tokenizer for the uncased Turkish BERT checkpoint (input is lower-cased).
# `use_cuda` is not a tokenizer option, so it is no longer passed.
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', do_lower_case=True)

In [None]:
# Load the dataset and keep only the two columns used below, in this order:
# the raw text and its class label.
df = pd.read_csv('/kaggle/input/teknofest2023/obs_clean_data.csv').reindex(columns=['text', 'target'])

In [None]:
# Fit the encoder on the label column, then replace the labels with their
# integer ids. `le` is kept so ids can be mapped back via `le.classes_`.
le = LabelEncoder().fit(df["target"])
df["target"] = le.transform(df["target"])

In [None]:
# Sanity check: distinct encoded label ids first, then the first rows of the frame.
print(df["target"].unique(), df.head(), sep="\n")

In [None]:
model = ClassificationModel('bert', 'dbmdz/bert-base-turkish-128k-uncased',
                        num_labels=5,
                        args={'reprocess_input_data': True,
                        'overwrite_output_dir': True,
                        'num_train_epochs': 3,
                        "output_dir": "bert_model",
                        "warmup_steps": 100,
                        "weight_decay":0,
                        "load_best_model_at_end": True,
                        "use_early_stopping": True,
                        "early_stopping_patience": 5,
                        "early_stopping_delta": 0.001,
                        "max_length": 80,
                        "tokenizer_name": tokenizer,
                        "manual_seed": 42},
                        use_cuda=torch.cuda.is_available())

In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#kf = KFold(n_splits=2)
results = []
df["target"] = df["target"].astype(int)
model_outputs = None
val_df = None
for train_index, val_index in kf.split(df['text'], df['target']):
    try:
        # splitting Dataframe (dataset not included)
        train_df = df.iloc[train_index]
        val_df = df.iloc[val_index]
        # train and validate the model
        print(val_df.head())
        model.train_model(train_df)
        time.sleep(5)

        result, model_outputs, wrong_predictions = model.eval_model(val_df, acc=accuracy_score)
        print(f'Acc : {result["acc"]}')
        # append model score
        results.append(result["acc"])
    except Exception as ex:
        print('hata: ', ex)


In [None]:
# Per-fold accuracies collected by the cross-validation loop.
print("results", results)
# The values are accuracy scores, so label them as such, and avoid dividing by
# zero when every fold failed and the list is empty.
if results:
    print(f"Mean-Accuracy: {sum(results) / len(results)}")
else:
    print("Mean-Accuracy: n/a (no fold completed successfully)")

In [None]:
# Both values stay None when no cross-validation fold finished evaluation.
if model_outputs is None or val_df is None:
    raise RuntimeError("No fold finished evaluation; there are no predictions to analyse.")
# Predicted class id = index of the largest model output in each row.
predictions = model_outputs.argmax(axis=1)
# Ground-truth ids live in the 'target' column; the frame has no 'label' column.
actuals = val_df["target"].values

In [None]:
print(classification_report(actuals, predictions, digits=3))
print(balanced_accuracy_score(actuals, predictions))
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(actuals, predictions, labels=[0, 1, 2, 3, 4])
print(cm)
# A 5x5 matrix cannot be unpacked into four scalars (that only works for a
# flattened binary matrix). Compute one-vs-rest counts per class instead; each
# name below is a length-5 array indexed by class id.
tp = cm.diagonal()
fp = cm.sum(axis=0) - tp  # predicted as the class, but actually another class
fn = cm.sum(axis=1) - tp  # actually the class, but predicted as another class
tn = cm.sum() - (tp + fp + fn)
print((tn, fp, fn, tp))

In [None]:
# Draw the confusion matrix computed above, with both axes labelled by class id 0-4.
disp = ConfusionMatrixDisplay(cm, display_labels=list(range(5)))
disp.plot()

In [None]:
"""
INSULT    --> 0
OTHER     --> 1
PROFANITY --> 2
RACIST    --> 3
SEXIST    --> 4
"""