## Import Necessary Libraries

In [1]:
import os
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

## Importing the Data

In [2]:
# Load the annotation spreadsheet and keep only the two columns used below:
# the article text ('Body') and its label ('Economic_Relationship').
df = pd.read_excel("FinalUsedAnnotations.xlsx")[['Body', 'Economic_Relationship']]

In [3]:
# Preview the loaded frame as the cell's rich output.
df

Unnamed: 0,Body,Economic_Relationship
0,\n Sosyal medya platformu TikTo...,0
1,\n Çin Dışişleri Bakanlığından ...,1
2,\n Çin Dışişleri Bakanlığı'nın ...,0
3,\n Otomotiv sektöründeki fu...,0
4,\n Çin Merkez Bankasının (PBoC)...,0
...,...,...
603,'Çelikte Çin'in geri kalacağı pazarlardaki tal...,1
604,Aksa Jeneratör ile Mitsubishi ortak şirket kur...,1
605,VakıfBank’a Çin’den 140 milyon dolar kaynakVak...,1
606,Bursa’da bisiklet coşkusu başladı‘Bursa’da Bis...,0


In [4]:
# Normalise missing markers (None / empty string) to pd.NA and drop incomplete rows.
# The two passes run in the same order as before: first the text column, then the label column.
df = (
    df
    .assign(Body=lambda frame: frame['Body'].replace([None, ''], pd.NA))
    .dropna()
    .assign(
        Economic_Relationship=lambda frame: frame['Economic_Relationship'].replace([None, ''], pd.NA)
    )
    .dropna()
)

In [5]:
# Class balance check: number of remaining rows per label value.
df.groupby('Economic_Relationship').count()

Unnamed: 0_level_0,Body
Economic_Relationship,Unnamed: 1_level_1
0,404
1,204


In [6]:
# Pull the model inputs (article bodies) and targets (labels) out as plain Python lists
labels = df['Economic_Relationship'].to_list()
texts = df['Body'].to_list()

## BERT Model

In [7]:
# Tokenizer for the cased Turkish BERT checkpoint; the same checkpoint name is
# used further down when the classification model is instantiated.
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-128k-cased')

class NewspaperDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Every item is a dict with the raw text, the token ids and attention mask
    (both padded/truncated to ``max_len`` and flattened to 1-D), and the label
    as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # Only references are stored here; tokenization happens lazily in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and bundle it with its label."""
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]

        encoded = self.tokenizer.encode_plus(
            sample_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        # flatten() collapses the tokenizer's tensors to 1-D before batching.
        item = {'text': sample_text}
        item['input_ids'] = encoded['input_ids'].flatten()
        item['attention_mask'] = encoded['attention_mask'].flatten()
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Define training and evaluation functions
def train_epoch(model, data_loader, optimizer, device, n_examples):
    """Run one optimization pass over ``data_loader`` and report accuracy and loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.logits`` and ``.loss``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to.
        n_examples: Total number of examples served by ``data_loader``; used as
            the denominator of both returned averages.

    Returns:
        ``(accuracy, avg_loss)`` where ``accuracy`` is a 0-dim double tensor
        (correct predictions / ``n_examples``) and ``avg_loss`` is a float
        holding the per-example mean loss.

    Raises:
        ValueError: If ``n_examples`` is not positive.
    """
    if n_examples <= 0:
        raise ValueError("n_examples must be a positive integer")

    model.train()
    # Tensor accumulator so `.double()` below also works when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    weighted_loss_sum = 0.0

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        preds = outputs.logits.argmax(dim=1)
        loss = outputs.loss

        correct_predictions += torch.sum(preds == labels)
        # The model's loss is a mean over the batch (Hugging Face classification
        # heads use mean-reduced cross-entropy), so weight it by the batch size;
        # dividing the plain sum of batch means by n_examples would understate
        # the loss by roughly a factor of the batch size.
        weighted_loss_sum += loss.item() * labels.size(0)

        loss.backward()
        optimizer.step()

    return correct_predictions.double() / n_examples, weighted_loss_sum / n_examples

def eval_model(model, data_loader, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.logits`` and ``.loss``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.
        n_examples: Total number of examples served by ``data_loader``; used as
            the denominator of the accuracy and the average loss.

    Returns:
        A 5-tuple ``(accuracy, avg_loss, class_report, conf_matrix, all_predictions)``:
        accuracy as a 0-dim double tensor, the per-example mean loss as a float,
        the ``classification_report`` in dict form, the confusion matrix, and the
        list of predicted class ids in the order the loader yielded them.

    Raises:
        ValueError: If ``n_examples`` is not positive.
    """
    if n_examples <= 0:
        raise ValueError("n_examples must be a positive integer")

    model.eval()
    # Tensor accumulator so `.double()` below also works when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    weighted_loss_sum = 0.0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            preds = outputs.logits.argmax(dim=1)

            correct_predictions += torch.sum(preds == labels)
            # The model's loss is a batch mean; weight it by the batch size so the
            # final division by n_examples yields a true per-example average.
            weighted_loss_sum += outputs.loss.item() * labels.size(0)

            all_predictions.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct_predictions.double() / n_examples
    avg_loss = weighted_loss_sum / n_examples
    class_report = classification_report(all_labels, all_predictions, output_dict=True)
    conf_matrix = confusion_matrix(all_labels, all_predictions)

    return accuracy, avg_loss, class_report, conf_matrix, all_predictions

In [9]:
# Run on the GPU when one is visible, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Stratified 70/30 split over row positions; texts and labels are then gathered per side
all_positions = list(range(len(texts)))
train_idx, val_idx = train_test_split(
    all_positions,
    test_size=0.3,
    random_state=2,
    stratify=labels,
)
train_texts, val_texts = ([texts[i] for i in idx] for idx in (train_idx, val_idx))
train_labels, val_labels = ([labels[i] for i in idx] for idx in (train_idx, val_idx))

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

def objective(trial):
    """Optuna objective: fine-tune the Turkish BERT classifier once and score it.

    Samples learning rate, batch size, epoch count and RNG seed from ``trial``,
    trains a fresh ``BertForSequenceClassification`` on the module-level
    ``train_texts`` / ``train_labels`` and evaluates it on ``val_texts`` /
    ``val_labels``. Per-class precision and recall, both objective values, the
    confusion matrix and the seed are stored as trial user attributes.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to record
            user attributes.

    Returns:
        ``(f1_macro, f2_class_1)``: the macro-averaged F1 over classes 0 and 1,
        and the F2 score (recall-weighted) of class 1.
    """
    # Sample hyperparameters
    learning_rate = trial.suggest_float("lr", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_epochs", 3, 6)
    # NOTE(review): the seed is searched like a hyperparameter, so the study can
    # end up selecting a lucky initialisation rather than a better configuration.
    seed = trial.suggest_int("seed", 1, 10000)

    # Seed every RNG involved in model init and batch shuffling
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # presumably only affects child processes; confirm whether it is needed.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Prepare data
    train_dataset = NewspaperDataset(train_texts, train_labels, tokenizer)
    val_dataset = NewspaperDataset(val_texts, val_labels, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model
    model = BertForSequenceClassification.from_pretrained(
        'dbmdz/bert-base-turkish-128k-cased', num_labels=2
    ).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Train
    for _ in range(num_epochs):
        train_epoch(model, train_loader, optimizer, device, len(train_dataset))

    # Evaluate. val_loader is not shuffled and drops nothing, so the predictions
    # come back in dataset order and line up one-to-one with val_labels.
    *_, val_preds = eval_model(model, val_loader, device, len(val_dataset))
    val_true = val_labels

    # Per-class precision, recall, f1. zero_division=0 keeps the score at 0
    # (instead of emitting a warning) when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        val_true, val_preds, average=None, labels=[0, 1], zero_division=0
    )
    f1_macro = float(f1.mean())
    # F-beta with beta=2 for class 1; the epsilon avoids 0/0 when both terms are 0.
    f2_class_1 = float((5 * precision[1] * recall[1]) / (4 * precision[1] + recall[1] + 1e-10))

    # Confusion matrix with a fixed label order
    conf = confusion_matrix(val_true, val_preds, labels=[0, 1])

    # Log metrics as trial attributes (read back when the best trials are printed)
    trial.set_user_attr("precision_0", float(precision[0]))
    trial.set_user_attr("recall_0", float(recall[0]))
    trial.set_user_attr("precision_1", float(precision[1]))
    trial.set_user_attr("recall_1", float(recall[1]))
    trial.set_user_attr("f1_macro", f1_macro)
    trial.set_user_attr("f2_class_1", f2_class_1)
    trial.set_user_attr("conf_matrix", conf.tolist())  # list form so it is serializable
    trial.set_user_attr("seed", seed)

    return f1_macro, f2_class_1



# Multi-objective study: maximize macro-F1 and class-1 F2 jointly
study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(objective, n_trials=50)

# Show the Pareto-optimal trials together with the metrics stored by `objective`
print("\n✅ Top Trials (F1-macro & F2-class1):")
for i, t in enumerate(study.best_trials):
    print(f"\nTrial {i}")
    print(f"F1-macro: {t.values[0]:.4f}, F2 (class 1): {t.values[1]:.4f}")
    print(f"Precision (0): {t.user_attrs['precision_0']:.4f}, Recall (0): {t.user_attrs['recall_0']:.4f}")
    print(f"Precision (1): {t.user_attrs['precision_1']:.4f}, Recall (1): {t.user_attrs['recall_1']:.4f}")
    print(f"Seed: {t.user_attrs['seed']}")
    print("Confusion Matrix:")
    print(np.array(t.user_attrs["conf_matrix"]))

# Pick the Pareto-optimal trial with the highest F1 (use `t.values[1]` to rank by F2 instead)
best_trial = max(study.best_trials, key=lambda t: t.values[0])
best_params = best_trial.params
print("\n🎯 Using best trial based on F1:")
print(f"  F1 = {best_trial.values[0]:.4f}, F2 = {best_trial.values[1]:.4f}")
print(f"  Params = {best_params}")

# Re-apply the selected trial's seed before building the final model. Without
# this the RNG state is whatever the last of the 50 trials left behind, so the
# saved model would not correspond to the trial whose metrics are reported above.
final_seed = best_params["seed"]
random.seed(final_seed)
np.random.seed(final_seed)
torch.manual_seed(final_seed)
torch.cuda.manual_seed_all(final_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Final model training with best params
# NOTE(review): the final model is trained on the training split only and is not
# re-evaluated here; the reported metrics are those of the selected trial.
final_model = BertForSequenceClassification.from_pretrained(
    'dbmdz/bert-base-turkish-128k-cased', num_labels=2
).to(device)
final_optimizer = AdamW(final_model.parameters(), lr=best_params["lr"])

train_dataset = NewspaperDataset(train_texts, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=best_params["batch_size"], shuffle=True)

for epoch in range(best_params["num_epochs"]):
    train_epoch(final_model, train_loader, final_optimizer, device, len(train_dataset))

torch.save(final_model.state_dict(), 'bert_model_optimized_dual.pth')
print("\n✅ Final model saved as 'bert_model_optimized_dual.pth'")

[I 2025-04-15 23:06:37,424] A new study created in memory with name: no-name-89e80728-8737-4422-b8c8-fbe6576ca15d
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-15 23:07:16,355] Trial 0 finished with values: [0.9204055003512999, 0.8986928104373212] and parameters: {'lr': 2.1711698213328696e-05, 'batch_size': 8, 'num_epochs': 4, 'seed': 3677}.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-15 23:07:43,392] Trial 1 finished with values: [0.9336803610


✅ Top Trials (F1-macro & F2-class1):

Trial 0
F1-macro: 0.9575, F2 (class 1): 0.9578
Precision (0): 0.9832, Recall (0): 0.9590
Precision (1): 0.9219, Recall (1): 0.9672
Seed: 1495
Confusion Matrix:
[[117   5]
 [  2  59]]

Trial 1
F1-macro: 0.9575, F2 (class 1): 0.9578
Precision (0): 0.9832, Recall (0): 0.9590
Precision (1): 0.9219, Recall (1): 0.9672
Seed: 624
Confusion Matrix:
[[117   5]
 [  2  59]]

Trial 2
F1-macro: 0.9575, F2 (class 1): 0.9578
Precision (0): 0.9832, Recall (0): 0.9590
Precision (1): 0.9219, Recall (1): 0.9672
Seed: 4534
Confusion Matrix:
[[117   5]
 [  2  59]]

Trial 3
F1-macro: 0.9575, F2 (class 1): 0.9578
Precision (0): 0.9832, Recall (0): 0.9590
Precision (1): 0.9219, Recall (1): 0.9672
Seed: 3554
Confusion Matrix:
[[117   5]
 [  2  59]]

🎯 Using best trial based on F1:
  F1 = 0.9575, F2 = 0.9578
  Params = {'lr': 2.526098960983148e-05, 'batch_size': 16, 'num_epochs': 4, 'seed': 1495}

✅ Final model saved as 'bert_model_optimized_dual.pth'


In [11]:
from google.colab import drive
import os
import shutil

# Mount Google Drive so the Colab runtime can write into it
drive.mount('/content/drive')

# Destination folder inside Drive; created on first use
target_folder = '/content/drive/MyDrive/bert_predictions'
os.makedirs(target_folder, exist_ok=True)

# Artifacts produced in the Colab working directory that should be persisted
files_to_save = ['bert_model_optimized_dual.pth']

# Copy each artifact that exists; report the ones that are missing
for file in files_to_save:
    if not os.path.exists(file):
        print(f"❌ File not found: {file}")
        continue
    shutil.copy(file, target_folder)
    print(f"✅ Copied: {file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Copied: bert_model_optimized_dual.pth
