In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

In [12]:
df = pd.read_excel("BG_Dunya_Merged_Annotations.xlsx")[['Body', 'Economic_Relationship']]

In [13]:
df

Unnamed: 0,Body,Economic_Relationship
0,\n Sosyal medya platformu TikTo...,0
1,\n Çin Dışişleri Bakanlığından ...,1
2,\n Çin Dışişleri Bakanlığı'nın ...,0
3,\n Otomotiv sektöründeki fu...,0
4,\n Çin Merkez Bankasının (PBoC)...,0
...,...,...
578,"Çin, Tayvan'ın etrafını kuşatan kapsamlı asker...",0
579,NA Çin Devlet Başkanı Şi Cinping’in bu hafta g...,0
580,"Apple, Peki yönetiminin yaptığı baskılara daha...",0
581,Lojistik ve ambalaj sanayi arasındaki sinerjin...,0


In [14]:
df['Body'] = df['Body'].replace([None, ''], pd.NA)
df = df.dropna()

df['Economic_Relationship'] = df['Economic_Relationship'].replace([None, ''], pd.NA)
df = df.dropna()

In [15]:
df.groupby('Economic_Relationship').count()

Unnamed: 0_level_0,Body
Economic_Relationship,Unnamed: 1_level_1
0,393
1,190


In [16]:
# Split data into features and labels
texts = df['Body'].tolist()
labels = df['Economic_Relationship'].tolist()

In [17]:
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-128k-cased')

class NewspaperDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [18]:
# Define training and evaluation functions
def train_epoch(model, data_loader, optimizer, device, n_examples):
    model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        _, preds = torch.max(outputs.logits, dim=1)
        loss = outputs.loss

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    return correct_predictions.double() / n_examples, sum(losses) / n_examples

def eval_model(model, data_loader, device, n_examples):
    model.eval()
    losses = []
    correct_predictions = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            _, preds = torch.max(outputs.logits, dim=1)
            loss = outputs.loss

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_predictions.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct_predictions.double() / n_examples
    avg_loss = sum(losses) / n_examples
    class_report = classification_report(all_labels, all_predictions, output_dict=True)
    conf_matrix = confusion_matrix(all_labels, all_predictions)

    # ✅ Now returns five values including predictions
    return accuracy, avg_loss, class_report, conf_matrix, all_predictions

In [19]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Get indices for stratified split
all_indices = list(range(len(texts)))
train_idx, val_idx = train_test_split(
    all_indices, test_size=0.3, stratify=labels, random_state=2
)

# Create splits using the indices
train_texts = [texts[i] for i in train_idx]
val_texts = [texts[i] for i in val_idx]
train_labels = [labels[i] for i in train_idx]
val_labels = [labels[i] for i in val_idx]

train_dataset = NewspaperDataset(train_texts, train_labels, tokenizer)
val_dataset = NewspaperDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Initialize model
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-128k-cased', num_labels=2).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, len(train_dataset))

# Save model
torch.save(model.state_dict(), 'bert_model_economic_relationship.pth')

# Evaluate
val_acc, val_loss, val_class_report, val_conf_matrix, val_preds = eval_model(
    model, val_loader, device, len(val_dataset)
)

# Save predictions to df
df.loc[val_idx, 'Predicted'] = val_preds
df.loc[val_idx, 'Model'] = 'final_model'

# Log and save
print(f'Validation accuracy: {val_acc}')
print(f'Validation loss: {val_loss}')
print('Classification Report:')
print(val_class_report)
print('Confusion Matrix:')
print(val_conf_matrix)

pd.DataFrame(val_class_report).transpose().to_excel('classification_report_economic_relationship.xlsx', engine='openpyxl')
pd.DataFrame(val_conf_matrix).to_csv('confusion_matrix_economic_relationship.csv', index=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Validation accuracy: 0.9485714285714286
Validation loss: 0.011623072570655495
Classification Report:
{'0': {'precision': 0.957983193277311, 'recall': 0.9661016949152542, 'f1-score': 0.9620253164556962, 'support': 118.0}, '1': {'precision': 0.9285714285714286, 'recall': 0.9122807017543859, 'f1-score': 0.9203539823008849, 'support': 57.0}, 'accuracy': 0.9485714285714286, 'macro avg': {'precision': 0.9432773109243697, 'recall': 0.93919119833482, 'f1-score': 0.9411896493782905, 'support': 175.0}, 'weighted avg': {'precision': 0.948403361344538, 'recall': 0.9485714285714286, 'f1-score': 0.9484523676167006, 'support': 175.0}}
Confusion Matrix:
[[114   4]
 [  5  52]]


In [20]:
from google.colab import drive
import os
import shutil

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define the target folder in Google Drive
target_folder = '/content/drive/MyDrive/bert_predictions'
os.makedirs(target_folder, exist_ok=True)  # create it if it doesn't exist

# 3. Define the list of files you want to copy from Colab to Drive
files_to_save = [
    'bert_model_economic_relationship.pth',
    'classification_report_economic_relationship.xlsx',
    'confusion_matrix_economic_relationship.csv'
]

# 4. Copy files to the Google Drive folder
for file in files_to_save:
    if os.path.exists(file):
        shutil.copy(file, target_folder)
        print(f"✅ Copied: {file}")
    else:
        print(f"❌ File not found: {file}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Copied: bert_model_economic_relationship.pth
✅ Copied: classification_report_economic_relationship.xlsx
✅ Copied: confusion_matrix_economic_relationship.csv
