<a href="https://colab.research.google.com/github/noktavirgul/Dil-analizi-ile-alzheimer-risk-tespiti/blob/main/anlam_belirsizligi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Remove packages that conflict with the pinned versions installed below (quiet mode).
# The trailing `|| true` keeps the cell going even if the uninstall exits non-zero.
!pip uninstall -y numpy peft transformers datasets accelerate \
  cuml-cuda cuml cudf dask-cuda \
  opencv-python opencv-contrib-python opencv-python-headless -q || true

# Install a set of mutually compatible, pinned versions (openpyxl is left unpinned).
!pip install -q \
  "numpy==1.26.4" "transformers==4.41.2" "peft==0.11.1" \
  "datasets==2.20.0" "accelerate==0.31.0" "pyarrow==15.0.2" \
  "scikit-learn==1.4.2" "pandas==2.2.2" openpyxl

# Start the session clean (important): send signal 9 to the current kernel process.
# Presumably the runtime then restarts and picks up the freshly installed packages;
# continue from the next cell afterwards rather than re-running this one.
import os; os.kill(os.getpid(), 9)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os, json
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # more stable; set before the library imports below

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback, set_seed,
    DataCollatorWithPadding
)

# === BURAYI DEĞİŞTİRİN ===
EXCEL_FILE   = "anlam_belirsizligi_veriseti.xlsx"   # örn: sozcuk_seciminde_tutarsizlik_veriseti.xlsx
TEXT_COLUMN  = "Metin"           # sizde "text" ise "text" yazın
LABEL_COLUMN = "Etiket"          # sizde "label" ise "label" yazın
SAVE_DIR     = "berturk_finetunedab_model"

# Model & eğitim parametreleri
MODEL_NAME   = "dbmdz/bert-base-turkish-cased"
MAX_LENGTH   = 128     # OOM olursa 96 / 64 deneyin
BATCH_SIZE   = 8       # dinamik padding + grad_acc ile efektif batch ~16
EPOCHS       = 4
LR           = 2e-5
RANDOM_STATE = 42

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
set_seed(RANDOM_STATE)

print("Cihaz:", DEVICE)
!nvidia-smi


Cihaz: cpu
/bin/bash: line 1: nvidia-smi: command not found


In [None]:
from google.colab import files

def _read_with_header_fallback(path, text_col, label_col):
    """Read `path`; if the expected columns are missing, re-read it as a header-less sheet."""
    df = pd.read_excel(path)
    if text_col not in df.columns or label_col not in df.columns:
        # No usable header row: treat the first row as data and name the columns
        # ourselves (assumes the sheet has exactly these two columns, in this order).
        df = pd.read_excel(path, header=None, names=[text_col, label_col])
    return df


def read_excel_safely(path, text_col, label_col):
    """Load the dataset workbook, asking for a Colab upload if `path` does not exist.

    Args:
        path: Path of the Excel file to read.
        text_col: Expected name of the text column.
        label_col: Expected name of the label column.

    Returns:
        A DataFrame read from the workbook. When the workbook has no header row
        containing both column names, it is re-read with `header=None` and the
        columns are named `text_col` and `label_col`.

    Raises:
        FileNotFoundError: If `path` is missing and the upload dialog returned no file.
    """
    try:
        return _read_with_header_fallback(path, text_col, label_col)
    except FileNotFoundError:
        print("Excel bulunamadı. Lütfen yükleyin…")
        uploaded = files.upload()
        if not uploaded:
            # Nothing was uploaded (e.g. the dialog was cancelled): fail with a clear
            # error instead of an IndexError on an empty key list.
            raise FileNotFoundError(f"Hiçbir dosya yüklenmedi: {path}") from None
        # Read the first uploaded file, whatever it was named.
        path = next(iter(uploaded))
        return _read_with_header_fallback(path, text_col, label_col)

df = read_excel_safely(EXCEL_FILE, TEXT_COLUMN, LABEL_COLUMN)
df = df[[TEXT_COLUMN, LABEL_COLUMN]].dropna()

# Guard against leftover header rows: normalise both columns to stripped strings,
# then drop any row whose cell is just a column title.
df = df.assign(**{
    TEXT_COLUMN: df[TEXT_COLUMN].astype(str).str.strip(),
    LABEL_COLUMN: df[LABEL_COLUMN].astype(str).str.strip(),
})
mask_bad = (
    df[TEXT_COLUMN].str.lower().isin(["metin", "text"])
    | df[LABEL_COLUMN].str.lower().isin(["etiket", "label"])
)
df = df.loc[~mask_bad].reset_index(drop=True)

# Encode the labels as integers, numbering the distinct label strings in sorted order.
labels_raw = df[LABEL_COLUMN].astype(str).str.strip()
unique_labels = sorted(labels_raw.unique())
lab2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))
df[LABEL_COLUMN] = labels_raw.map(lab2id).astype(int)

# Column names used by the rest of the notebook.
df_ren = df.rename(columns={TEXT_COLUMN: "text", LABEL_COLUMN: "labels"})
print("Sınıflar:", id2label)
df_ren.head()


Excel bulunamadı. Lütfen yükleyin…


Saving anlam_belirsizligi_veriseti.xlsx to anlam_belirsizligi_veriseti.xlsx
Sınıflar: {0: '0', 1: '1'}


Unnamed: 0,text,labels
0,Bu yaz bir yerlere gittim ama ne yaptığımı bil...,1
1,"Denize girmek çok güzeldi, ama havuza gitmedik...",0
2,"Arkadaşımı gördüm, o çok mutlu oldular.",1
3,Müzik yaparken bazen yemek de yaparım çünkü ka...,1
4,Çocukken karanlık bir odada bir şey hissetmiştim.,0


In [None]:
# First cut: hold out 20% of the rows as the test set, stratified by label.
train_tmp, test_df = train_test_split(
    df_ren,
    test_size=0.20,
    stratify=df_ren["labels"],
    random_state=RANDOM_STATE,
)
# Second cut: split the remaining 80% into 90% train / 10% validation.
train_df, val_df = train_test_split(
    train_tmp,
    test_size=0.10,
    stratify=train_tmp["labels"],
    random_state=RANDOM_STATE,
)

# Make sure every split carries integer labels.
for split_df in (train_df, val_df, test_df):
    split_df["labels"] = split_df["labels"].astype(int)

# Convert each split to a datasets.Dataset with a fresh 0..n-1 index.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df, test_df)
)

len(train_ds), len(val_ds), len(test_ds)


(1497, 167, 416)

In [None]:
# Load the tokenizer published with the pretrained checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to MAX_LENGTH tokens.

    Args:
        examples: A batch mapping with a "text" entry holding the sentences
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, left unpadded.
    """
    # No `padding=` here on purpose: padding every example to MAX_LENGTH would make
    # the DataCollatorWithPadding used by this notebook a no-op. Leaving sequences
    # at their own length lets the collator pad each batch only to its longest member.
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)

# Tokenize every split in batches, drop the raw "text" column, and expose the
# result as torch tensors.
train_ds, val_ds, test_ds = (
    split.map(tokenize_function, batched=True, remove_columns=["text"]).with_format("torch")
    for split in (train_ds, val_ds, test_ds)
)

# Collator that pads each batch when it is assembled (dynamic padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/1497 [00:00<?, ? examples/s]

Map:   0%|          | 0/167 [00:00<?, ? examples/s]

Map:   0%|          | 0/416 [00:00<?, ? examples/s]

In [None]:
# Pretrained encoder plus a freshly initialised classification head sized to the
# label set; the label maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=lab2id,
)
model = model.to(DEVICE)

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for a Trainer evaluation pass.

    Args:
        eval_pred: A `(logits, labels)` pair. `logits` has one score per class on
            its last axis; if it arrives as a tuple, its first element is used.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1". Precision/recall/F1
        use binary averaging for a two-class model and macro averaging otherwise;
        undefined ratios are reported as 0.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits; keep the logits only.
        logits = logits[0]
    logits = np.asarray(logits)
    preds = np.argmax(logits, axis=-1)
    # Decide the averaging mode from the model's class count (width of the logits),
    # not from which classes happen to appear in this evaluation split: a
    # multi-class model evaluated on a split containing only two classes must
    # still use macro averaging.
    avg = "binary" if logits.shape[-1] == 2 else "macro"
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average=avg, zero_division=0)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

training_args = TrainingArguments(
    output_dir="./results",
    seed=RANDOM_STATE,
    report_to="none",

    # Optimisation
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,   # effective batch ~16
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available

    # Evaluate and checkpoint once per epoch; log every 100 steps.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    eval_accumulation_steps=2,
    dataloader_num_workers=2,

    # After training, reload the checkpoint with the highest validation F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wire the model, data, metrics and early stopping together. Evaluation during
# training runs on the validation split, never on the test split.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.267337,0.88024,0.858824,0.901235,0.879518
2,0.429100,0.257601,0.898204,0.863636,0.938272,0.899408


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.267337,0.88024,0.858824,0.901235,0.879518
2,0.429100,0.257601,0.898204,0.863636,0.938272,0.899408
3,0.235900,0.283556,0.916168,0.935065,0.888889,0.911392
4,0.114700,0.312188,0.91018,0.9125,0.901235,0.906832


TrainOutput(global_step=376, training_loss=0.22220034294940055, metrics={'train_runtime': 7939.9611, 'train_samples_per_second': 0.754, 'train_steps_per_second': 0.047, 'total_flos': 393877249873920.0, 'train_loss': 0.22220034294940055, 'epoch': 4.0})

In [None]:
import shutil
from google.colab import files

# Write the fine-tuned model and its tokenizer into SAVE_DIR.
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

# Store the id -> label mapping next to the model as UTF-8 JSON.
with open(f"{SAVE_DIR}/label_map.json", "w", encoding="utf-8") as f:
    f.write(json.dumps({int(k): v for k, v in id2label.items()}, ensure_ascii=False, indent=2))

# Zip the whole directory and hand the archive to the browser for download.
shutil.make_archive(SAVE_DIR, "zip", root_dir=SAVE_DIR)
files.download(f"{SAVE_DIR}.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Final evaluation on the held-out test split; each metric is printed rounded to 4 decimals.
test_metrics = trainer.evaluate(test_ds)
print("=== Test Metrics ===")
for metric_name, metric_value in test_metrics.items():
    print(metric_name, ":", round(float(metric_value), 4))


=== Test Metrics ===
eval_loss : 0.245
eval_accuracy : 0.9135
eval_precision : 0.956
eval_recall : 0.8614
eval_f1 : 0.9062
eval_runtime : 174.3936
eval_samples_per_second : 2.385
eval_steps_per_second : 0.298
epoch : 4.0


In [None]:
# (If the session was reset, re-upload the SAVE_DIR zip and unzip it first.)
# Reload the tokenizer and model from disk and switch the model to evaluation mode.
tok = AutoTokenizer.from_pretrained(SAVE_DIR)
mdl = AutoModelForSequenceClassification.from_pretrained(SAVE_DIR)
mdl = mdl.to(DEVICE)
mdl.eval()

def predict_sentence(text):
    """Classify a single sentence with the reloaded model.

    Args:
        text: The sentence to classify.

    Returns:
        A `(label, probabilities)` pair: the predicted label name and a dict
        mapping every label name to its softmax probability rounded to 3 decimals.
    """
    enc = tok(text, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH).to(DEVICE)
    with torch.no_grad():
        logits = mdl(**enc).logits
    probs = logits.softmax(dim=-1).cpu().numpy()[0]
    # Take the label names from the reloaded model's own config rather than from a
    # notebook variable, so the function still works when only the saved model
    # directory is available (e.g. after a session reset).
    labels_by_id = mdl.config.id2label
    pred = int(probs.argmax())  # plain int so the lookup key matches the config's integer keys
    return labels_by_id[pred], {labels_by_id[i]: round(float(p), 3) for i, p in enumerate(probs)}

# Smoke test: classify one example sentence and print the predicted label
# together with the per-label probabilities.
sample = "Akşam pazardan taze sebze aldım, eve gelip salata yaptım."
label, probs = predict_sentence(sample)
print("Tahmin:", label)
print("Olasılıklar:", probs)


Tahmin: 0
Olasılıklar: {'0': 0.995, '1': 0.005}
