In [None]:
from google.colab import files

# Upload file(s) into the runtime through the Colab `files` helper.
# The following cells read "yeni.xlsx", so that is the file to provide here.
uploaded = files.upload()

Saving yeni.xlsx to yeni.xlsx


In [None]:
import pandas as pd

# Load the uploaded workbook into a DataFrame
df = pd.read_excel("yeni.xlsx")  # Put your Excel file name here
print(df.head())  # Look at the first rows

# Save as CSV (index=False keeps the pandas index out of the file)
df.to_csv("veri.csv", index=False)

                                             text  conj_error
0      Ders çalışmadım bu yüzden sınavı kazandım.           1
1  Yorgundum, dolayısıyla geç saate kadar uyudum.           1
2      Çok yağmur yağdı bu nedenle piknik yaptık.           1
3   Bugün çok mutluyum yani her şey kötü gidiyor.           1
4          Hava soğuktu demek ki hava güneşliydi.           1


In [None]:
# === 1. Kütüphaneler ===
!pip install -q transformers datasets scikit-learn torch

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import numpy as np

# === 2. GPU check ===
# Use CUDA when torch reports it as available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Kullanılan cihaz:", device)

# === Create veri.csv from yeni.xlsx ===
# Convert the uploaded Excel workbook into the CSV file that the next step reads.
try:
    df_excel = pd.read_excel("yeni.xlsx")  # Put your Excel file name here
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook kernel exit() asks the
    # kernel to shut down rather than stopping this cell with a readable error,
    # which would throw away the session state. Raising aborts the cell
    # immediately and keeps the kernel alive so the file can be uploaded.
    raise FileNotFoundError("Error: yeni.xlsx not found. Please upload the file.") from err

# Only the read is guarded above; a failure while writing the CSV should
# surface with its own traceback rather than be reported as a missing upload.
df_excel.to_csv("veri.csv", index=False)
print("veri.csv created successfully.")


# === 3. Load data ===
df = pd.read_csv("veri.csv")
print(df.head())

# === 4. Split ===
# 80/20 train/test split with a fixed random_state, then wrap each part in a
# `datasets.Dataset` (train first, test second).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# === 5. Tokenizer & mapping ===
# Checkpoint name used for the tokenizer here and for the model loaded later
model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping with a "text" entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    batch_texts = examples["text"]
    return tokenizer(
        batch_texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Tokenize both splits; batched=True hands tokenize_function a batch of rows per call
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

# Drop columns the model does not consume, skipping any that are absent so this
# never raises:
#   - "text": the raw sentence, already tokenized above.
#   - "__index_level_0__": the column Dataset.from_pandas creates to preserve a
#     non-default DataFrame index (the shuffled index left by train_test_split).
#     The name uses DOUBLE underscores on both sides; the previous
#     "_index_level_0_" spelling never matched, so the index column was kept.
for col in ["text", "__index_level_0__"]:
    if col in train_dataset.column_names:
        train_dataset = train_dataset.remove_columns(col)
    if col in test_dataset.column_names:
        test_dataset = test_dataset.remove_columns(col)

# Rename the target column to "labels". "conj_error" is tried first, then
# "label"; the check is made against the training split and the rename is
# applied to both splits. If neither name is present, stop with an error.
for label_col in ("conj_error", "label"):
    if label_col in train_dataset.column_names:
        train_dataset = train_dataset.rename_column(label_col, "labels")
        test_dataset = test_dataset.rename_column(label_col, "labels")
        break
else:
    raise ValueError("CSV içinde label kolonu bulunamadı. 'conj_error' ya da 'label' bekleniyor.")


# Switch both splits to the "torch" output format
for split in (train_dataset, test_dataset):
    split.set_format("torch")

# === 6. Model ===
# Sequence-classification model with 2 labels, loaded from the same checkpoint
# name as the tokenizer and moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# === 7. Training settings (current argument names) ===
training_args = TrainingArguments(
    # Output directory and basic optimisation hyper-parameters
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,

    # Evaluate and save once per epoch; log every 50 steps
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    report_to='none',  # Disable wandb logging

    # Reload the best checkpoint at the end, ranked by the "f1" value that
    # compute_metrics returns (higher is better)
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    fp16=torch.cuda.is_available(), # Enable mixed precision automatically when a GPU is available
)

# === 8. Metricler ===
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for binary predictions.

    Args:
        eval_pred: Either an object exposing ``predictions`` and ``label_ids``
            attributes, or a plain ``(logits, labels)`` pair.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
        Precision/recall/F1 use ``average="binary"`` with ``zero_division=0``.
    """
    # Support both the attribute-style object and the older 2-tuple form
    is_prediction_object = hasattr(eval_pred, "predictions")
    if is_prediction_object:
        logits, labels = eval_pred.predictions, eval_pred.label_ids
    else:
        logits, labels = eval_pred

    # Predicted class = index of the largest logit along the last axis
    predicted = np.argmax(logits, axis=-1)

    prf_scores = precision_recall_fscore_support(
        labels, predicted, average="binary", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": prf_scores[2],
        "precision": prf_scores[0],
        "recall": prf_scores[1],
    }

# === 9. Trainer ===
import inspect

# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword (it is scheduled for removal). Because
# the pip install above is unpinned, choose the keyword the installed Trainer
# actually declares so this cell runs on both older and newer versions without
# the deprecation warning.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# === 10. Training ===
trainer.train()

# === 11. Saving ===
#trainer.save_model("./baglac_model")
#tokenizer.save_pretrained("./baglac_model")

import os

# Build the path <home>/Desktop/baglac_model
# NOTE(review): on a hosted runtime (e.g. Colab) "~" expands to the runtime's
# home directory, not the desktop of the user's own machine — confirm this is
# the intended location, or switch back to the relative path commented out above.
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop", "baglac_model")

# Save the model and the tokenizer into that directory
trainer.save_model(desktop_path)
tokenizer.save_pretrained(desktop_path)

# === 12. Tahmin ===
def predict_sentence(text):
    """Classify a single sentence with the module-level model and tokenizer.

    The text is tokenized (truncated to at most 128 tokens), moved to the
    selected device, and passed through the model with gradients disabled.

    Args:
        text: The sentence to classify.

    Returns:
        "Düşük Risk" when the arg-max class is 0, otherwise "Yüksek Risk".
    """
    model.eval()  # put the model in evaluation mode before inference

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = model(**encoded).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    if predicted_class == 0:
        return "Düşük Risk"
    return "Yüksek Risk"

# === 13. Example ===
# Run the classifier on one sample sentence and print the predicted label
ornek_metin = "Çünkü yağmur yağıyordu ve ama herkes dışarı çıktı."
print("Tahmin:", predict_sentence(ornek_metin))

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.615848,0.761905,0.782609,0.666667,0.947368
2,No log,0.603064,0.738095,0.765957,0.642857,0.947368
3,0.625900,0.548867,0.785714,0.8,0.692308,0.947368
4,0.625900,0.550666,0.785714,0.8,0.692308,0.947368


Tahmin: Yüksek Risk


In [None]:
# === 13. Example ===
# Second sample: relies on predict_sentence defined in the previous cell
ornek_metin = " Ders çalışmadım bu yüzden sınavı kazandım."
print("Tahmin:", predict_sentence(ornek_metin))

Tahmin: Düşük Risk
