In [6]:
!pip install datasets

# Gerekli kütüphaneleri yükleyelim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Örnek bir veri seti oluşturalım
# Örnek bir veri seti oluşturalım
data = {
    'text': [
        "Bu kadar saygısız insan görmek zor!",
        "Python mükemmel bir programlama dili.",
        "Bazen insanlar gerçekten çok kaba olabiliyor!",
        "Bugün hava çok güzel.",
        "Negatif insanlardan uzak durmalısın.",
        "Dağda yürüyüş yapmayı seviyorum.",
        "Sen gerçekten kötü bir insansın, saygıyı hak etmiyorsun.",
        "Boş zamanlarımda kitap okumayı seviyorum.",
        "Neden hep yalan söylüyorsun? Sana güvenemiyorum!",
        "Bugün gün batımı çok güzel görünüyor."
        "İnsanlardan nefret ediyorum."
    ],
    'label': [
        1,  # Offensive (Saldırgan)
        0,  # Not Offensive (Saldırgan olmayan)
        1,  # Offensive (Saldırgan)
        0,  # Not Offensive (Saldırgan olmayan)
        1,  # Offensive (Saldırgan)
        0,  # Not Offensive (Saldırgan olmayan)
        1,  # Offensive (Saldırgan)
        0,  # Not Offensive (Saldırgan olmayan)
        1,  # Offensive (Saldırgan)
        0,   # Not Offensive (Saldırgan olmayan)
    ]
}

# DataFrame'e dönüştürme
df = pd.DataFrame(data)

# Veri setini inceleyelim
print(df)



# Eğitim ve test verilerine ayıralım
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# TF-IDF ile metinleri vektörleştirelim
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Geleneksel yöntemlerle modeli eğitelim
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier()
}

# Sonuçları depolayacağımız bir sözlük oluşturalım
results = {}

# Modelleri eğitelim ve metrikleri hesaplayalım
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)  # Modeli eğit
    y_pred = model.predict(X_test_tfidf)  # Test verisi üzerinde tahmin yap

    # Performans metriklerini hesaplayalım
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    f_score = f1_score(y_test, y_pred)

    # Sonuçları kaydedelim
    results[model_name] = {
        "Precision": precision,
        "Recall": recall,
        "Accuracy": accuracy,
        "F-Score": f_score
    }

# Sonuçları bir DataFrame olarak yazdıralım
results_df = pd.DataFrame(results).T
print(results_df)

import tensorflow as tf
from tensorflow.keras import layers, models

# CNN Modeli
def create_cnn_model(input_shape):
    model = models.Sequential([
        layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        layers.Conv1D(128, 5, activation='relu'),
        layers.MaxPooling1D(pool_size=4),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(20, activation='softmax')  # 20 sınıf
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Modeli oluşturup eğitelim
cnn_model = create_cnn_model(X_train_tfidf.shape[1])
cnn_model.fit(X_train_tfidf.toarray(), y_train, epochs=5, batch_size=32, validation_data=(X_test_tfidf.toarray(), y_test))

# Test verisiyle tahmin yapalım
cnn_y_pred = cnn_model.predict(X_test_tfidf.toarray())
cnn_y_pred_classes = cnn_y_pred.argmax(axis=1)

# Performans metriklerini hesaplayalım
precision = precision_score(y_test, cnn_y_pred_classes, average='weighted')
recall = recall_score(y_test, cnn_y_pred_classes, average='weighted')
accuracy = accuracy_score(y_test, cnn_y_pred_classes)
f_score = f1_score(y_test, cnn_y_pred_classes, average='weighted')

print("CNN - Precision: ", precision)
print("CNN - Recall: ", recall)
print("CNN - Accuracy: ", accuracy)
print("CNN - F-Score: ", f_score)


# LSTM Modeli
def create_lstm_model(input_shape):
    model = models.Sequential([
        layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        layers.LSTM(128, return_sequences=False),
        layers.Dense(128, activation='relu'),
        layers.Dense(20, activation='softmax')  # 20 sınıf
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Modeli oluşturup eğitelim
lstm_model = create_lstm_model(X_train_tfidf.shape[1])
lstm_model.fit(X_train_tfidf.toarray(), y_train, epochs=5, batch_size=32, validation_data=(X_test_tfidf.toarray(), y_test))

# Test verisiyle tahmin yapalım
lstm_y_pred = lstm_model.predict(X_test_tfidf.toarray())
lstm_y_pred_classes = lstm_y_pred.argmax(axis=1)

# Performans metriklerini hesaplayalım
precision = precision_score(y_test, lstm_y_pred_classes, average='weighted')
recall = recall_score(y_test, lstm_y_pred_classes, average='weighted')
accuracy = accuracy_score(y_test, lstm_y_pred_classes)
f_score = f1_score(y_test, lstm_y_pred_classes, average='weighted')

print("LSTM - Precision: ", precision)
print("LSTM - Recall: ", recall)
print("LSTM - Accuracy: ", accuracy)
print("LSTM - F-Score: ", f_score)


from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# BERT için tokenizer ve model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=20)

# Tokenizer ile veriyi dönüştürelim
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Dataset'i hazırlayalım
from datasets import Dataset
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# BERT modelini eğitelim
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=1e-5,  # Eğer model kötü sonuçlar veriyorsa, daha küçük bir öğrenme oranı deneyin (örneğin 1e-5)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

# Tahmin yapalım
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=1)

# Metrikleri hesaplayalım
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
f_score = f1_score(y_test, y_pred, average='weighted')

print("BERT - Precision: ", precision)
print("BERT - Recall: ", recall)
print("BERT - Accuracy: ", accuracy)
print("BERT - F-Score: ", f_score)


# Geleneksel modellerin metriklerini kaydetmek için
precision_log_reg = results["Logistic Regression"]["Precision"]
precision_svm = results["SVM"]["Precision"]
precision_knn = results["KNN"]["Precision"]
precision_rf = results["Random Forest"]["Precision"]
precision_cnn = precision  # CNN'den alınan metrik
precision_lstm = precision  # LSTM'den alınan metrik
precision_bert = precision  # BERT'ten alınan metrik

# Recall, Accuracy ve F-Score için de aynı şekilde
recall_log_reg = results["Logistic Regression"]["Recall"]
recall_svm = results["SVM"]["Recall"]
recall_knn = results["KNN"]["Recall"]
recall_rf = results["Random Forest"]["Recall"]
recall_cnn = recall  # CNN'den alınan metrik
recall_lstm = recall  # LSTM'den alınan metrik
recall_bert = recall  # BERT'ten alınan metrik

accuracy_log_reg = results["Logistic Regression"]["Accuracy"]
accuracy_svm = results["SVM"]["Accuracy"]
accuracy_knn = results["KNN"]["Accuracy"]
accuracy_rf = results["Random Forest"]["Accuracy"]
accuracy_cnn = accuracy  # CNN'den alınan metrik
accuracy_lstm = accuracy  # LSTM'den alınan metrik
accuracy_bert = accuracy  # BERT'ten alınan metrik

f_score_log_reg = results["Logistic Regression"]["F-Score"]
f_score_svm = results["SVM"]["F-Score"]
f_score_knn = results["KNN"]["F-Score"]
f_score_rf = results["Random Forest"]["F-Score"]
f_score_cnn = f_score  # CNN'den alınan metrik
f_score_lstm = f_score  # LSTM'den alınan metrik
f_score_bert = f_score  # BERT'ten alınan metrik

# Sonuçları birleştirerek tabloyu oluşturabilirsiniz
final_results = {
    'Model': ['Logistic Regression', 'SVM', 'KNN', 'Random Forest', 'CNN', 'LSTM', 'BERT'],
    'Precision': [precision_log_reg, precision_svm, precision_knn, precision_rf, precision_cnn, precision_lstm, precision_bert],
    'Recall': [recall_log_reg, recall_svm, recall_knn, recall_rf, recall_cnn, recall_lstm, recall_bert],
    'Accuracy': [accuracy_log_reg, accuracy_svm, accuracy_knn, accuracy_rf, accuracy_cnn, accuracy_lstm, accuracy_bert],
    'F-Score': [f_score_log_reg, f_score_svm, f_score_knn, f_score_rf, f_score_cnn, f_score_lstm, f_score_bert]
}

df = pd.DataFrame(final_results)
print(df)



Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.9920 - val_accuracy: 0.3333 - val_loss: 2.8966
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - accuracy: 0.5714 - loss: 2.8849 - val_accuracy: 0.3333 - val_loss: 2.7825
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.5714 - loss: 2.7637 - val_accuracy: 0.3333 - val_loss: 2.6294
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.5714 - loss: 2.5970 - val_accuracy: 0.3333 - val_loss: 2.4258
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step - accuracy: 0.5714 - loss: 2.3717 - val_accuracy: 0.3333 - val_loss: 2.1581
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
CNN - Precision:  0.1111111111111111
CNN - Recall:  0.3333333333333333
CNN - Accuracy:  0.3333333333333333
CNN - F-Score:  0.1666666666666666

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 3.0087 - val_accuracy: 0.0000e+00 - val_loss: 2.9445
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.0000e+00 - loss: 2.9468 - val_accuracy: 0.6667 - val_loss: 2.8907
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 0.4286 - loss: 2.8953 - val_accuracy: 0.6667 - val_loss: 2.8232
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 0.4286 - loss: 2.8292 - val_accuracy: 0.6667 - val_loss: 2.7275
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.4286 - loss: 2.7354 - val_accuracy: 0.6667 - val_loss: 2.5842
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
LSTM - Precision:  0.4444444444444444
LSTM - Recall:  0.6666666666666666
LSTM - Accuracy:  0.6666666666666666
LSTM - F-Score:  0.533333

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,2.86037
2,No log,2.830918
3,No log,2.81347


BERT - Precision:  0.0
BERT - Recall:  0.0
BERT - Accuracy:  0.0
BERT - F-Score:  0.0
                 Model  Precision  Recall  Accuracy   F-Score
0  Logistic Regression   0.333333     1.0  0.333333  0.500000
1                  SVM   0.333333     1.0  0.333333  0.500000
2                  KNN   0.500000     1.0  0.666667  0.666667
3        Random Forest   0.333333     1.0  0.333333  0.500000
4                  CNN   0.000000     0.0  0.000000  0.000000
5                 LSTM   0.000000     0.0  0.000000  0.000000
6                 BERT   0.000000     0.0  0.000000  0.000000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
