## Geleneksel Makine Öğrenmesi Yöntemleri

In [7]:
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

### AG News: Haber metinlerinin sınıflandırılması.

In [9]:
# Load the AG News dataset (news text classification).
dataset = load_dataset('ag_news')

# Sub-sampling parameters, named so they can be tuned in one place.
TRAIN_SAMPLE_SIZE = 10000
SAMPLE_SEED = 42

# Convert each split with Dataset.to_pandas() instead of pd.DataFrame(dataset[...]);
# this avoids building the frame by iterating over the split.
train_data = dataset['train'].to_pandas()

# Work on a fixed-seed random subset of the training split.
train_data_sampled = train_data.sample(TRAIN_SAMPLE_SIZE, random_state=SAMPLE_SEED)

# The test split is used in full.
test_data = dataset['test'].to_pandas()

# Separate texts and labels.
X_train_texts = train_data_sampled['text']
y_train = train_data_sampled['label']
X_test_texts = test_data['text']
y_test = test_data['label']

Found cached dataset parquet (file://C:/Users/onura/.cache/huggingface/datasets/parquet/ag_news-f4012edcb412d6fa/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.

In [None]:
# Print basic statistics (example count, total and average text length) per split.
for split in dataset.keys():
    num_examples = len(dataset[split])

    print(f"--- {split.upper()} ---")
    print(f"Örnek sayısı: {num_examples}")

    # Read the text column once instead of iterating the split row by row,
    # which would materialise every column of every row just to read 'text'.
    texts = dataset[split]['text']

    # Text lengths are measured in characters.
    total_text_length = sum(len(text) for text in texts)
    # Guard against an empty split so the average never divides by zero.
    avg_text_length = total_text_length / num_examples if num_examples else 0.0
    print(f"Toplam metin uzunluğu: {total_text_length}")
    print(f"Ortalama metin uzunluğu: {avg_text_length:.2f} karakter")

    print()

### Vektörleştirme yöntemi olarak TF-IDF kullanılmıştır.

In [None]:
# Vectorise the texts with TF-IDF. The vocabulary is capped at 1000 features
# and the built-in English stop-word list is applied.
tfidf_options = {"max_features": 1000, "stop_words": "english"}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training texts only, then reuse the fitted vectorizer on the test texts.
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

### Geleneksel Makine Öğrenme Metotları

In [None]:
# Seed for estimators with internal randomness, so the results table is
# repeatable across runs.
RANDOM_STATE = 42

# Model definitions
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    # n_jobs=-1 parallelises the forest; random_state pins its bootstrap and
    # feature sampling (the original left it unseeded).
    "Random Forest": RandomForestClassifier(
        n_estimators=10, n_jobs=-1, random_state=RANDOM_STATE
    ),
}

results = []

# Train and evaluate every model on the same TF-IDF features.
for name, model in models.items():
    # Fit on the training features
    model.fit(X_train, y_train)

    # Predict on the test features
    y_pred = model.predict(X_test)

    # Collect the metrics; precision/recall/F-score are weighted averages over the classes.
    results.append({
        "Model": name,
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "Accuracy": accuracy_score(y_test, y_pred),
        "F-Score": f1_score(y_test, y_pred, average='weighted')
    })

# Show the results as a table (one row per model).
results_df = pd.DataFrame(results)
print(results_df)

-----------
## Derin Öğrenme Metotları

In [None]:
import gensim
from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, Dropout
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

#### Word2Vec ile CNN ve LSTM Modelleri:

In [None]:
# Tokenise every text by plain whitespace splitting (no lower-casing or
# punctuation handling is applied here).
X_train_tokens = list(map(str.split, X_train_texts))
X_test_tokens = list(map(str.split, X_test_texts))

# Train Word2Vec on the training tokens only.
word2vec_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, including those seen only once
    workers=4,
)

# Keyed vectors of the trained model (word -> vector lookup).
word_vectors = word2vec_model.wv
vocab_size = len(word_vectors)
print(f"Kelime vektörlerinin boyutu: {word_vectors.vector_size}")

#### Embedding Matrisini Hazırlama:

In [None]:
# Build the embedding matrix from the trained word vectors.
# Take the dimensionality from the vectors themselves rather than repeating a
# literal, so it cannot drift from the vector size Word2Vec was trained with.
embedding_dim = word_vectors.vector_size

# One extra row: index 0 is reserved for unknown words and stays all-zero.
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))

# Word indices start at 1 and follow the order of the model's vocabulary.
word_index = {word: idx + 1 for idx, word in enumerate(word_vectors.index_to_key)}

# Row `idx` holds the vector of the word whose index is `idx`.
for word, idx in word_index.items():
    embedding_matrix[idx] = word_vectors[word]

#### Metinleri sayısal formata çevirme

In [None]:
# Convert words to integer sequences
def text_to_sequence(tokens, word_index):
    """Map tokenised sentences to lists of integer word indices.

    Args:
        tokens: Iterable of sentences, each an iterable of word tokens.
        word_index: Mapping from word to its integer index.

    Returns:
        A list with one list of indices per sentence; words missing from
        ``word_index`` are encoded as 0.
    """
    sequences = []
    for sentence in tokens:
        encoded = []
        for word in sentence:
            encoded.append(word_index.get(word, 0))
        sequences.append(encoded)
    return sequences

# Encode the tokenised train/test texts as integer index sequences.
X_train_seq = text_to_sequence(X_train_tokens, word_index)
X_test_seq = text_to_sequence(X_test_tokens, word_index)

# Bring every sequence to the same length: longer ones are cut at the end,
# shorter ones are padded at the end.
max_length = 100
padding_options = {"maxlen": max_length, "padding": "post", "truncating": "post"}
X_train_padded = pad_sequences(X_train_seq, **padding_options)
X_test_padded = pad_sequences(X_test_seq, **padding_options)

### CNN Modeli

In [None]:
# CNN model: frozen Word2Vec embeddings -> 1D convolution -> global max pooling
# -> dense classifier head.
cnn_layers = [
    # Embedding layer initialised from the Word2Vec matrix and kept frozen.
    Embedding(
        input_dim=vocab_size + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,
    ),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax'),  # 4 classes
]
cnn_model = Sequential(cnn_layers)

# Labels are integer class ids, hence the sparse categorical loss.
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
cnn_model.summary()

# Train the CNN.
# NOTE(review): the test split doubles as validation data here; confirm that
# no model selection is based on these validation numbers.
cnn_model.fit(
    X_train_padded,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_padded, y_test),
)

### LSTM Modeli

In [None]:
# LSTM model: frozen Word2Vec embeddings -> two stacked LSTM layers -> dense
# classifier head.
lstm_model = Sequential()
# Embedding layer initialised from the Word2Vec matrix and kept frozen.
lstm_model.add(
    Embedding(
        input_dim=vocab_size + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,
    )
)
# The first LSTM returns the full sequence so the second LSTM can consume it.
lstm_model.add(LSTM(128, return_sequences=True))
lstm_model.add(Dropout(0.5))
lstm_model.add(LSTM(64))
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(4, activation='softmax'))  # 4 classes

# Labels are integer class ids, hence the sparse categorical loss.
lstm_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
lstm_model.summary()

# Train the LSTM.
# NOTE(review): the test split doubles as validation data here; confirm that
# no model selection is based on these validation numbers.
lstm_model.fit(
    X_train_padded,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_padded, y_test),
)

### Performans değerlendirme

In [None]:
def _weighted_metrics(y_true, y_pred):
    """Return (precision, recall, accuracy, f1) for the given predictions.

    Precision, recall and F1 are weighted averages over the classes.
    """
    return (
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted'),
    )


# CNN evaluation: the predicted class is the index of the largest softmax output.
cnn_preds = np.argmax(cnn_model.predict(X_test_padded), axis=1)
cnn_precision, cnn_recall, cnn_accuracy, cnn_f1 = _weighted_metrics(y_test, cnn_preds)

print(f"CNN -> Precision: {cnn_precision}, Recall: {cnn_recall}, Accuracy: {cnn_accuracy}, F-Score: {cnn_f1}")

# LSTM evaluation, computed the same way as for the CNN.
lstm_preds = np.argmax(lstm_model.predict(X_test_padded), axis=1)
lstm_precision, lstm_recall, lstm_accuracy, lstm_f1 = _weighted_metrics(y_test, lstm_preds)

print(f"LSTM -> Precision: {lstm_precision}, Recall: {lstm_recall}, Accuracy: {lstm_accuracy}, F-Score: {lstm_f1}")

-------
## Transfer Öğrenme

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import tensorflow as tf

### Tokenizer ve model hazırlama

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier
# (base, uncased BERT for English).
model_name = "bert-base-uncased"

# Tokenizer matching the checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

# Sequence-classification model with a 4-label output.
# NOTE(review): from_pt=True asks for the PyTorch weights to be converted;
# confirm this is still needed for this checkpoint in the target environment.
bert_model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    from_pt=True,
)