In [7]:
import os

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# Load documents assuming all are from 'aydın'
def load_documents_from_folder(folder_path):
    documents = []
    for file_name in os.listdir(folder_path):
        author_files = folder_path + "/" + file_name
        for txt_name in os.listdir(author_files):
            if txt_name.endswith(".txt"):
                with open(
                    os.path.join(author_files, txt_name), "r", encoding="utf-8"
                ) as f:
                    text = f.read()
                    documents.append((text, file_name))
    return documents


folder_path = "datas/"
data = load_documents_from_folder(folder_path)

# Create DataFrame
df = pd.DataFrame(data, columns=["text", "author"])
le = LabelEncoder()
df["label"] = le.fit_transform(df["author"])

# Her yazının kelime sayısını hesapla
df["word_count"] = df["text"].apply(lambda x: len(x.split()))
author_word_counts = df.groupby("author")["word_count"].sum().sort_values(ascending=False)
print("Her yazarın toplam kelime sayısı:")
print(df)




X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)


Her yazarın toplam kelime sayısı:
                                                   text    author  label  \
0     Talihsiz Bedevi İle Kutup Ayısı\n\nBAŞBAKAN Er...    AHakan      2   
1      Bir De Cool Olacaklar\n\nMİRGÜN’ün “eski sevg...    AHakan      2   
2     İstihbarat Varmış\n\nDevlet, eskiden başörtüsü...    AHakan      2   
3     Ateşten Gömlek: TÜSİAD Başkanlığı\n\n“Şu anda ...    AHakan      2   
4     Antakya’da Sakallı Aradım\n\nSaat: 24.00... An...    AHakan      2   
...                                                 ...       ...    ...   
1195  Eski Hataları Tekrarlamayalım...\n\nBaşbakan Y...  MABirand     17   
1196  Org. Özel, TSK'nın Eski Faturalarını Ödüyor......  MABirand     17   
1197  PKK'nın Gövde Gösterisi Başarılı Ancak...\n\nG...  MABirand     17   
1198  İsrail, İran'ı Vurunca Bizim De Başımız Çok De...  MABirand     17   
1199  Hepimiz Kaybettik...\n\nUçakta önüme konan gaz...  MABirand     17   

      word_count  
0            913  
1            84

In [3]:
import re

def clean_text(text):
    text = re.sub(r'\*+', ' ', text)         # *** -> boşluk
    text = re.sub(r'\n+', '\n', text)        # çoklu satır boşluklarını teke indir
    text = re.sub(r'[“”]', '"', text)        # fancy tırnakları düzleştir
    text = re.sub(r'[‘’]', "'", text)
    text = re.sub(r'\s{2,}', ' ', text)      # fazla boşluğu tek boşluk yap
    return text.strip()

df['text'] = df['text'].apply(clean_text)
df["word_count"] = df["text"].apply(lambda x: len(x.split()))
author_word_counts = df.groupby("author")["word_count"].sum().sort_values(ascending=False)
print("Her yazarın toplam kelime sayısı:")
print(author_word_counts)


Her yazarın toplam kelime sayısı:
author
YCongar         58522
CCandar         31918
AHakan          31409
RMengi          30415
SOzisik         29242
AAydintasbas    27580
MABirand        26860
DCundioglu      26235
COzdemir        26208
ATuranAlkan     24581
PMagden         23978
AAltan          23903
MBaransu        23652
NBKaraca        22920
AYArslan        22653
DUAribogan      22403
ECakir          21210
GGokturk        20967
HCemal          20415
MNHazar         19778
HUluc           19525
IKucukkaya      19312
TAkyol          18445
MTonbekici      17828
EArdic          15746
MBarlas         15309
NIlicak         12763
YOzdil          12157
HBabaoglu       11271
BCoskun          9107
Name: word_count, dtype: int64


In [5]:
# Define classifiers
models = {
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "Naive Bayes": MultinomialNB(),
    "MLP": MLPClassifier(max_iter=300),
    "Decision Tree": DecisionTreeClassifier(),
}

# Define TF-IDF vectorizer settings
vectorizer_settings = {
    "word_unigram": TfidfVectorizer(analyzer="word", ngram_range=(1, 1)),
    "word_bigram_trigram": TfidfVectorizer(analyzer="word", ngram_range=(2, 3)),
    "char_bigram_trigram": TfidfVectorizer(analyzer="char", ngram_range=(2, 3)),
}

In [4]:
results = []

# TF-IDF based evaluations
for vec_name, vectorizer in vectorizer_settings.items():
    for model_name, model in models.items():
        print(f"Evaluating {model_name} with {vec_name}...")
        pipeline = Pipeline([("tfidf", vectorizer), ("clf", model)])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        report = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )
        results.append(
            {
                "Feature": vec_name,
                "Model": model_name,
                "Accuracy": report["accuracy"],
                "Precision": report["weighted avg"]["precision"],
                "Recall": report["weighted avg"]["recall"],
                "F1-score": report["weighted avg"]["f1-score"],
            }
        )

print("TF-IDF Results:")
for result in results:
    print(
        f"Feature: {result['Feature']}, Model: {result['Model']}, "
        f"Accuracy: {result['Accuracy']:.4f}, Precision: {result['Precision']:.4f}, "
        f"Recall: {result['Recall']:.4f}, F1-score: {result['F1-score']:.4f}"
    )

Evaluating Random Forest with word_unigram...
Evaluating SVM with word_unigram...
Evaluating XGBoost with word_unigram...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating Naive Bayes with word_unigram...
Evaluating MLP with word_unigram...
Evaluating Decision Tree with word_unigram...
Evaluating Random Forest with word_bigram_trigram...
Evaluating SVM with word_bigram_trigram...
Evaluating XGBoost with word_bigram_trigram...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating Naive Bayes with word_bigram_trigram...
Evaluating MLP with word_bigram_trigram...




Evaluating Decision Tree with word_bigram_trigram...
Evaluating Random Forest with char_bigram_trigram...
Evaluating SVM with char_bigram_trigram...
Evaluating XGBoost with char_bigram_trigram...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating Naive Bayes with char_bigram_trigram...
Evaluating MLP with char_bigram_trigram...
Evaluating Decision Tree with char_bigram_trigram...
TF-IDF Results:
Feature: word_unigram, Model: Random Forest, Accuracy: 0.7083, Precision: 0.7775, Recall: 0.7083, F1-score: 0.7096
Feature: word_unigram, Model: SVM, Accuracy: 0.4875, Precision: 0.7560, Recall: 0.4875, F1-score: 0.5082
Feature: word_unigram, Model: XGBoost, Accuracy: 0.6042, Precision: 0.6174, Recall: 0.6042, F1-score: 0.5955
Feature: word_unigram, Model: Naive Bayes, Accuracy: 0.3958, Precision: 0.6283, Recall: 0.3958, F1-score: 0.3917
Feature: word_unigram, Model: MLP, Accuracy: 0.7750, Precision: 0.8100, Recall: 0.7750, F1-score: 0.7648
Feature: word_unigram, Model: Decision Tree, Accuracy: 0.3167, Precision: 0.3400, Recall: 0.3167, F1-score: 0.3148
Feature: word_bigram_trigram, Model: Random Forest, Accuracy: 0.3542, Precision: 0.5483, Recall: 0.3542, F1-score: 0.3736
Feature: word_bigram_trigram, Model: SVM, Accuracy: 0

In [6]:
# BERT embeddings
bert_model = SentenceTransformer("all-MiniLM-L6-v2")
X_train_bert = bert_model.encode(X_train.tolist(), show_progress_bar=True)
X_test_bert = bert_model.encode(X_test.tolist(), show_progress_bar=True)

bert_compatible_models = {
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "MLP": MLPClassifier(max_iter=300),
    "Decision Tree": DecisionTreeClassifier()
}

# TMP
results = []

for model_name, model in bert_compatible_models.items(): 
    print(f"Evaluating {model_name} with BERT embeddings...")
    model.fit(X_train_bert, y_train)
    y_pred = model.predict(X_test_bert)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    results.append(
        {
            "Feature": "BERT",
            "Model": model_name,
            "Accuracy": report["accuracy"],
            "Precision": report["weighted avg"]["precision"],
            "Recall": report["weighted avg"]["recall"],
            "F1-score": report["weighted avg"]["f1-score"],
        }
    )

print("BERT Results:")
for result in results:
    if result["Feature"] == "BERT":
        print(
            f"Feature: {result['Feature']}, Model: {result['Model']}, "
            f"Accuracy: {result['Accuracy']:.4f}, Precision: {result['Precision']:.4f}, "
            f"Recall: {result['Recall']:.4f}, F1-score: {result['F1-score']:.4f}"
        )
# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("results2.csv", index=False)
print("Results saved to results.csv")

Batches: 100%|██████████| 30/30 [00:08<00:00,  3.39it/s]
Batches: 100%|██████████| 8/8 [00:02<00:00,  3.62it/s]


Evaluating Random Forest with BERT embeddings...
Evaluating SVM with BERT embeddings...
Evaluating XGBoost with BERT embeddings...
Evaluating MLP with BERT embeddings...




Evaluating Decision Tree with BERT embeddings...
BERT Results:
Feature: BERT, Model: Random Forest, Accuracy: 0.1375, Precision: 0.1600, Recall: 0.1375, F1-score: 0.1409
Feature: BERT, Model: SVM, Accuracy: 0.1458, Precision: 0.1771, Recall: 0.1458, F1-score: 0.1284
Feature: BERT, Model: XGBoost, Accuracy: 0.1375, Precision: 0.1327, Recall: 0.1375, F1-score: 0.1308
Feature: BERT, Model: MLP, Accuracy: 0.1833, Precision: 0.1897, Recall: 0.1833, F1-score: 0.1801
Feature: BERT, Model: Decision Tree, Accuracy: 0.0542, Precision: 0.0535, Recall: 0.0542, F1-score: 0.0526
Results saved to results.csv
