In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# **Import and Labeled Data**

In [18]:
import os
import pandas as pd

data = []

#Eğitim verisi
dataset_path = '/content/drive/MyDrive/dataset/datasetauthor'


for author in os.listdir(dataset_path):
    author_path = os.path.join(dataset_path, author)
    if os.path.isdir(author_path):
        for file in os.listdir(author_path):
            file_path = os.path.join(author_path, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                data.append({'text': content, 'author': author})

df = pd.DataFrame(data)

In [19]:
random_data = df.sample(n=20)
random_data.head(20)

Unnamed: 0,text,author
556,"300 Darbeci!\n\nÜzgünüm, “eli silahlı olanlar ...",ECakir
394,Suriye Tuzağı Ve PKK'nın Başkanlık Yemini\n\nA...,AYArslan
80,25 Şehit Hatırası; Afyonkarahisar Halısı!\n\nM...,COzdemir
895,Stalin Mahkemesi!\n\nSAYIN Rıza Türmen değerli...,TAkyol
535,"Dersimiz Kur an, Konumuz Başörtüsü!\n\nHaydi b...",ECakir
365,Korucu İse İkinci Sınıf Şehit Mi Oluyor?\n\nHe...,AYArslan
87,Samet Söyle Yalan Mı?\n\nYarın basın toplantıs...,COzdemir
366,Aygün'ü Kaçırılmaya Karşı Uyaran Oldu Mu?\n\nC...,AYArslan
807,Trakya’nın Mutluluk Tarlaları\n\nNasıl oluyor ...,MTonbekici
655,"Bir Hafta Önce Mi, Bir Hafta Sonra Mı!\n\nBili...",GGokturk


# **Text Preprocessing**

Importing Library

In [20]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
import nltk as nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet') # Download wordnet, which is required by WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Data Cleaning

In [21]:
def clean_text(data):

  # convert letters to lower case
  data["text"]=data["text"].str.lower()

  # remove punctuation and '\n'
  data["text"]=data["text"].replace('[^\w\s]','',regex=True)
  data["text"]=data["text"].replace('\n',' ',regex=True)

  # remove numbers
  data["text"]=data["text"].replace('\d',' ',regex=True)

clean_text(df)
df["text"]

Unnamed: 0,text
0,müslümanlar islâm tarihi okumalı mı hukuk fak...
1,devlet aklı tecellî etti hocam hocam diyorlar...
2,niçin ille de ben başkan olmalıyım grup topla...
3,düğüncü düğün ve hüzün bu yazının merhûm islâ...
4,naz güzeli bugün ekim cumhuriyet bayramı g...
...,...
1195,barzani sizinle gurur duyuyor gene böyle bi e...
1196,achtung ların başı ekmek parası için alman...
1197,çiçekdağı neşet ertaş kırşehirliydi bi ara y...
1198,alex filan sene evvel iki osmanlı kadırg...


**Stopwords**

In [22]:
def remove_stopwords(data):
  sw = stopwords.words('english')
  data['text']=data['text'].apply(lambda x:" ".join(x for x in str(x).split()if x not in sw))

remove_stopwords(df)
df["text"]

Unnamed: 0,text
0,müslümanlar islâm tarihi okumalı mı hukuk fakü...
1,devlet aklı tecellî etti hocam hocam diyorlar ...
2,niçin ille de ben başkan olmalıyım grup toplan...
3,düğüncü düğün hüzün bu yazının merhûm islâm âl...
4,naz güzeli bugün ekim cumhuriyet bayramı günün...
...,...
1195,barzani sizinle gurur duyuyor gene böyle bi ek...
1196,achtung ların başı ekmek parası için almanyaya...
1197,çiçekdağı neşet ertaş kırşehirliydi bi ara yoz...
1198,alex filan sene evvel iki osmanlı kadırgası ak...


**`Lemmatization`**

In [23]:
df["text"]=df["text"].apply(lambda x : " ".join([WordNetLemmatizer().lemmatize(word) for word in x.split()]))
df["text"]

Unnamed: 0,text
0,müslümanlar islâm tarihi okumalı mı hukuk fakü...
1,devlet aklı tecellî etti hocam hocam diyorlar ...
2,niçin ille de ben başkan olmalıyım grup toplan...
3,düğüncü düğün hüzün bu yazının merhûm islâm âl...
4,naz güzeli bugün ekim cumhuriyet bayramı günün...
...,...
1195,barzani sizinle gurur duyuyor gene böyle bi ek...
1196,achtung ların başı ekmek parası için almanyaya...
1197,çiçekdağı neşet ertaş kırşehirliydi bi ara yoz...
1198,alex filan sene evvel iki osmanlı kadırgası ak...


# **Data Splitting**

In [49]:
from sklearn.model_selection import train_test_split

X = df['text']
y = df['author']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# **TF-IDF**

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

tfidf_uni = TfidfVectorizer(analyzer='word', ngram_range=(1,1), max_features=5000)
X_train_uni = tfidf_uni.fit_transform(X_train)
X_test_uni = tfidf_uni.transform(X_test)

model_uni = LogisticRegression(max_iter=1000)
model_uni.fit(X_train_uni, y_train)
pred_uni = model_uni.predict(X_test_uni)

print("Unigram TF-IDF")
print(classification_report(y_test, pred_uni))



Unigram TF-IDF
              precision    recall  f1-score   support

      AAltan       0.78      0.88      0.82         8
AAydintasbas       0.45      0.62      0.53         8
      AHakan       0.89      1.00      0.94         8
 ATuranAlkan       1.00      0.62      0.77         8
    AYArslan       0.73      1.00      0.84         8
     BCoskun       0.46      0.75      0.57         8
     CCandar       0.70      0.88      0.78         8
    COzdemir       1.00      0.62      0.77         8
  DCundioglu       0.78      0.88      0.82         8
  DUAribogan       0.88      0.88      0.88         8
      EArdic       1.00      1.00      1.00         8
      ECakir       0.62      0.62      0.62         8
    GGokturk       1.00      0.50      0.67         8
   HBabaoglu       0.71      0.62      0.67         8
      HCemal       0.88      0.88      0.88         8
       HUluc       1.00      1.00      1.00         8
  IKucukkaya       0.67      0.50      0.57         8
    MABirand

**TF-IDF and Decision Tree**


In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF
tfidf_uni = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), max_features=5000)
X_train_uni = tfidf_uni.fit_transform(X_train)
X_test_uni = tfidf_uni.transform(X_test)

# Decision Tree Model
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train_uni, y_train)
pred_dt = model_dt.predict(X_test_uni)

# Sonuç
print("Unigram TF-IDF with Decision Tree")
print(classification_report(y_test, pred_dt))


Unigram TF-IDF with Decision Tree
              precision    recall  f1-score   support

      AAltan       0.12      0.12      0.12         8
AAydintasbas       0.50      0.62      0.56         8
      AHakan       0.40      0.25      0.31         8
 ATuranAlkan       0.33      0.12      0.18         8
    AYArslan       0.50      0.50      0.50         8
     BCoskun       0.29      0.25      0.27         8
     CCandar       0.11      0.12      0.12         8
    COzdemir       0.44      0.50      0.47         8
  DCundioglu       0.62      0.62      0.62         8
  DUAribogan       0.17      0.12      0.14         8
      EArdic       0.00      0.00      0.00         8
      ECakir       0.33      0.25      0.29         8
    GGokturk       0.20      0.25      0.22         8
   HBabaoglu       0.30      0.38      0.33         8
      HCemal       0.14      0.12      0.13         8
       HUluc       0.50      0.75      0.60         8
  IKucukkaya       0.50      0.25      0.33    

**TF-IDF + Multi-Layer Perceptron (MLP)**

In [45]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF
tfidf_uni = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), max_features=5000)
X_train_uni = tfidf_uni.fit_transform(X_train)
X_test_uni = tfidf_uni.transform(X_test)

# MLP Model
model_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
model_mlp.fit(X_train_uni, y_train)
pred_mlp = model_mlp.predict(X_test_uni)

# Sonuç
print("Unigram TF-IDF with MLP Classifier")
print(classification_report(y_test, pred_mlp))


Unigram TF-IDF with MLP Classifier
              precision    recall  f1-score   support

      AAltan       0.89      1.00      0.94         8
AAydintasbas       0.71      0.62      0.67         8
      AHakan       0.89      1.00      0.94         8
 ATuranAlkan       0.67      0.75      0.71         8
    AYArslan       0.89      1.00      0.94         8
     BCoskun       0.70      0.88      0.78         8
     CCandar       0.80      1.00      0.89         8
    COzdemir       1.00      1.00      1.00         8
  DCundioglu       0.89      1.00      0.94         8
  DUAribogan       0.88      0.88      0.88         8
      EArdic       1.00      0.88      0.93         8
      ECakir       0.75      0.75      0.75         8
    GGokturk       0.67      0.50      0.57         8
   HBabaoglu       0.75      0.75      0.75         8
      HCemal       0.78      0.88      0.82         8
       HUluc       1.00      0.88      0.93         8
  IKucukkaya       0.86      0.75      0.80   

**TF-IDF + Random Forest (RF)**

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF
tfidf_uni = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), max_features=5000)
X_train_uni = tfidf_uni.fit_transform(X_train)
X_test_uni = tfidf_uni.transform(X_test)

# Random Forest Model
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train_uni, y_train)
pred_rf = model_rf.predict(X_test_uni)

# Sonuç
print("Unigram TF-IDF with Random Forest")
print(classification_report(y_test, pred_rf))


Unigram TF-IDF with Random Forest
              precision    recall  f1-score   support

      AAltan       0.67      0.75      0.71         8
AAydintasbas       0.75      0.75      0.75         8
      AHakan       0.73      1.00      0.84         8
 ATuranAlkan       0.67      0.50      0.57         8
    AYArslan       1.00      0.88      0.93         8
     BCoskun       0.38      0.75      0.50         8
     CCandar       1.00      0.88      0.93         8
    COzdemir       0.70      0.88      0.78         8
  DCundioglu       0.89      1.00      0.94         8
  DUAribogan       1.00      0.62      0.77         8
      EArdic       0.83      0.62      0.71         8
      ECakir       0.56      0.62      0.59         8
    GGokturk       0.71      0.62      0.67         8
   HBabaoglu       0.67      0.50      0.57         8
      HCemal       0.71      0.62      0.67         8
       HUluc       0.80      1.00      0.89         8
  IKucukkaya       1.00      0.50      0.67    

 TF-IDF + Naïve Bayes (MultinomialNB)

In [47]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF
tfidf_uni = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), max_features=5000)
X_train_uni = tfidf_uni.fit_transform(X_train)
X_test_uni = tfidf_uni.transform(X_test)

# Naive Bayes Model
model_nb = MultinomialNB()
model_nb.fit(X_train_uni, y_train)
pred_nb = model_nb.predict(X_test_uni)

# Sonuç
print("Unigram TF-IDF with Naïve Bayes")
print(classification_report(y_test, pred_nb))


Unigram TF-IDF with Naïve Bayes
              precision    recall  f1-score   support

      AAltan       0.64      0.88      0.74         8
AAydintasbas       0.33      0.62      0.43         8
      AHakan       0.88      0.88      0.88         8
 ATuranAlkan       0.80      0.50      0.62         8
    AYArslan       0.62      1.00      0.76         8
     BCoskun       0.67      0.25      0.36         8
     CCandar       0.88      0.88      0.88         8
    COzdemir       1.00      0.50      0.67         8
  DCundioglu       1.00      1.00      1.00         8
  DUAribogan       0.78      0.88      0.82         8
      EArdic       0.75      0.75      0.75         8
      ECakir       1.00      0.12      0.22         8
    GGokturk       0.60      0.38      0.46         8
   HBabaoglu       1.00      0.25      0.40         8
      HCemal       0.80      0.50      0.62         8
       HUluc       1.00      0.75      0.86         8
  IKucukkaya       0.67      0.50      0.57      

# **Word Based 2-gram and 3-gram**

**Word Based 2-gram and 3-gram LogicticRegression**

In [26]:
# 2-gram
tfidf_bi = TfidfVectorizer(analyzer='word', ngram_range=(2,2), max_features=5000)
X_train_bi = tfidf_bi.fit_transform(X_train)
X_test_bi = tfidf_bi.transform(X_test)

model_bi = LogisticRegression(max_iter=1000)
model_bi.fit(X_train_bi, y_train)
pred_bi = model_bi.predict(X_test_bi)

print("Bigram TF-IDF")
print(classification_report(y_test, pred_bi))

# 3-gram
tfidf_tri = TfidfVectorizer(analyzer='word', ngram_range=(3,3), max_features=5000)
X_train_tri = tfidf_tri.fit_transform(X_train)
X_test_tri = tfidf_tri.transform(X_test)

model_tri = LogisticRegression(max_iter=1000)
model_tri.fit(X_train_tri, y_train)
pred_tri = model_tri.predict(X_test_tri)

print("Trigram TF-IDF")
print(classification_report(y_test, pred_tri))


Bigram TF-IDF
              precision    recall  f1-score   support

      AAltan       0.86      0.75      0.80         8
AAydintasbas       0.60      0.75      0.67         8
      AHakan       0.71      0.62      0.67         8
 ATuranAlkan       0.80      0.50      0.62         8
    AYArslan       0.64      0.88      0.74         8
     BCoskun       0.36      0.50      0.42         8
     CCandar       0.78      0.88      0.82         8
    COzdemir       0.75      0.75      0.75         8
  DCundioglu       0.80      1.00      0.89         8
  DUAribogan       0.54      0.88      0.67         8
      EArdic       0.67      0.50      0.57         8
      ECakir       1.00      0.75      0.86         8
    GGokturk       0.71      0.62      0.67         8
   HBabaoglu       0.75      0.75      0.75         8
      HCemal       1.00      0.50      0.67         8
       HUluc       0.67      0.75      0.71         8
  IKucukkaya       0.80      0.50      0.62         8
    MABirand 

**Word Based 2-gram and 3-gram XGBoost**

In [43]:
!pip install -q xgboost
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram TF-IDF
tfidf_bi = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), max_features=5000)
X_train_bi = tfidf_bi.fit_transform(X_train)
X_test_bi = tfidf_bi.transform(X_test)

# Initialize LabelEncoder
le = LabelEncoder()

# Fit label encoder on training data and transform both train and test labels
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)


model_xgb_bi = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
model_xgb_bi.fit(X_train_bi, y_train_encoded) # Use encoded labels for training
pred_xgb_bi = model_xgb_bi.predict(X_test_bi)

print("Bigram TF-IDF with XGBoost")
print(classification_report(y_test_encoded, pred_xgb_bi)) # Use encoded labels for evaluation


# Trigram TF-IDF
tfidf_tri = TfidfVectorizer(analyzer='word', ngram_range=(3, 3), max_features=5000)
X_train_tri = tfidf_tri.fit_transform(X_train)
X_test_tri = tfidf_tri.transform(X_test)

model_xgb_tri = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
model_xgb_tri.fit(X_train_tri, y_train_encoded) # Use encoded labels for training
pred_xgb_tri = model_xgb_tri.predict(X_test_tri)

print("Trigram TF-IDF with XGBoost")
print(classification_report(y_test_encoded, pred_xgb_tri)) # Use encoded labels for evaluation

Parameters: { "use_label_encoder" } are not used.



Bigram TF-IDF with XGBoost
              precision    recall  f1-score   support

           0       0.40      0.50      0.44         8
           1       0.67      0.25      0.36         8
           2       0.33      0.12      0.18         8
           3       0.50      0.50      0.50         8
           4       0.67      0.50      0.57         8
           5       0.33      0.50      0.40         8
           6       0.60      0.38      0.46         8
           7       1.00      0.25      0.40         8
           8       0.86      0.75      0.80         8
           9       0.33      0.25      0.29         8
          10       0.20      0.12      0.15         8
          11       0.29      0.25      0.27         8
          12       0.38      0.38      0.38         8
          13       0.33      0.50      0.40         8
          14       0.00      0.00      0.00         8
          15       0.00      0.00      0.00         8
          16       0.20      0.12      0.15         8


Parameters: { "use_label_encoder" } are not used.



Trigram TF-IDF with XGBoost
              precision    recall  f1-score   support

           0       0.33      0.25      0.29         8
           1       0.44      0.50      0.47         8
           2       0.50      0.25      0.33         8
           3       0.25      0.12      0.17         8
           4       0.50      0.38      0.43         8
           5       0.50      0.12      0.20         8
           6       0.25      0.12      0.17         8
           7       0.00      0.00      0.00         8
           8       0.33      0.25      0.29         8
           9       0.38      0.38      0.38         8
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00         8
          12       0.00      0.00      0.00         8
          13       0.00      0.00      0.00         8
          14       0.00      0.00      0.00         8
          15       0.00      0.00      0.00         8
          16       0.00      0.00      0.00         8

**Word Based 2-gram and 3-gram Random Forest**

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram TF-IDF
tfidf_bi = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), max_features=5000)
X_train_bi = tfidf_bi.fit_transform(X_train)
X_test_bi = tfidf_bi.transform(X_test)

# Random Forest Model
model_bi_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_bi_rf.fit(X_train_bi, y_train)
pred_bi_rf = model_bi_rf.predict(X_test_bi)

print("Bigram TF-IDF with Random Forest")
print(classification_report(y_test, pred_bi_rf))

# Trigram TF-IDF
tfidf_tri = TfidfVectorizer(analyzer='word', ngram_range=(3, 3), max_features=5000)
X_train_tri = tfidf_tri.fit_transform(X_train)
X_test_tri = tfidf_tri.transform(X_test)

# Random Forest Model
model_tri_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_tri_rf.fit(X_train_tri, y_train)
pred_tri_rf = model_tri_rf.predict(X_test_tri)

print("Trigram TF-IDF with Random Forest")
print(classification_report(y_test, pred_tri_rf))

Bigram TF-IDF with Random Forest
              precision    recall  f1-score   support

      AAltan       0.57      0.50      0.53         8
AAydintasbas       0.80      0.50      0.62         8
      AHakan       0.67      0.50      0.57         8
 ATuranAlkan       0.36      0.50      0.42         8
    AYArslan       0.80      0.50      0.62         8
     BCoskun       0.40      0.25      0.31         8
     CCandar       0.62      0.62      0.62         8
    COzdemir       0.38      0.38      0.38         8
  DCundioglu       0.89      1.00      0.94         8
  DUAribogan       0.40      0.50      0.44         8
      EArdic       0.00      0.00      0.00         8
      ECakir       0.29      0.25      0.27         8
    GGokturk       0.60      0.38      0.46         8
   HBabaoglu       0.56      0.62      0.59         8
      HCemal       0.67      0.25      0.36         8
       HUluc       0.33      0.38      0.35         8
  IKucukkaya       1.00      0.25      0.40     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Character-based 2-gram and 3-gram**

**Character-based 2-gram and 3-gram LogicticRegression**

In [27]:
# char 2-gram
tfidf_char2 = TfidfVectorizer(analyzer='char', ngram_range=(2,2), max_features=5000)
X_train_c2 = tfidf_char2.fit_transform(X_train)
X_test_c2 = tfidf_char2.transform(X_test)

model_c2 = LogisticRegression(max_iter=1000)
model_c2.fit(X_train_c2, y_train)
pred_c2 = model_c2.predict(X_test_c2)

print("Character 2-gram")
print(classification_report(y_test, pred_c2))

# char 3-gram
tfidf_char3 = TfidfVectorizer(analyzer='char', ngram_range=(3,3), max_features=5000)
X_train_c3 = tfidf_char3.fit_transform(X_train)
X_test_c3 = tfidf_char3.transform(X_test)

model_c3 = LogisticRegression(max_iter=1000)
model_c3.fit(X_train_c3, y_train)
pred_c3 = model_c3.predict(X_test_c3)

print("Character 3-gram")
print(classification_report(y_test, pred_c3))


Character 2-gram
              precision    recall  f1-score   support

      AAltan       0.60      0.75      0.67         8
AAydintasbas       0.57      0.50      0.53         8
      AHakan       0.75      0.75      0.75         8
 ATuranAlkan       0.44      0.88      0.58         8
    AYArslan       0.64      0.88      0.74         8
     BCoskun       0.29      0.62      0.40         8
     CCandar       0.45      0.62      0.53         8
    COzdemir       0.75      0.38      0.50         8
  DCundioglu       0.89      1.00      0.94         8
  DUAribogan       0.56      0.62      0.59         8
      EArdic       0.62      0.62      0.62         8
      ECakir       0.00      0.00      0.00         8
    GGokturk       0.50      0.38      0.43         8
   HBabaoglu       0.29      0.25      0.27         8
      HCemal       1.00      0.38      0.55         8
       HUluc       0.86      0.75      0.80         8
  IKucukkaya       0.43      0.38      0.40         8
    MABira

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Character 3-gram
              precision    recall  f1-score   support

      AAltan       0.71      0.62      0.67         8
AAydintasbas       0.50      0.75      0.60         8
      AHakan       0.89      1.00      0.94         8
 ATuranAlkan       0.83      0.62      0.71         8
    AYArslan       0.58      0.88      0.70         8
     BCoskun       0.40      0.75      0.52         8
     CCandar       0.86      0.75      0.80         8
    COzdemir       1.00      0.38      0.55         8
  DCundioglu       0.88      0.88      0.88         8
  DUAribogan       0.60      0.75      0.67         8
      EArdic       0.89      1.00      0.94         8
      ECakir       1.00      0.12      0.22         8
    GGokturk       0.80      0.50      0.62         8
   HBabaoglu       0.86      0.75      0.80         8
      HCemal       1.00      0.50      0.67         8
       HUluc       0.89      1.00      0.94         8
  IKucukkaya       0.22      0.25      0.24         8
    MABira

Character-based 2-gram and 3-gram and Random Forest

In [38]:
# Random Forest modelini eğitme
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 2 Gram
model_c2_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_c2_rf.fit(X_train_c2, y_train)
pred_c2_rf = model_c2_rf.predict(X_test_c2)
print("Character 2-gram with Random Forest")
print(classification_report(y_test, pred_c2_rf))


# Gram
model_c3_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_c3_rf.fit(X_train_c3, y_train)
pred_c3_rf = model_c3_rf.predict(X_test_c3)

print("Character 3-gram with Random Forest")
print(classification_report(y_test, pred_c3_rf))

Character 2-gram with Random Forest
              precision    recall  f1-score   support

      AAltan       0.60      0.75      0.67         8
AAydintasbas       0.27      0.50      0.35         8
      AHakan       0.45      0.62      0.53         8
 ATuranAlkan       0.67      1.00      0.80         8
    AYArslan       0.36      0.50      0.42         8
     BCoskun       0.46      0.75      0.57         8
     CCandar       0.33      0.38      0.35         8
    COzdemir       0.44      0.50      0.47         8
  DCundioglu       0.44      0.50      0.47         8
  DUAribogan       0.43      0.38      0.40         8
      EArdic       0.67      0.50      0.57         8
      ECakir       0.40      0.25      0.31         8
    GGokturk       0.17      0.12      0.14         8
   HBabaoglu       0.33      0.12      0.18         8
      HCemal       0.25      0.12      0.17         8
       HUluc       0.36      0.50      0.42         8
  IKucukkaya       0.44      0.50      0.47  

# **BERT**

In [28]:
!pip install -q transformers
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
model = BertModel.from_pretrained('dbmdz/bert-base-turkish-cased')

def get_bert_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        emb = outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # [CLS] token
        embeddings.append(emb)
    return embeddings

X_train_bert = get_bert_embeddings(X_train.tolist())
X_test_bert = get_bert_embeddings(X_test.tolist())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

100%|██████████| 960/960 [27:47<00:00,  1.74s/it]
100%|██████████| 240/240 [06:56<00:00,  1.74s/it]


**SVM + BERT**

In [35]:
# SVM modelini eğitme
svm_model = SVC(kernel='linear')  # Linear kernel kullanıyoruz
svm_model.fit(X_train_bert, y_train)

# Test verisi üzerinde tahmin yapma
y_pred = svm_model.predict(X_test_bert)

# Model performansını değerlendirme
print("SVM Model Performance with BERT Embeddings")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))



SVM Model Performance with BERT Embeddings
Accuracy: 0.5
Classification Report:
              precision    recall  f1-score   support

      AAltan       0.50      0.50      0.50         8
AAydintasbas       0.56      0.62      0.59         8
      AHakan       0.45      0.62      0.53         8
 ATuranAlkan       0.57      0.50      0.53         8
    AYArslan       0.57      0.50      0.53         8
     BCoskun       0.78      0.88      0.82         8
     CCandar       0.50      0.25      0.33         8
    COzdemir       0.14      0.25      0.18         8
  DCundioglu       0.55      0.75      0.63         8
  DUAribogan       0.25      0.25      0.25         8
      EArdic       0.67      0.50      0.57         8
      ECakir       0.20      0.25      0.22         8
    GGokturk       0.43      0.38      0.40         8
   HBabaoglu       0.73      1.00      0.84         8
      HCemal       0.50      0.50      0.50         8
       HUluc       0.71      0.62      0.67         8
 