# Model and Settings Comparison

## Comparison of Classification Models and Different Settings

In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Turkish Stop Words
trstop = [
    'a', 'acaba', 'altı', 'altmış', 'ama', 'ancak', 'arada', 'artık', 'asla', 'aslında', 'aslında', 'ayrıca', 'az', 'bana',
    'bazen', 'bazı', 'bazıları', 'belki', 'ben', 'benden', 'beni', 'benim', 'beri', 'beş', 'bile', 'bilhassa', 'bin', 'bir',
    'biraz', 'birçoğu', 'birçok', 'biri', 'birisi', 'birkaç', 'birşey', 'biz', 'bizden', 'bize', 'bizi', 'bizim', 'böyle',
    'böylece', 'bu', 'buna', 'bunda', 'bundan', 'bunlar', 'bunları', 'bunların', 'bunu', 'bunun', 'burada', 'bütün', 'çoğu',
    'çoğunu', 'çok', 'çünkü', 'da', 'daha', 'dahi', 'dan', 'de', 'defa', 'değil', 'diğer', 'diğeri', 'diğerleri', 'diye',
    'doksan', 'dokuz', 'dolayı', 'dolayısıyla', 'dört', 'e', 'edecek', 'eden', 'ederek', 'edilecek', 'ediliyor', 'edilmesi',
    'ediyor', 'eğer', 'elbette', 'elli', 'en', 'etmesi', 'etti', 'ettiği', 'ettiğini', 'fakat', 'falan', 'filan', 'gene',
    'gereği', 'gerek', 'gibi', 'göre', 'hala', 'halde', 'halen', 'hangi', 'hangisi', 'hani', 'hatta', 'hem', 'henüz', 'hep',
    'hepsi', 'her', 'herhangi', 'herkes', 'herkese', 'herkesi', 'herkesin', 'hiç', 'hiçbir', 'hiçbiri', 'i', 'ı', 'için',
    'içinde', 'iki', 'ile', 'ilgili', 'ise', 'işte', 'itibaren', 'itibariyle', 'kaç', 'kadar', 'karşın', 'kendi', 'kendilerine',
    'kendine', 'kendini', 'kendisi', 'kendisine', 'kendisini', 'kez', 'ki', 'kim', 'kime', 'kimi', 'kimin', 'kimisi', 'kimse',
    'kırk', 'madem', 'mi', 'mı', 'milyar', 'milyon', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nedenle', 'nerde', 'nerede', 'nereye',
    'neyse', 'niçin', 'nin', 'nın', 'niye', 'nun', 'nün', 'o', 'öbür', 'olan', 'olarak', 'oldu', 'olduğu', 'olduğunu',
    'olduklarını', 'olmadı', 'olmadığı', 'olmak', 'olması', 'olmayan', 'olmaz', 'olsa', 'olsun', 'olup', 'olur', 'olur',
    'olursa', 'oluyor', 'on', 'ön', 'ona', 'önce', 'ondan', 'onlar', 'onlara', 'onlardan', 'onları', 'onların', 'onu', 'onun',
    'orada', 'öte', 'ötürü', 'otuz', 'öyle', 'oysa', 'pek', 'rağmen', 'sana', 'sanki', 'sanki', 'şayet', 'şekilde', 'sekiz',
    'seksen', 'sen', 'senden', 'seni', 'senin', 'şey', 'şeyden', 'şeye', 'şeyi', 'şeyler', 'şimdi', 'siz', 'siz', 'sizden',
    'sizden', 'size', 'sizi', 'sizi', 'sizin', 'sizin', 'sonra', 'şöyle', 'şu', 'şuna', 'şunları', 'şunu', 'ta', 'tabii',
    'tam', 'tamam', 'tamamen', 'tarafından', 'trilyon', 'tüm', 'tümü', 'u', 'ü', 'üç', 'un', 'ün', 'üzere', 'var', 'vardı',
    've', 'veya', 'ya', 'yani', 'yapacak', 'yapılan', 'yapılması', 'yapıyor', 'yapmak', 'yaptı', 'yaptığı', 'yaptığını',
    'yaptıkları', 'ye', 'yedi', 'yerine', 'yetmiş', 'yi', 'yı', 'yine', 'yirmi', 'yoksa', 'yu', 'yüz', 'zaten', 'zira'
] # https://github.com/ahmetax/trstop/blob/master/dosyalar/turkce-stop-words

nltk_trstop = [
    'acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha',
    'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim',
    'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu',
    'tüm', 've', 'veya', 'ya', 'yani'
] # https://github.com/xiamx/node-nltk-stopwords/blob/master/data/stopwords/turkish

add_stop = [
    'a', 'b', 'c', 'ç', 'd', 'e', 'f', 'g', 'ğ', 'h', 'ı', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'ö', 'p', 'r', 's', 'ş', 't',
    'u', 'ü', 'v', 'y', 'z', 'li', 'lı', 'si', 'sı', 'te', 'ta', 'ın', 'in', 'na', 'ne', 'ler', 'lar', 'de', 'da', 'nın', 'nin',
    'lık', 'ım', 'im', 'yok', 'di', 'dı'
]

stop_words = sorted(list(set(trstop).union(nltk_trstop).union(add_stop)))

In [3]:
# Read preprocessed data
X_preprocessed = pd.read_csv("data/X_preprocessed.csv")["full_text"]
X_preprocessed = X_preprocessed.fillna("")
X_preprocessed_lemmatized = pd.read_csv("data/X_preprocessed_lemmatized.csv")["0"]
X_preprocessed_lemmatized = X_preprocessed_lemmatized.fillna("")
y_all = pd.read_csv("data/y_all.csv")["label"]
y_final = pd.read_csv("data/y_final.csv")["label"]

In [4]:
def fit_predict_score(feature_extraction_method, stop_words, ngram_range, lemmatization, annotator, classifier):
    """
    Fit your training data to the model of your choice, predict for test data and get F1 score.
    Available settings and classifiers:
        - feature_extraction_method: "bow" (Bag-of-Words) & "tfidf" (TF-IDF)
        - stop_words: Stop words of your choice as a list
        - ngram_range: The lower and upper boundary of the range of n-values for different word n-grams. e.g., (1, 1), (2, 3),...
        - lemmatization: True (use lemmatized X) & False (use not lemmatized X)
        - annotator: "final" (use labels in "final_annotator" data set) & "all" (use labels in "all_annotators" data set)
        - classifier: "logistic regression", "multinomial naive bayes", "support vector machine", "random forest", "knn"
    """
    
    # Feature extraction method
    if feature_extraction_method == "bow":
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    if feature_extraction_method == "tfidf":
        vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    
    # Lemmatization
    if lemmatization:
        X = vectorizer.fit_transform(X_preprocessed_lemmatized)
    else:
        X = vectorizer.fit_transform(X_preprocessed)
    
    # Annotator
    if annotator == "final":
        y = y_final
    if annotator == "all":
        y = y_all
    
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=530)
    
    # Classifier
    if classifier == "logistic regression":
        model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    if classifier == "multinomial naive bayes":
        model = MultinomialNB().fit(X_train, y_train)
    if classifier == "support vector machine":
        model = SVC().fit(X_train, y_train)
    if classifier == "random forest":
        model = RandomForestClassifier().fit(X_train, y_train)
    if classifier == "knn":
        model = KNeighborsClassifier().fit(X_train, y_train)
    
        
    y_pred = model.predict(X_test)
    
    return f1_score(y_test, y_pred, average="weighted")

In [5]:
feature_extraction_methods = ["bow", "tfidf"]
ngram_ranges = [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]
lemmatizations = [True, False]
annotators = ["final", "all"]
classifiers = ["logistic regression", "multinomial naive bayes", "support vector machine", "random forest", "knn"]

num_combinations = np.prod([len(item) for item in [feature_extraction_methods, ngram_ranges, lemmatizations, annotators, classifiers]])

In [6]:
scores_df = pd.DataFrame(columns = ["feature_extraction_method", "ngram_range", "lemmatization", "annotator", "model", "f1_score"])

i = 0

start_time = time.time()
for feature_extraction_method in feature_extraction_methods:
    for ngram_range in ngram_ranges:
        for lemmatization in lemmatizations:
            for annotator in annotators:
                for classifier in classifiers:
                    print(f"{i+1}/{num_combinations}- {feature_extraction_method} | {ngram_range} | {lemmatization} | {annotator} | {classifier} | F-1 Score: {fit_predict_score(feature_extraction_method, stop_words, ngram_range, lemmatization, annotator, classifier):.4f}")
                    scores_df = scores_df.append(pd.Series([feature_extraction_method,
                                                            ngram_range,
                                                            lemmatization,
                                                            annotator,
                                                            classifier,
                                                            fit_predict_score(feature_extraction_method,
                                                                              stop_words,
                                                                              ngram_range,
                                                                              lemmatization,
                                                                              annotator,
                                                                              classifier)],
                                                           index=scores_df.columns),
                                                 ignore_index=True)
                    i = i + 1

print(f"\nComparison of classification models and different settings took {(time.time() - start_time)/60:.2f} minutes.")

1/240- bow | (1, 1) | True | final | logistic regression | F-1 Score: 0.8784
2/240- bow | (1, 1) | True | final | multinomial naive bayes | F-1 Score: 0.8429
3/240- bow | (1, 1) | True | final | support vector machine | F-1 Score: 0.8568
4/240- bow | (1, 1) | True | final | random forest | F-1 Score: 0.8704
5/240- bow | (1, 1) | True | final | knn | F-1 Score: 0.8451
6/240- bow | (1, 1) | True | all | logistic regression | F-1 Score: 0.8677
7/240- bow | (1, 1) | True | all | multinomial naive bayes | F-1 Score: 0.8314
8/240- bow | (1, 1) | True | all | support vector machine | F-1 Score: 0.8445
9/240- bow | (1, 1) | True | all | random forest | F-1 Score: 0.8568
10/240- bow | (1, 1) | True | all | knn | F-1 Score: 0.8316
11/240- bow | (1, 1) | False | final | logistic regression | F-1 Score: 0.8694
12/240- bow | (1, 1) | False | final | multinomial naive bayes | F-1 Score: 0.8456
13/240- bow | (1, 1) | False | final | support vector machine | F-1 Score: 0.8520
14/240- bow | (1, 1) | Fa

112/240- bow | (3, 3) | False | final | multinomial naive bayes | F-1 Score: 0.5445
113/240- bow | (3, 3) | False | final | support vector machine | F-1 Score: 0.8364
114/240- bow | (3, 3) | False | final | random forest | F-1 Score: 0.8428
115/240- bow | (3, 3) | False | final | knn | F-1 Score: 0.8367
116/240- bow | (3, 3) | False | all | logistic regression | F-1 Score: 0.8247
117/240- bow | (3, 3) | False | all | multinomial naive bayes | F-1 Score: 0.5412
118/240- bow | (3, 3) | False | all | support vector machine | F-1 Score: 0.8224
119/240- bow | (3, 3) | False | all | random forest | F-1 Score: 0.8290
120/240- bow | (3, 3) | False | all | knn | F-1 Score: 0.8227
121/240- tfidf | (1, 1) | True | final | logistic regression | F-1 Score: 0.8613
122/240- tfidf | (1, 1) | True | final | multinomial naive bayes | F-1 Score: 0.8364
123/240- tfidf | (1, 1) | True | final | support vector machine | F-1 Score: 0.8565
124/240- tfidf | (1, 1) | True | final | random forest | F-1 Score: 0.

218/240- tfidf | (2, 3) | False | all | support vector machine | F-1 Score: 0.8247
219/240- tfidf | (2, 3) | False | all | random forest | F-1 Score: 0.8311
220/240- tfidf | (2, 3) | False | all | knn | F-1 Score: 0.8233
221/240- tfidf | (3, 3) | True | final | logistic regression | F-1 Score: 0.8373
222/240- tfidf | (3, 3) | True | final | multinomial naive bayes | F-1 Score: 0.8364
223/240- tfidf | (3, 3) | True | final | support vector machine | F-1 Score: 0.8379
224/240- tfidf | (3, 3) | True | final | random forest | F-1 Score: 0.8433
225/240- tfidf | (3, 3) | True | final | knn | F-1 Score: 0.8373
226/240- tfidf | (3, 3) | True | all | logistic regression | F-1 Score: 0.8233
227/240- tfidf | (3, 3) | True | all | multinomial naive bayes | F-1 Score: 0.8224
228/240- tfidf | (3, 3) | True | all | support vector machine | F-1 Score: 0.8239
229/240- tfidf | (3, 3) | True | all | random forest | F-1 Score: 0.8293
230/240- tfidf | (3, 3) | True | all | knn | F-1 Score: 0.8233
231/240- 

In [7]:
scores_df

Unnamed: 0,feature_extraction_method,ngram_range,lemmatization,annotator,model,f1_score
0,bow,"(1, 1)",True,final,logistic regression,0.878366
1,bow,"(1, 1)",True,final,multinomial naive bayes,0.842941
2,bow,"(1, 1)",True,final,support vector machine,0.856846
3,bow,"(1, 1)",True,final,random forest,0.870077
4,bow,"(1, 1)",True,final,knn,0.845095
...,...,...,...,...,...,...
235,tfidf,"(3, 3)",False,all,logistic regression,0.823278
236,tfidf,"(3, 3)",False,all,multinomial naive bayes,0.822402
237,tfidf,"(3, 3)",False,all,support vector machine,0.823899
238,tfidf,"(3, 3)",False,all,random forest,0.828008


In [8]:
scores_df.sort_values("f1_score")

Unnamed: 0,feature_extraction_method,ngram_range,lemmatization,annotator,model,f1_score
96,bow,"(2, 3)",False,all,multinomial naive bayes,0.337530
91,bow,"(2, 3)",False,final,multinomial naive bayes,0.338942
86,bow,"(2, 3)",True,all,multinomial naive bayes,0.422793
81,bow,"(2, 3)",True,final,multinomial naive bayes,0.425245
116,bow,"(3, 3)",False,all,multinomial naive bayes,0.541242
...,...,...,...,...,...,...
10,bow,"(1, 1)",False,final,logistic regression,0.869353
3,bow,"(1, 1)",True,final,random forest,0.870077
40,bow,"(1, 3)",True,final,logistic regression,0.870230
20,bow,"(1, 2)",True,final,logistic regression,0.874668


In [9]:
scores_df.to_csv("data/scores_df.csv", index=False)