In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
import textacy
import textacy.preprocessing as tprep
import random, os
import numpy as np
import torch
import os
os.environ["KERAS_BACKEND"] = "torch"
import keras
from collections import Counter
import keras_tuner
import pandas as pd
from tqdm import tqdm
from keras import layers
from itertools import chain

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def seed_everything(seed: int):
    """Seed the random number generators used in this notebook.

    Sets ``PYTHONHASHSEED`` in the environment and seeds Python's ``random``,
    NumPy's global generator, and torch (default and CUDA generators).

    Args:
        seed: the integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)


# Fix all seeds up front so that repeated runs give the same results
seed_everything(42)

## Main Part

In [3]:
# Load the complete dataset from its tab-separated file
file_path = "dataset.tsv"
df = pd.read_csv(
    file_path,
    delimiter='\t',
)

In [4]:
# Accent-stripped Turkish stop words, so they match the accent-stripped tokens produced below.
# The corpus is read through `nltk.corpus` rather than through the imported `stopwords` name:
# this statement rebinds `stopwords` to a plain list, so going through that name made the
# cell fail (a list has no `.words`) as soon as it was executed a second time.
stopwords = [tprep.remove.accents(word) for word in nltk.corpus.stopwords.words("turkish")]

# Tokenize the Turkish text so that it is ready to be used as features
def tokenize(text):
    """Normalise a comment and split it into stop-word-free tokens.

    Steps, in order: lowercase, strip ASCII punctuation (``string.punctuation``),
    normalise unicode, strip accents, word-tokenize with the Turkish NLTK model,
    and drop every token found in the module-level ``stopwords`` list.

    Args:
        text: the raw comment; it must support ``str`` methods such as ``lower``.

    Returns:
        The list of remaining tokens, in their original order.
    """
    text = text.lower()                                                           # lowercase the text
    text = text.translate(str.maketrans('', '', string.punctuation))              # remove punctuation
    text = tprep.normalize.unicode(text)                                          # normalize unicode
    text = tprep.remove.accents(text)                                             # remove accents
    words = word_tokenize(text, language='turkish')                               # split the text into words
    return [word for word in words if word not in stopwords]                      # remove unnecessary noise words

In [5]:
# Minimal word-level tokenizer that converts raw comments into integer ids for the Bi-LSTM model

class Tokenizer:
    """Word-level tokenizer backed by a frequency-ranked vocabulary.

    The ``max_features`` most frequent tokens of ``counter`` receive the ids
    ``1..N`` in order of decreasing frequency; id ``0`` is reserved for padding.
    """

    def __init__(self, counter: Counter, max_features: int, max_length: int = 50):
        """Build the token <-> id lookup tables.

        Args:
            counter: token frequencies of the corpus.
            max_features: maximum number of tokens kept in the vocabulary.
            max_length: stored on the instance; no method of this class reads it.
        """
        # the word count for the input text
        self.counter = counter
        self.max_features = max_features
        self.max_length = max_length

        # only use the most common features, others are most probably noise (excluding stopwords)
        most_common_tokens = [token for token, _count in counter.most_common(max_features)]
        self.pad_id = 0
        # ids start at 1 because 0 is the padding id
        self.w2i = {token: index for index, token in enumerate(most_common_tokens, start=1)}
        self.i2w = {index: token for token, index in self.w2i.items()}

    def tokenize(self, text):
        """Split ``text`` into normalised, stop-word-free tokens.

        Delegates to the module-level ``tokenize`` function. That same function
        builds the vocabulary counter, so counting and encoding always share one
        preprocessing pipeline instead of two copies that could drift apart.
        """
        return tokenize(text)

    def __call__(self, text):
        """Encode ``text`` as a list of vocabulary ids.

        Tokens that are not in the vocabulary are dropped, so the result may be
        shorter than the token list (or empty).
        """
        return [self.w2i[token] for token in self.tokenize(text) if token in self.w2i]

    def decode(self, ids):
        """Turn a sequence of ids back into a space-joined string.

        Padding ids are skipped. Any other id missing from the vocabulary raises
        ``KeyError``.
        """
        return " ".join(self.i2w[token_id] for token_id in ids if token_id != self.pad_id)

In [6]:
# Count how often each preprocessed token occurs across all comments
tokens = Counter()
for comment in df["Comment"].tolist():
    tokens.update(tokenize(comment))

In [7]:
# Vocabulary budget: the embedding layer is created with max_features rows, and the tokenizer
# is built with max_features - 1 words because id 0 is used as the padding id
max_features = 5000
# Length passed to pad_sequences as maxlen for every input sequence
maxlen = 50
# Number of target classes (the scores 1-5, mapped to 0-4)
num_classes = 5

In [8]:
# Build the tokenizer; one vocabulary slot is held back because id 0 is the padding id
tokenizer = Tokenizer(
    counter=tokens,
    max_features=max_features - 1,
    max_length=maxlen,
)

In [9]:
# Encode every comment as a list of vocabulary ids

X = [tokenizer(comment) for comment in tqdm(df["Comment"].tolist())]

100%|██████████████████████████████████| 40450/40450 [00:02<00:00, 16288.74it/s]


In [10]:
# Shift the 1-5 scores down to 0-4 so that they can be used as zero-based class indices
mapping = {score: score - 1 for score in range(1, 6)}

In [11]:
# Build the label vector by looking every score up in the mapping
y = df["Score"].apply(mapping.__getitem__).to_numpy()

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples as the validation split, using a fixed random state
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Bring the training and validation sequences to a common length of `maxlen`
x_train, x_val = (
    keras.utils.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (x_train, x_val)
)

In [14]:
# One-hot encode the integer class labels of both splits
y_train, y_val = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_val)
)

In [15]:
# Create the Bi-LSTM model
# The model is created with Keras Tuner to find the best hyperparameters for the model

def build_model(hp):
    """Build and compile a two-layer bidirectional LSTM classifier for one tuner trial.

    Tuned hyperparameters (all via ``hp.Choice``): ``embedding_dim`` and
    ``hidden_dim`` (64 or 128), ``learning_rate`` (1e-3 or 3e-4) and
    ``weight_decay`` (5e-5, 1e-5 or 5e-6).

    Args:
        hp: the hyperparameter container handed in by the tuner.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    # Metrics to be used in the model. All three are passed as instances; the accuracy
    # metric used to be passed as the bare class, unlike precision and recall.
    accuracy = keras.metrics.CategoricalAccuracy()
    p = keras.metrics.Precision()
    r = keras.metrics.Recall()

    # Hyperparameters to be tuned are the embedding dimension, hidden dimension, learning rate and weight decay
    model = keras.Sequential()
    # NOTE(review): padding id 0 is not masked here (mask_zero is left at its default) — confirm this is intended
    model.add(layers.Embedding(max_features, hp.Choice('embedding_dim', [64, 128])))
    lstm_dim = hp.Choice('hidden_dim', [64, 128])
    # The first recurrent layer returns the full sequence so that the second one can consume it
    model.add(layers.Bidirectional(layers.LSTM(lstm_dim, return_sequences=True)))
    model.add(layers.Bidirectional(layers.LSTM(lstm_dim)))
    model.add(layers.Dense(num_classes, activation="softmax"))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 3e-4])
    hp_weight_decay = hp.Choice('weight_decay', values=[5e-5, 1e-5, 5e-6])
    optimizer = keras.optimizers.AdamW(
        learning_rate=hp_learning_rate,
        weight_decay=hp_weight_decay,
    )
    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=[accuracy, p, r],
    )
    return model

In [16]:
# Create the tuner
# The seed makes the randomly sampled trials repeatable regardless of what ran before this cell.
# The storage location is spelled out (these are the values the tuner used implicitly before):
# with overwrite=False an existing project in that directory is reloaded instead of being
# searched again. Delete ./untitled_project or set overwrite=True to force a fresh search.
tuner = keras_tuner.RandomSearch(
    build_model,
    objective='val_categorical_accuracy',
    max_trials=15,
    seed=42,
    directory=".",
    project_name="untitled_project",
    overwrite=False,
)

Reloading Tuner from ./untitled_project/tuner0.json


In [17]:
# Perform the search: every trial trains for 2 epochs with batch size 32, and 10% of the
# training data is held out (validation_split) to compute the validation objective
tuner.search(x_train, y_train, epochs=2, validation_split=0.1, batch_size=32)

In [18]:
# Get the best model: take the first entry of the list returned by the tuner
best_model = tuner.get_best_models()[0]

  trackable.load_own_variables(weights_store.get(inner_path))


In [19]:
# Evaluate the best model on the held-out validation split.
# The displayed list holds the loss followed by the compiled metrics
# (categorical accuracy, precision, recall).
best_model.evaluate(x_val, y_val)

[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 59ms/step - categorical_accuracy: 0.8271 - loss: 0.5339 - precision: 0.8599 - recall: 0.8035


[0.5304042100906372,
 0.8253399133682251,
 0.8596793413162231,
 0.8019777536392212]

In [20]:
# Reduce the predicted class scores and the one-hot labels to class indices for further analysis

predictions = best_model.predict(x_val)
preds, golds = (scores.argmax(axis=-1) for scores in (predictions, y_val))

[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 58ms/step


In [22]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate the support-weighted precision, recall, and F1 score.
# zero_division=0 scores a class that is never predicted as 0 (the value the default setting
# falls back to as well) without raising an UndefinedMetricWarning for each such class.
report = classification_report(golds, preds, output_dict=True, zero_division=0)
weighted_avg = report['weighted avg']
precision = weighted_avg['precision']
recall = weighted_avg['recall']
f1 = weighted_avg['f1-score']

# Display the classification report and confusion matrix
print("\nClassification Report:\n", classification_report(golds, preds, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(golds, preds))
print("\nPrecision:", f"{(precision * 100):.2f}")
print("Recall:", f"{(recall * 100):.2f}")
print("F1 Score:", f"{(f1 * 100):.2f}")
print("Accuracy:", f"{(accuracy_score(golds, preds) * 100):.2f}")


Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.81      0.69       381
           1       0.00      0.00      0.00       113
           2       0.33      0.03      0.05       272
           3       0.45      0.07      0.13       937
           4       0.85      0.99      0.91      6387

    accuracy                           0.83      8090
   macro avg       0.45      0.38      0.36      8090
weighted avg       0.76      0.83      0.77      8090


Confusion Matrix:
 [[ 307    0    4    2   68]
 [  67    0    3    3   40]
 [  55    3    8   43  163]
 [  29    3    4   69  832]
 [  53    1    5   35 6293]]

Precision: 76.38
Recall: 82.53
F1 Score: 76.98
Accuracy: 82.53


In [23]:
# Save the predictions and the gold labels for the statistical significance test
# (adding 1 undoes the 1-5 -> 0-4 score mapping)
for out_path, class_ids in (("bilstm_preds.npy", preds), ("bilstm_golds.npy", golds)):
    np.save(out_path, class_ids + 1)

## OOD Part

In [24]:
import pandas as pd

# Load the out-of-distribution (OOD) dataset; it is read without a header row,
# so the column names are supplied explicitly
file_path = "OsmanBaturInce_ood_dataset.tsv"
ood_columns = ["Comment", "Score", "Link", "Brand", "Type"]
df = pd.read_csv(file_path, sep='\t', header=None, names=ood_columns)

In [25]:
from tqdm import tqdm
# Encode every OOD comment as a list of vocabulary ids (same tokenizer as for the main dataset)

X = [tokenizer(comment) for comment in tqdm(df["Comment"].tolist())]

100%|████████████████████████████████████| 1182/1182 [00:00<00:00, 15452.24it/s]


In [26]:
# Map the OOD scores to class indices, then one-hot encode them
y = keras.utils.to_categorical(
    df["Score"].apply(mapping.__getitem__).to_numpy(),
    num_classes,
)

In [27]:
# Pad the OOD sequences to the same length (`maxlen`) that was used for the training data
X = keras.utils.pad_sequences(X, maxlen=maxlen)

In [28]:
# Evaluate the best model on the OOD dataset.
# The displayed list holds the loss followed by the compiled metrics
# (categorical accuracy, precision, recall).
best_model.evaluate(X, y)

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - categorical_accuracy: 0.8451 - loss: 0.4891 - precision: 0.8724 - recall: 0.8267


[0.49778124690055847,
 0.8443316221237183,
 0.8727436661720276,
 0.8181049227714539]

In [29]:
# Get the model outputs for the OOD inputs for further analysis
predictions = best_model.predict(X)

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step


In [30]:
# Predicted class index = position of the highest score along the last axis
preds = predictions.argmax(axis=-1)

In [31]:
# Recover the gold class indices from the one-hot encoded labels
golds = y.argmax(axis=-1)

In [32]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate the support-weighted precision, recall, and F1 score.
# Some classes are never predicted on the OOD data, which made every report call emit
# UndefinedMetricWarning. zero_division=0 scores those classes as 0 (the value the default
# setting falls back to as well) without the warnings.
report = classification_report(golds, preds, output_dict=True, zero_division=0)
weighted_avg = report['weighted avg']
precision = weighted_avg['precision']
recall = weighted_avg['recall']
f1 = weighted_avg['f1-score']

# Display the classification report and confusion matrix
print("\nClassification Report:\n", classification_report(golds, preds, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(golds, preds))
print("\nPrecision:", f"{(precision * 100):.2f}")
print("Recall:", f"{(recall * 100):.2f}")
print("F1 Score:", f"{(f1 * 100):.2f}")
print("Accuracy:", f"{(accuracy_score(golds, preds) * 100):.2f}")


Classification Report:
               precision    recall  f1-score   support

           0       0.22      0.64      0.33        11
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        36
           3       0.20      0.02      0.03       117
           4       0.87      0.98      0.92      1012

    accuracy                           0.84      1182
   macro avg       0.26      0.33      0.26      1182
weighted avg       0.76      0.84      0.79      1182


Confusion Matrix:
 [[  7   0   0   0   4]
 [  1   0   0   0   5]
 [  3   0   0   3  30]
 [  3   0   0   2 112]
 [ 18   0   0   5 989]]

Precision: 76.46
Recall: 84.43
F1 Score: 79.31
Accuracy: 84.43


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# Save the OOD predictions for the statistical significance test
# (adding 1 undoes the 1-5 -> 0-4 score mapping)
np.save("bilstm_ood_preds.npy", preds + 1)

In [34]:
# Save the OOD gold labels for the statistical significance test
# (adding 1 undoes the 1-5 -> 0-4 score mapping)
np.save("bilstm_ood_golds.npy", golds + 1)

## Statistical Significance Tests

In [35]:
from statsmodels.stats.contingency_tables import mcnemar
from pprint import pprint

# McNemar's Test for statistical significance
# The test is performed on the predictions of every pair of models
def mcnemar_test(model1_preds, model2_preds, model1_golds, model2_golds):
    """Run McNemar's test on the paired predictions of two models and print the outcome.

    The 2x2 table counts, per example, whether each model was right (index 0) or
    wrong (index 1): rows refer to model 1, columns to model 2. The exact test is
    used when fewer than 50 examples are classified differently by the two models.

    Args:
        model1_preds: predicted labels of the first model.
        model2_preds: predicted labels of the second model, in the same example order.
        model1_golds: gold labels belonging to ``model1_preds``.
        model2_golds: gold labels belonging to ``model2_preds``; must equal ``model1_golds``.

    Returns:
        The result object produced by ``mcnemar``.

    Raises:
        ValueError: if the inputs differ in length or the two gold label sequences
            disagree, since the predictions would then not be paired on the same examples.
    """
    n_examples = len(model1_golds)
    if not (len(model1_preds) == len(model2_preds) == len(model2_golds) == n_examples):
        # zip() would silently truncate to the shortest input and hide the mismatch
        raise ValueError("predictions and gold labels of both models must have the same length")
    if any(gold1 != gold2 for gold1, gold2 in zip(model1_golds, model2_golds)):
        raise ValueError("both models must be evaluated on the same gold labels, in the same order")

    table = [[0, 0], [0, 0]]
    for pred1, pred2, gold in zip(model1_preds, model2_preds, model1_golds):
        # int(False) == 0 -> correct, int(True) == 1 -> wrong
        table[int(pred1 != gold)][int(pred2 != gold)] += 1

    # Only the discordant cells (exactly one model correct) decide which variant is used
    exact = (table[0][1] + table[1][0]) < 50

    result = mcnemar(table, exact=exact)
    print("McNemar's Test Statistic:", result.statistic)
    print("P-value:", result.pvalue)
    return result

In [37]:
from itertools import combinations

# Saved OOD predictions and gold labels of each model
rf_vals = dict(preds=np.load("rf_ood_preds.npy"), golds=np.load("rf_ood_golds.npy"))
bilstm_vals = dict(preds=np.load("bilstm_ood_preds.npy"), golds=np.load("bilstm_ood_golds.npy"))
nb_vals = dict(preds=np.load("nb_ood_preds.npy"), golds=np.load("nb_ood_golds.npy"))

vals = [rf_vals, bilstm_vals, nb_vals]
names = ["Random Forest", "Bi-LSTM", "Naive Bayes"]

# Run McNemar's test on the OOD predictions of every pair of models
for (model1_vals, model1_name), (model2_vals, model2_name) in combinations(zip(vals, names), 2):
    print("OOD", model1_name, "-", model2_name)
    mcnemar_test(
        model1_vals["preds"],
        model2_vals["preds"],
        model1_vals["golds"],
        model2_vals["golds"],
    )
    print()

OOD Random Forest - Bi-LSTM
McNemar's Test Statistic: 6.0
P-value: 0.009355306625366211

OOD Random Forest - Naive Bayes
McNemar's Test Statistic: 16.0
P-value: 0.735878800856881

OOD Bi-LSTM - Naive Bayes
McNemar's Test Statistic: 14.0
P-value: 0.10812902140605732



In [38]:
from itertools import combinations

# Saved IID predictions and gold labels of each model
rf_vals = dict(preds=np.load("rf_preds.npy"), golds=np.load("rf_golds.npy"))
bilstm_vals = dict(preds=np.load("bilstm_preds.npy"), golds=np.load("bilstm_golds.npy"))
nb_vals = dict(preds=np.load("nb_preds.npy"), golds=np.load("nb_golds.npy"))

vals = [rf_vals, bilstm_vals, nb_vals]
names = ["Random Forest", "Bi-LSTM", "Naive Bayes"]

# Run McNemar's test on the IID predictions of every pair of models
for (model1_vals, model1_name), (model2_vals, model2_name) in combinations(zip(vals, names), 2):
    print("IID", model1_name, "-", model2_name)
    mcnemar_test(
        model1_vals["preds"],
        model2_vals["preds"],
        model1_vals["golds"],
        model2_vals["golds"],
    )
    print()

IID Random Forest - Bi-LSTM
McNemar's Test Statistic: 10.460251046025105
P-value: 0.0012197070113041527

IID Random Forest - Naive Bayes
McNemar's Test Statistic: 0.8634361233480177
P-value: 0.35277890006264423

IID Bi-LSTM - Naive Bayes
McNemar's Test Statistic: 5.020491803278689
P-value: 0.025049053512565384

