In [1]:
!pip install zemberek-python

Collecting zemberek-python
  Downloading zemberek_python-0.2.3-py3-none-any.whl.metadata (2.7 kB)
Collecting antlr4-python3-runtime==4.8 (from zemberek-python)
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading zemberek_python-0.2.3-py3-none-any.whl (95.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141214 sha256=fea8ebc66c8361206fc135e3a49d30ba784008397d2b59cda27a7b951ccdc492
  Stored in directory: /root/.cache/pip/wheels/a7/20/bd/e1477d664f22d99989fd28ee1a43d6633dddb5cb9

In [3]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from gensim.models import KeyedVectors, Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from google.colab import drive
from zemberek import TurkishMorphology, TurkishSpellChecker

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Define paths
dataset_path = '/content/drive/My Drive/DATASET/final_hate_speech.xlsx'
word2vec_path = '/content/drive/My Drive/DATASET/word2vec_tr.model'
fine_tuned_path = '/content/drive/My Drive/DATASET/word2vec_tr_finetuned.model'

# Load dataset
data = pd.read_excel(dataset_path)

In [5]:
# Initialize Zemberek for spell checking
morphology = TurkishMorphology.create_with_defaults()
spell_checker = TurkishSpellChecker(morphology)

cache = {}

def clean_turkish_text_with_cache(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Keep alphanumeric and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    corrected_words = []
    for word in text.split():
        if word in cache:
            corrected_words.append(cache[word])
        else:
            suggestions = spell_checker.suggest_for_word(word)
            correction = suggestions[0] if suggestions else word
            cache[word] = correction
            corrected_words.append(correction)
    return ' '.join(corrected_words) if corrected_words else "EMPTY"

# Apply preprocessing with caching
data['tweet_cleaned'] = data['tweet'].apply(clean_turkish_text_with_cache)

# Tokenize cleaned tweets
sentences = [tweet.split() for tweet in data['tweet_cleaned'] if tweet != "EMPTY"]

INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 15.941426753997803


2025-01-01 16:37:29,643 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 15.941426753997803



In [6]:
 # Load pre-trained Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Fine-tune Word2Vec
new_model = Word2Vec(vector_size=word2vec_model.vector_size, min_count=1)
new_model.build_vocab(sentences)
new_model.build_vocab([list(word2vec_model.key_to_index.keys())], update=True)
new_model.wv.vectors = np.copy(word2vec_model.vectors)
new_model.train(sentences, total_examples=len(sentences), epochs=10)
new_model.save(fine_tuned_path)


INFO:gensim.models.keyedvectors:loading projection weights from /content/drive/My Drive/DATASET/word2vec_tr.model


2025-01-01 17:56:49,379 - gensim.models.keyedvectors - INFO
Msg: loading projection weights from /content/drive/My Drive/DATASET/word2vec_tr.model



INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (412457, 400) matrix of type float32 from /content/drive/My Drive/DATASET/word2vec_tr.model', 'binary': True, 'encoding': 'utf8', 'datetime': '2025-01-01T17:57:13.098185', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


2025-01-01 17:57:13,098 - gensim.utils - INFO
Msg: KeyedVectors lifecycle event {'msg': 'loaded (412457, 400) matrix of type float32 from /content/drive/My Drive/DATASET/word2vec_tr.model', 'binary': True, 'encoding': 'utf8', 'datetime': '2025-01-01T17:57:13.098185', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}



INFO:gensim.utils:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=400, alpha=0.025>', 'datetime': '2025-01-01T17:57:13.106225', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'created'}


2025-01-01 17:57:13,106 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=400, alpha=0.025>', 'datetime': '2025-01-01T17:57:13.106225', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'created'}



INFO:gensim.models.word2vec:collecting all words and their counts


2025-01-01 17:57:13,111 - gensim.models.word2vec - INFO
Msg: collecting all words and their counts



INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


2025-01-01 17:57:13,115 - gensim.models.word2vec - INFO
Msg: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 212849 words, keeping 39990 word types


2025-01-01 17:57:13,195 - gensim.models.word2vec - INFO
Msg: PROGRESS: at sentence #10000, processed 212849 words, keeping 39990 word types



INFO:gensim.models.word2vec:collected 40610 word types from a corpus of 218268 raw words and 10224 sentences


2025-01-01 17:57:13,203 - gensim.models.word2vec - INFO
Msg: collected 40610 word types from a corpus of 218268 raw words and 10224 sentences



INFO:gensim.models.word2vec:Creating a fresh vocabulary


2025-01-01 17:57:13,208 - gensim.models.word2vec - INFO
Msg: Creating a fresh vocabulary



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 40610 unique words (100.00% of original 40610, drops 0)', 'datetime': '2025-01-01T17:57:13.374625', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-01 17:57:13,374 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 40610 unique words (100.00% of original 40610, drops 0)', 'datetime': '2025-01-01T17:57:13.374625', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 218268 word corpus (100.00% of original 218268, drops 0)', 'datetime': '2025-01-01T17:57:13.379803', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-01 17:57:13,379 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 218268 word corpus (100.00% of original 218268, drops 0)', 'datetime': '2025-01-01T17:57:13.379803', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.models.word2vec:deleting the raw counts dictionary of 40610 items


2025-01-01 17:57:13,607 - gensim.models.word2vec - INFO
Msg: deleting the raw counts dictionary of 40610 items



INFO:gensim.models.word2vec:sample=0.001 downsamples 34 most-common words


2025-01-01 17:57:13,614 - gensim.models.word2vec - INFO
Msg: sample=0.001 downsamples 34 most-common words



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 197573.4735612863 word corpus (90.5%% of prior 218268)', 'datetime': '2025-01-01T17:57:13.618890', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-01 17:57:13,618 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 197573.4735612863 word corpus (90.5%% of prior 218268)', 'datetime': '2025-01-01T17:57:13.618890', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.models.word2vec:estimated required memory for 40610 words and 400 dimensions: 150257000 bytes


2025-01-01 17:57:14,068 - gensim.models.word2vec - INFO
Msg: estimated required memory for 40610 words and 400 dimensions: 150257000 bytes



INFO:gensim.models.word2vec:resetting layer weights


2025-01-01 17:57:14,074 - gensim.models.word2vec - INFO
Msg: resetting layer weights



INFO:gensim.utils:Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-01-01T17:57:14.197615', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'build_vocab'}


2025-01-01 17:57:14,197 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-01-01T17:57:14.197615', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'build_vocab'}



INFO:gensim.models.word2vec:collecting all words and their counts


2025-01-01 17:57:14,211 - gensim.models.word2vec - INFO
Msg: collecting all words and their counts



INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


2025-01-01 17:57:14,216 - gensim.models.word2vec - INFO
Msg: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



INFO:gensim.models.word2vec:collected 412457 word types from a corpus of 412457 raw words and 1 sentences


2025-01-01 17:57:14,465 - gensim.models.word2vec - INFO
Msg: collected 412457 word types from a corpus of 412457 raw words and 1 sentences



INFO:gensim.models.word2vec:Updating model with new vocabulary


2025-01-01 17:57:14,470 - gensim.models.word2vec - INFO
Msg: Updating model with new vocabulary



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'added 388072 new unique words (94.09% of original 412457) and increased the count of 24385 pre-existing words (5.91% of original 412457)', 'datetime': '2025-01-01T17:57:17.345251', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-01 17:57:17,345 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'added 388072 new unique words (94.09% of original 412457) and increased the count of 24385 pre-existing words (5.91% of original 412457)', 'datetime': '2025-01-01T17:57:17.345251', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.models.word2vec:deleting the raw counts dictionary of 412457 items


2025-01-01 17:57:21,299 - gensim.models.word2vec - INFO
Msg: deleting the raw counts dictionary of 412457 items



INFO:gensim.models.word2vec:sample=0.001 downsamples 0 most-common words


2025-01-01 17:57:21,312 - gensim.models.word2vec - INFO
Msg: sample=0.001 downsamples 0 most-common words



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 412457 word corpus (100.0%% of prior 412457)', 'datetime': '2025-01-01T17:57:21.317856', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-01 17:57:21,317 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 412457 word corpus (100.0%% of prior 412457)', 'datetime': '2025-01-01T17:57:21.317856', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.models.word2vec:estimated required memory for 412457 words and 400 dimensions: 1526090900 bytes


2025-01-01 17:57:24,669 - gensim.models.word2vec - INFO
Msg: estimated required memory for 412457 words and 400 dimensions: 1526090900 bytes



INFO:gensim.models.word2vec:updating layer weights


2025-01-01 17:57:24,674 - gensim.models.word2vec - INFO
Msg: updating layer weights



INFO:gensim.utils:Word2Vec lifecycle event {'update': True, 'trim_rule': 'None', 'datetime': '2025-01-01T17:57:26.481259', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'build_vocab'}


2025-01-01 17:57:26,481 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'update': True, 'trim_rule': 'None', 'datetime': '2025-01-01T17:57:26.481259', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'build_vocab'}



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training model with 3 workers on 428682 vocabulary and 400 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-01-01T17:57:27.201525', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}


2025-01-01 17:57:27,201 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'training model with 3 workers on 428682 vocabulary and 400 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-01-01T17:57:27.201525', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}



INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 73.03% examples, 158288 words/s, in_qsize 5, out_qsize 0


2025-01-01 17:57:28,253 - gensim.models.word2vec - INFO
Msg: EPOCH 0 - PROGRESS: at 73.03% examples, 158288 words/s, in_qsize 5, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 0: training on 218268 raw words (216566 effective words) took 1.2s, 185352 effective words/s


2025-01-01 17:57:28,421 - gensim.models.word2vec - INFO
Msg: EPOCH 0: training on 218268 raw words (216566 effective words) took 1.2s, 185352 effective words/s



INFO:gensim.models.word2vec:EPOCH 1: training on 218268 raw words (216661 effective words) took 0.9s, 242118 effective words/s


2025-01-01 17:57:29,326 - gensim.models.word2vec - INFO
Msg: EPOCH 1: training on 218268 raw words (216661 effective words) took 0.9s, 242118 effective words/s



INFO:gensim.models.word2vec:EPOCH 2: training on 218268 raw words (216595 effective words) took 0.9s, 250446 effective words/s


2025-01-01 17:57:30,201 - gensim.models.word2vec - INFO
Msg: EPOCH 2: training on 218268 raw words (216595 effective words) took 0.9s, 250446 effective words/s



INFO:gensim.models.word2vec:EPOCH 3: training on 218268 raw words (216609 effective words) took 0.9s, 251971 effective words/s


2025-01-01 17:57:31,071 - gensim.models.word2vec - INFO
Msg: EPOCH 3: training on 218268 raw words (216609 effective words) took 0.9s, 251971 effective words/s



INFO:gensim.models.word2vec:EPOCH 4 - PROGRESS: at 73.03% examples, 153340 words/s, in_qsize 5, out_qsize 0


2025-01-01 17:57:32,119 - gensim.models.word2vec - INFO
Msg: EPOCH 4 - PROGRESS: at 73.03% examples, 153340 words/s, in_qsize 5, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 4: training on 218268 raw words (216605 effective words) took 1.4s, 158263 effective words/s


2025-01-01 17:57:32,454 - gensim.models.word2vec - INFO
Msg: EPOCH 4: training on 218268 raw words (216605 effective words) took 1.4s, 158263 effective words/s



INFO:gensim.models.word2vec:EPOCH 5 - PROGRESS: at 68.17% examples, 139273 words/s, in_qsize 6, out_qsize 0


2025-01-01 17:57:33,535 - gensim.models.word2vec - INFO
Msg: EPOCH 5 - PROGRESS: at 68.17% examples, 139273 words/s, in_qsize 6, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 5: training on 218268 raw words (216628 effective words) took 1.5s, 144672 effective words/s


2025-01-01 17:57:33,966 - gensim.models.word2vec - INFO
Msg: EPOCH 5: training on 218268 raw words (216628 effective words) took 1.5s, 144672 effective words/s



INFO:gensim.models.word2vec:EPOCH 6 - PROGRESS: at 68.17% examples, 142607 words/s, in_qsize 5, out_qsize 0


2025-01-01 17:57:35,022 - gensim.models.word2vec - INFO
Msg: EPOCH 6 - PROGRESS: at 68.17% examples, 142607 words/s, in_qsize 5, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 6: training on 218268 raw words (216597 effective words) took 1.4s, 156452 effective words/s


2025-01-01 17:57:35,366 - gensim.models.word2vec - INFO
Msg: EPOCH 6: training on 218268 raw words (216597 effective words) took 1.4s, 156452 effective words/s



INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 58.48% examples, 123315 words/s, in_qsize 5, out_qsize 0


2025-01-01 17:57:36,448 - gensim.models.word2vec - INFO
Msg: EPOCH 7 - PROGRESS: at 58.48% examples, 123315 words/s, in_qsize 5, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 7: training on 218268 raw words (216618 effective words) took 1.5s, 145519 effective words/s


2025-01-01 17:57:36,893 - gensim.models.word2vec - INFO
Msg: EPOCH 7: training on 218268 raw words (216618 effective words) took 1.5s, 145519 effective words/s



INFO:gensim.models.word2vec:EPOCH 8: training on 218268 raw words (216650 effective words) took 0.9s, 244495 effective words/s


2025-01-01 17:57:37,791 - gensim.models.word2vec - INFO
Msg: EPOCH 8: training on 218268 raw words (216650 effective words) took 0.9s, 244495 effective words/s



INFO:gensim.models.word2vec:EPOCH 9: training on 218268 raw words (216656 effective words) took 0.9s, 254611 effective words/s


2025-01-01 17:57:38,654 - gensim.models.word2vec - INFO
Msg: EPOCH 9: training on 218268 raw words (216656 effective words) took 0.9s, 254611 effective words/s



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 2182680 raw words (2166185 effective words) took 11.4s, 189382 effective words/s', 'datetime': '2025-01-01T17:57:38.657533', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}


2025-01-01 17:57:38,657 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'training on 2182680 raw words (2166185 effective words) took 11.4s, 189382 effective words/s', 'datetime': '2025-01-01T17:57:38.657533', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}



INFO:gensim.utils:Word2Vec lifecycle event {'fname_or_handle': '/content/drive/My Drive/DATASET/word2vec_tr_finetuned.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-01-01T17:57:38.660660', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'saving'}


2025-01-01 17:57:38,660 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'fname_or_handle': '/content/drive/My Drive/DATASET/word2vec_tr_finetuned.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-01-01T17:57:38.660660', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'saving'}



INFO:gensim.utils:storing np array 'vectors' to /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model.wv.vectors.npy


2025-01-01 17:57:38,663 - gensim.utils - INFO
Msg: storing np array 'vectors' to /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model.wv.vectors.npy



INFO:gensim.utils:storing np array 'syn1neg' to /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model.syn1neg.npy


2025-01-01 17:57:56,918 - gensim.utils - INFO
Msg: storing np array 'syn1neg' to /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model.syn1neg.npy



INFO:gensim.utils:not storing attribute cum_table


2025-01-01 17:58:25,973 - gensim.utils - INFO
Msg: not storing attribute cum_table



INFO:gensim.utils:saved /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model


2025-01-01 17:58:27,235 - gensim.utils - INFO
Msg: saved /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model



In [None]:
data.head()

In [8]:
def text_to_word2vec(text, model, vector_size=300):
    if not text or text.strip() == "EMPTY":
        return np.zeros(vector_size, dtype=np.float32)
    words = text.split()
    embeddings = [model[word] for word in words if word in model.key_to_index]
    if not embeddings:
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(embeddings, axis=0)

In [9]:
X = np.array([text_to_word2vec(tweet, new_model.wv, vector_size=new_model.vector_size)
             for tweet in data['tweet_cleaned']])
y = data['etiket'].values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [10]:
# Define resampling methods
resampling_methods = {
    "Original": None,
    "Oversampling": RandomOverSampler(random_state=42),
    "Undersampling": RandomUnderSampler(random_state=42),
    "Combined": SMOTEENN(random_state=42),
}

# Define ML models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, tree_method='hist', use_label_encoder=False),
    "LightGBM": LGBMClassifier(random_state=42),
}

In [11]:
# ANN Model
def build_ann(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(len(label_encoder.classes_), activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate models
results = []

In [12]:
for res_name, resampler in resampling_methods.items():
    if resampler:
        X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)
    else:
        X_resampled, y_resampled = X_train, y_train

    for model_name, model in models.items():
        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test)
        results.append({
            "Model": model_name,
            "Resampling": res_name,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='macro'),
            "Recall": recall_score(y_test, y_pred, average='macro'),
            "F1-Score": f1_score(y_test, y_pred, average='macro'),
        })

    # ANN
    ann_model = build_ann(X_resampled.shape[1])
    ann_model.fit(X_resampled, y_resampled, epochs=10, batch_size=32, verbose=0)
    y_pred_ann = np.argmax(ann_model.predict(X_test), axis=1)
    results.append({
        "Model": "ANN",
        "Resampling": res_name,
        "Accuracy": accuracy_score(y_test, y_pred_ann),
        "Precision": precision_score(y_test, y_pred_ann, average='macro'),
        "Recall": recall_score(y_test, y_pred_ann, average='macro'),
        "F1-Score": f1_score(y_test, y_pred_ann, average='macro'),
    })

# Results DataFrame
results_df = pd.DataFrame(results)
print(results_df)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051503 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102000
[LightGBM] [Info] Number of data points in the train set: 8179, number of used features: 400
[LightGBM] [Info] Start training from score -0.277827
[LightGBM] [Info] Start training from score -1.488549
[LightGBM] [Info] Start training from score -4.082071


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.118995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 101998
[LightGBM] [Info] Number of data points in the train set: 18585, number of used features: 400
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55470
[LightGBM] [Info] Number of data points in the train set: 414, number of used features: 400
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102000
[LightGBM] [Info] Number of data points in the train set: 12421, number of used features: 400
[LightGBM] [Info] Start training from score -3.453334
[LightGBM] [Info] Start training from score -0.755857
[LightGBM] [Info] Start training from score -0.695646




[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
            Model     Resampling  Accuracy  Precision    Recall  F1-Score
0   Random Forest       Original  0.751589   0.466130  0.344756  0.311199
1         XGBoost       Original  0.773105   0.503852  0.382541  0.378850
2        LightGBM       Original  0.776528   0.510048  0.387303  0.386086
3             ANN       Original  0.775061   0.462812  0.442086  0.448332
4   Random Forest   Oversampling  0.750122   0.478317  0.340867  0.302419
5         XGBoost   Oversampling  0.784352   0.497498  0.408351  0.415861
6        LightGBM   Oversampling  0.772616   0.460907  0.421591  0.429389
7             ANN   Oversampling  0.653790   0.474653  0.482929  0.449586
8   Random Forest  Undersampling  0.478240   0.384148  0.461181  0.341091
9         XGBoost  Undersampling  0.450856   0.398784  0.475290  0.341586
10       LightGBM  Undersampling  0.461125   0.396600  0.486017  0.345190
11            ANN  Undersampling  0.335