In [None]:
!pip install zemberek-python

Collecting zemberek-python
  Downloading zemberek_python-0.2.3-py3-none-any.whl.metadata (2.7 kB)
Collecting antlr4-python3-runtime==4.8 (from zemberek-python)
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading zemberek_python-0.2.3-py3-none-any.whl (95.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141214 sha256=4892517a57593b8bf31447f836cf6248140da635821e0206d0e7dd8585f8bdd9
  Stored in directory: /root/.cache/pip/wheels/a7/20/bd/e1477d664f22d99989fd28ee1a43d6633dddb5cb9

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from gensim.models import KeyedVectors, Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from google.colab import drive
from zemberek import TurkishMorphology, TurkishSpellChecker

# Mount Google Drive
drive.mount('/content/drive')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Mounted at /content/drive


In [None]:
# Define paths
dataset_path = '/content/drive/My Drive/DATASET/final_hate_speech.xlsx'
word2vec_path = '/content/drive/My Drive/DATASET/word2vec_tr.model'
fine_tuned_path = '/content/drive/My Drive/DATASET/word2vec_tr_finetuned.model'

# Load dataset
data = pd.read_excel(dataset_path)

In [None]:
# Initialize Zemberek for spell checking
morphology = TurkishMorphology.create_with_defaults()
spell_checker = TurkishSpellChecker(morphology)

cache = {}

def clean_turkish_text_with_cache(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Keep alphanumeric and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    corrected_words = []
    for word in text.split():
        if word in cache:
            corrected_words.append(cache[word])
        else:
            suggestions = spell_checker.suggest_for_word(word)
            correction = suggestions[0] if suggestions else word
            cache[word] = correction
            corrected_words.append(correction)
    return ' '.join(corrected_words) if corrected_words else "EMPTY"

# Apply preprocessing with caching
data['tweet_cleaned'] = data['tweet'].apply(clean_turkish_text_with_cache)

# Tokenize cleaned tweets
sentences = [tweet.split() for tweet in data['tweet_cleaned'] if tweet != "EMPTY"]

INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 31.25755548477173


2025-01-03 16:04:35,878 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 31.25755548477173



In [None]:
 # Load pre-trained Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Fine-tune Word2Vec
new_model = Word2Vec(vector_size=word2vec_model.vector_size, min_count=1)
new_model.build_vocab(sentences)
new_model.build_vocab([list(word2vec_model.key_to_index.keys())], update=True)
new_model.wv.vectors = np.copy(word2vec_model.vectors)
new_model.train(sentences, total_examples=len(sentences), epochs=10)
new_model.save(fine_tuned_path)

INFO:gensim.models.keyedvectors:loading projection weights from /content/drive/My Drive/DATASET/word2vec_tr.model


2025-01-03 17:19:04,087 - gensim.models.keyedvectors - INFO
Msg: loading projection weights from /content/drive/My Drive/DATASET/word2vec_tr.model



INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (412457, 400) matrix of type float32 from /content/drive/My Drive/DATASET/word2vec_tr.model', 'binary': True, 'encoding': 'utf8', 'datetime': '2025-01-03T17:19:23.219908', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


2025-01-03 17:19:23,220 - gensim.utils - INFO
Msg: KeyedVectors lifecycle event {'msg': 'loaded (412457, 400) matrix of type float32 from /content/drive/My Drive/DATASET/word2vec_tr.model', 'binary': True, 'encoding': 'utf8', 'datetime': '2025-01-03T17:19:23.219908', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}



INFO:gensim.utils:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=400, alpha=0.025>', 'datetime': '2025-01-03T17:19:23.241849', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'created'}


2025-01-03 17:19:23,241 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=400, alpha=0.025>', 'datetime': '2025-01-03T17:19:23.241849', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'created'}



INFO:gensim.models.word2vec:collecting all words and their counts


2025-01-03 17:19:23,246 - gensim.models.word2vec - INFO
Msg: collecting all words and their counts



INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


2025-01-03 17:19:23,253 - gensim.models.word2vec - INFO
Msg: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 212849 words, keeping 39996 word types


2025-01-03 17:19:23,403 - gensim.models.word2vec - INFO
Msg: PROGRESS: at sentence #10000, processed 212849 words, keeping 39996 word types



INFO:gensim.models.word2vec:collected 40614 word types from a corpus of 218268 raw words and 10224 sentences


2025-01-03 17:19:23,413 - gensim.models.word2vec - INFO
Msg: collected 40614 word types from a corpus of 218268 raw words and 10224 sentences



INFO:gensim.models.word2vec:Creating a fresh vocabulary


2025-01-03 17:19:23,422 - gensim.models.word2vec - INFO
Msg: Creating a fresh vocabulary



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 40614 unique words (100.00% of original 40614, drops 0)', 'datetime': '2025-01-03T17:19:23.690674', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-03 17:19:23,690 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 40614 unique words (100.00% of original 40614, drops 0)', 'datetime': '2025-01-03T17:19:23.690674', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 218268 word corpus (100.00% of original 218268, drops 0)', 'datetime': '2025-01-03T17:19:23.697284', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-03 17:19:23,697 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 218268 word corpus (100.00% of original 218268, drops 0)', 'datetime': '2025-01-03T17:19:23.697284', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.models.word2vec:deleting the raw counts dictionary of 40614 items


2025-01-03 17:19:24,022 - gensim.models.word2vec - INFO
Msg: deleting the raw counts dictionary of 40614 items



INFO:gensim.models.word2vec:sample=0.001 downsamples 35 most-common words


2025-01-03 17:19:24,039 - gensim.models.word2vec - INFO
Msg: sample=0.001 downsamples 35 most-common words



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 198187.92663335195 word corpus (90.8%% of prior 218268)', 'datetime': '2025-01-03T17:19:24.043115', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-03 17:19:24,043 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 198187.92663335195 word corpus (90.8%% of prior 218268)', 'datetime': '2025-01-03T17:19:24.043115', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.models.word2vec:estimated required memory for 40614 words and 400 dimensions: 150271800 bytes


2025-01-03 17:19:25,101 - gensim.models.word2vec - INFO
Msg: estimated required memory for 40614 words and 400 dimensions: 150271800 bytes



INFO:gensim.models.word2vec:resetting layer weights


2025-01-03 17:19:25,106 - gensim.models.word2vec - INFO
Msg: resetting layer weights



INFO:gensim.utils:Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-01-03T17:19:25.517372', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'build_vocab'}


2025-01-03 17:19:25,517 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-01-03T17:19:25.517372', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'build_vocab'}



INFO:gensim.models.word2vec:collecting all words and their counts


2025-01-03 17:19:25,559 - gensim.models.word2vec - INFO
Msg: collecting all words and their counts



INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


2025-01-03 17:19:25,564 - gensim.models.word2vec - INFO
Msg: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



INFO:gensim.models.word2vec:collected 412457 word types from a corpus of 412457 raw words and 1 sentences


2025-01-03 17:19:26,074 - gensim.models.word2vec - INFO
Msg: collected 412457 word types from a corpus of 412457 raw words and 1 sentences



INFO:gensim.models.word2vec:Updating model with new vocabulary


2025-01-03 17:19:26,078 - gensim.models.word2vec - INFO
Msg: Updating model with new vocabulary



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'added 388042 new unique words (94.08% of original 412457) and increased the count of 24415 pre-existing words (5.92% of original 412457)', 'datetime': '2025-01-03T17:19:31.248236', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-03 17:19:31,248 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'added 388042 new unique words (94.08% of original 412457) and increased the count of 24415 pre-existing words (5.92% of original 412457)', 'datetime': '2025-01-03T17:19:31.248236', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.models.word2vec:deleting the raw counts dictionary of 412457 items


2025-01-03 17:19:33,776 - gensim.models.word2vec - INFO
Msg: deleting the raw counts dictionary of 412457 items



INFO:gensim.models.word2vec:sample=0.001 downsamples 0 most-common words


2025-01-03 17:19:33,788 - gensim.models.word2vec - INFO
Msg: sample=0.001 downsamples 0 most-common words



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 412457 word corpus (100.0%% of prior 412457)', 'datetime': '2025-01-03T17:19:33.792662', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}


2025-01-03 17:19:33,792 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 412457 word corpus (100.0%% of prior 412457)', 'datetime': '2025-01-03T17:19:33.792662', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}



INFO:gensim.models.word2vec:estimated required memory for 412457 words and 400 dimensions: 1526090900 bytes


2025-01-03 17:19:37,233 - gensim.models.word2vec - INFO
Msg: estimated required memory for 412457 words and 400 dimensions: 1526090900 bytes



INFO:gensim.models.word2vec:updating layer weights


2025-01-03 17:19:37,237 - gensim.models.word2vec - INFO
Msg: updating layer weights



INFO:gensim.utils:Word2Vec lifecycle event {'update': True, 'trim_rule': 'None', 'datetime': '2025-01-03T17:19:39.137634', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'build_vocab'}


2025-01-03 17:19:39,137 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'update': True, 'trim_rule': 'None', 'datetime': '2025-01-03T17:19:39.137634', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'build_vocab'}



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training model with 3 workers on 428656 vocabulary and 400 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-01-03T17:19:40.239571', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}


2025-01-03 17:19:40,239 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'training model with 3 workers on 428656 vocabulary and 400 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-01-03T17:19:40.239571', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}



INFO:gensim.models.word2vec:EPOCH 0: training on 218268 raw words (216608 effective words) took 0.9s, 252512 effective words/s


2025-01-03 17:19:41,109 - gensim.models.word2vec - INFO
Msg: EPOCH 0: training on 218268 raw words (216608 effective words) took 0.9s, 252512 effective words/s



INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 100.00% examples, 213343 words/s, in_qsize 0, out_qsize 1


2025-01-03 17:19:42,135 - gensim.models.word2vec - INFO
Msg: EPOCH 1 - PROGRESS: at 100.00% examples, 213343 words/s, in_qsize 0, out_qsize 1



INFO:gensim.models.word2vec:EPOCH 1: training on 218268 raw words (216637 effective words) took 1.0s, 212201 effective words/s


2025-01-03 17:19:42,140 - gensim.models.word2vec - INFO
Msg: EPOCH 1: training on 218268 raw words (216637 effective words) took 1.0s, 212201 effective words/s



INFO:gensim.models.word2vec:EPOCH 2 - PROGRESS: at 68.17% examples, 144509 words/s, in_qsize 5, out_qsize 0


2025-01-03 17:19:43,199 - gensim.models.word2vec - INFO
Msg: EPOCH 2 - PROGRESS: at 68.17% examples, 144509 words/s, in_qsize 5, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 2: training on 218268 raw words (216635 effective words) took 1.4s, 152310 effective words/s


2025-01-03 17:19:43,593 - gensim.models.word2vec - INFO
Msg: EPOCH 2: training on 218268 raw words (216635 effective words) took 1.4s, 152310 effective words/s



INFO:gensim.models.word2vec:EPOCH 3 - PROGRESS: at 68.17% examples, 144447 words/s, in_qsize 6, out_qsize 0


2025-01-03 17:19:44,641 - gensim.models.word2vec - INFO
Msg: EPOCH 3 - PROGRESS: at 68.17% examples, 144447 words/s, in_qsize 6, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 3: training on 218268 raw words (216615 effective words) took 1.4s, 154214 effective words/s


2025-01-03 17:19:45,018 - gensim.models.word2vec - INFO
Msg: EPOCH 3: training on 218268 raw words (216615 effective words) took 1.4s, 154214 effective words/s



INFO:gensim.models.word2vec:EPOCH 4 - PROGRESS: at 73.03% examples, 140294 words/s, in_qsize 5, out_qsize 0


2025-01-03 17:19:46,169 - gensim.models.word2vec - INFO
Msg: EPOCH 4 - PROGRESS: at 73.03% examples, 140294 words/s, in_qsize 5, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 4: training on 218268 raw words (216659 effective words) took 1.4s, 154074 effective words/s


2025-01-03 17:19:46,446 - gensim.models.word2vec - INFO
Msg: EPOCH 4: training on 218268 raw words (216659 effective words) took 1.4s, 154074 effective words/s



INFO:gensim.models.word2vec:EPOCH 5 - PROGRESS: at 57.53% examples, 127259 words/s, in_qsize 6, out_qsize 0


2025-01-03 17:19:47,471 - gensim.models.word2vec - INFO
Msg: EPOCH 5 - PROGRESS: at 57.53% examples, 127259 words/s, in_qsize 6, out_qsize 0



INFO:gensim.models.word2vec:EPOCH 5: training on 218268 raw words (216633 effective words) took 1.4s, 151334 effective words/s


2025-01-03 17:19:47,891 - gensim.models.word2vec - INFO
Msg: EPOCH 5: training on 218268 raw words (216633 effective words) took 1.4s, 151334 effective words/s



INFO:gensim.models.word2vec:EPOCH 6: training on 218268 raw words (216608 effective words) took 0.8s, 257719 effective words/s


2025-01-03 17:19:48,744 - gensim.models.word2vec - INFO
Msg: EPOCH 6: training on 218268 raw words (216608 effective words) took 0.8s, 257719 effective words/s



INFO:gensim.models.word2vec:EPOCH 7: training on 218268 raw words (216610 effective words) took 0.9s, 248276 effective words/s


2025-01-03 17:19:49,628 - gensim.models.word2vec - INFO
Msg: EPOCH 7: training on 218268 raw words (216610 effective words) took 0.9s, 248276 effective words/s



INFO:gensim.models.word2vec:EPOCH 8: training on 218268 raw words (216618 effective words) took 0.8s, 259597 effective words/s


2025-01-03 17:19:50,473 - gensim.models.word2vec - INFO
Msg: EPOCH 8: training on 218268 raw words (216618 effective words) took 0.8s, 259597 effective words/s



INFO:gensim.models.word2vec:EPOCH 9: training on 218268 raw words (216644 effective words) took 0.8s, 258809 effective words/s


2025-01-03 17:19:51,320 - gensim.models.word2vec - INFO
Msg: EPOCH 9: training on 218268 raw words (216644 effective words) took 0.8s, 258809 effective words/s



INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 2182680 raw words (2166267 effective words) took 11.1s, 195496 effective words/s', 'datetime': '2025-01-03T17:19:51.324872', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}


2025-01-03 17:19:51,324 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'msg': 'training on 2182680 raw words (2166267 effective words) took 11.1s, 195496 effective words/s', 'datetime': '2025-01-03T17:19:51.324872', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'train'}



INFO:gensim.utils:Word2Vec lifecycle event {'fname_or_handle': '/content/drive/My Drive/DATASET/word2vec_tr_finetuned.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-01-03T17:19:51.328354', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'saving'}


2025-01-03 17:19:51,328 - gensim.utils - INFO
Msg: Word2Vec lifecycle event {'fname_or_handle': '/content/drive/My Drive/DATASET/word2vec_tr_finetuned.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-01-03T17:19:51.328354', 'gensim': '4.3.3', 'python': '3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]', 'platform': 'Linux-6.1.85+-x86_64-with-glibc2.35', 'event': 'saving'}



INFO:gensim.utils:storing np array 'vectors' to /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model.wv.vectors.npy


2025-01-03 17:19:51,331 - gensim.utils - INFO
Msg: storing np array 'vectors' to /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model.wv.vectors.npy



INFO:gensim.utils:storing np array 'syn1neg' to /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model.syn1neg.npy


2025-01-03 17:20:10,906 - gensim.utils - INFO
Msg: storing np array 'syn1neg' to /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model.syn1neg.npy



INFO:gensim.utils:not storing attribute cum_table


2025-01-03 17:20:35,828 - gensim.utils - INFO
Msg: not storing attribute cum_table



INFO:gensim.utils:saved /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model


2025-01-03 17:20:36,861 - gensim.utils - INFO
Msg: saved /content/drive/My Drive/DATASET/word2vec_tr_finetuned.model



In [None]:
data.head()

Unnamed: 0,ID,tweet,etiket,alt etiket,favorited,retweeted,is retweet,user - followers,time,Keyword,tweet_cleaned
0,1,ya orospu cocuklari hepiniz niye ayni anda yaz...,nefret,etnik,12,1,0,2860,2019-09-05 11:33:56,suriyeli,da orospum cocuklari hepimiz diye aynı anda ya...
1,2,Ciddiye alan tüm dünyanın beynini sileyim.. \n...,saldırgan,,3,0,0,314,2019-09-05 11:32:09,suriyeli,ciddiye olan tüm dünyanın beynin bileyim iki g...
2,3,Kayıtlı İstihdama geçiş programına göre (?)\nŞ...,hiçbiri,,26,1,0,12548,2019-09-05 11:29:34,suriyeli,kayıtlı istihdam geniş programında göre şimdil...
3,4,Hastaneye git Suriyeli. PTT ye git Suriyeli. P...,nefret,etnik,0,0,0,9,2019-09-05 11:26:50,suriyeli,hastaneye ait Suriye'yi ptt de ait Suriye'yi p...
4,5,Cölesi bitmiş suriyeli gibiyim bugün,hiçbiri,,48,2,0,488,2019-09-05 11:26:19,suriyeli,kölesi gitmiş Suriye'yi gibiyim bugün


In [None]:
def text_to_word2vec(text, model, vector_size=300):
    if not text or text.strip() == "EMPTY":
        return np.zeros(vector_size, dtype=np.float32)
    words = text.split()
    embeddings = [model[word] for word in words if word in model.key_to_index]
    if not embeddings:
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(embeddings, axis=0)

In [None]:
X = np.array([text_to_word2vec(tweet, new_model.wv, vector_size=new_model.vector_size)
             for tweet in data['tweet_cleaned']])
y = data['etiket'].values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [None]:
# Define resampling methods
resampling_methods = {
    "Original": None,
    "Oversampling": RandomOverSampler(random_state=42),
    "Undersampling": RandomUnderSampler(random_state=42),
    "Combined": SMOTEENN(random_state=42),
}

# Define ML models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, tree_method='hist', use_label_encoder=False),
    "LightGBM": LGBMClassifier(random_state=42),
}

In [None]:
# ANN Model
def build_ann(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(len(label_encoder.classes_), activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate models
results = []

In [None]:
for res_name, resampler in resampling_methods.items():
    if resampler:
        X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)
    else:
        X_resampled, y_resampled = X_train, y_train

    for model_name, model in models.items():
        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test)
        results.append({
            "Model": model_name,
            "Resampling": res_name,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='macro'),
            "Recall": recall_score(y_test, y_pred, average='macro'),
            "F1-Score": f1_score(y_test, y_pred, average='macro'),
        })

    # ANN
    ann_model = build_ann(X_resampled.shape[1])
    ann_model.fit(X_resampled, y_resampled, epochs=10, batch_size=32, verbose=0)
    y_pred_ann = np.argmax(ann_model.predict(X_test), axis=1)
    results.append({
        "Model": "ANN",
        "Resampling": res_name,
        "Accuracy": accuracy_score(y_test, y_pred_ann),
        "Precision": precision_score(y_test, y_pred_ann, average='macro'),
        "Recall": recall_score(y_test, y_pred_ann, average='macro'),
        "F1-Score": f1_score(y_test, y_pred_ann, average='macro'),
    })


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051025 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102000
[LightGBM] [Info] Number of data points in the train set: 8179, number of used features: 400
[LightGBM] [Info] Start training from score -0.277827
[LightGBM] [Info] Start training from score -1.488549
[LightGBM] [Info] Start training from score -4.082071


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.114726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 101997
[LightGBM] [Info] Number of data points in the train set: 18585, number of used features: 400
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004576 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55476
[LightGBM] [Info] Number of data points in the train set: 414, number of used features: 400
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102000
[LightGBM] [Info] Number of data points in the train set: 12428, number of used features: 400
[LightGBM] [Info] Start training from score -3.423820
[LightGBM] [Info] Start training from score -0.757278
[LightGBM] [Info] Start training from score -0.696209




[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [None]:
# Results DataFrame
results_df = pd.DataFrame(results)
print(results_df)

            Model     Resampling  Accuracy  Precision    Recall  F1-Score
0   Random Forest       Original  0.751100   0.453180  0.344538  0.311074
1         XGBoost       Original  0.769682   0.481268  0.386095  0.385376
2        LightGBM       Original  0.767237   0.476826  0.380384  0.376482
3             ANN       Original  0.774083   0.462608  0.434258  0.441731
4   Random Forest   Oversampling  0.753545   0.492702  0.346091  0.312985
5         XGBoost   Oversampling  0.768215   0.462489  0.401148  0.406536
6        LightGBM   Oversampling  0.771149   0.459967  0.429252  0.436811
7             ANN   Oversampling  0.666504   0.442940  0.483660  0.443997
8   Random Forest  Undersampling  0.468949   0.372720  0.415507  0.328953
9         XGBoost  Undersampling  0.450856   0.384170  0.414821  0.328603
10       LightGBM  Undersampling  0.451834   0.388504  0.446622  0.331269
11            ANN  Undersampling  0.411247   0.401675  0.487946  0.292155
12  Random Forest       Combined  0.24

In [None]:
# SMOTEENN uygulaması
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Dengeleme sonrası performans sonuçlarını tutmak için bir liste
sampling_results = []

for model_name, model in models.items():
    # Modeli yeniden eğit (SMOTEENN ile dengelenmiş veriyle)
    model.fit(X_resampled, y_resampled)

    # Tahmin yap
    y_pred = model.predict(X_test)

    # Performans metriklerini hesapla
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Sonuçları kaydet
    sampling_results.append({
        "Model": model_name,
        "Sampling Type": "SMOTEENN",
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    })

# Sonuçları bir DataFrame'e çevir
sampling_df = pd.DataFrame(sampling_results)

# Tabloyu yazdır
print("\nDengeleme Yöntemleri ile Karşılaştırma:")
print(sampling_df)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.130179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102000
[LightGBM] [Info] Number of data points in the train set: 12428, number of used features: 400
[LightGBM] [Info] Start training from score -3.423820
[LightGBM] [Info] Start training from score -0.757278
[LightGBM] [Info] Start training from score -0.696209

Dengeleme Yöntemleri ile Karşılaştırma:
           Model Sampling Type  Accuracy  Precision    Recall  F1-Score
0  Random Forest      SMOTEENN  0.245966   0.754695  0.245966  0.106624
1        XGBoost      SMOTEENN  0.290954   0.731104  0.290954  0.206200
2       LightGBM      SMOTEENN  0.292421   0.731347  0.292421  0.206263




In [None]:
# Define the paths in your Google Drive
performance_results_path = '/content/drive/My Drive/DATASET/performance_results.csv'
sampling_comparison_path = '/content/drive/My Drive/DATASET/sampling_comparison.csv'

# Save the DataFrames to the specified paths
results_df.to_csv(performance_results_path, index=False)
sampling_df.to_csv(sampling_comparison_path, index=False)

print(f"Performance results saved to: {performance_results_path}")
print(f"Sampling comparison saved to: {sampling_comparison_path}")

Performance results saved to: /content/drive/My Drive/DATASET/performance_results.csv
Sampling comparison saved to: /content/drive/My Drive/DATASET/sampling_comparison.csv
