In [None]:
!pip install zemberek-python

Collecting zemberek-python
  Downloading zemberek_python-0.2.3-py3-none-any.whl.metadata (2.7 kB)
Collecting antlr4-python3-runtime==4.8 (from zemberek-python)
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading zemberek_python-0.2.3-py3-none-any.whl (95.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141214 sha256=e2e3240c0d73cbd8e47a3d3563452b929ac428a354da72b4eec781bd1b09e20c
  Stored in directory: /root/.cache/pip/wheels/a7/20/bd/e1477d664f22d99989fd28ee1a43d6633dddb5cb9

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from gensim.models import KeyedVectors, Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
from zemberek import TurkishMorphology, TurkishSpellChecker

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Mount Google Drive so the dataset and embedding files below are reachable
drive.mount('/content/drive')

# Define paths
dataset_path = '/content/drive/My Drive/DATASET/final_hate_speech.xlsx'
word2vec_path = '/content/drive/My Drive/DATASET/word2vec_tr.model'
fine_tuned_path = '/content/drive/My Drive/DATASET/word2vec_tr_finetuned.model'

# Load dataset
data = pd.read_excel(dataset_path)

# Fail fast with a readable message if the spreadsheet lacks the columns the
# rest of the notebook reads: 'tweet' (raw text) and 'etiket' (class label).
missing_columns = {'tweet', 'etiket'} - set(data.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

Mounted at /content/drive


In [None]:
# Initialize Zemberek for spell checking
# Creating the default morphology is expensive (about 10 s in the logged run),
# so it is built once here and shared by the spell checker used below.
morphology = TurkishMorphology.create_with_defaults()
spell_checker = TurkishSpellChecker(morphology)

# Memo of word -> chosen correction, so each distinct word is sent to the
# spell checker only once across the whole dataset.
cache = {}

def clean_turkish_text_with_cache(text):
    """Normalize one tweet and spell-correct it word by word.

    Steps: Turkish-aware lowercasing, removal of URLs and @mentions, removal
    of every character that is neither alphanumeric nor whitespace,
    whitespace collapsing, then per-word spell correction through the
    module-level ``spell_checker``, memoized in the module-level ``cache``.

    Args:
        text: Raw tweet. Missing values (None/NaN) are treated as empty;
            other non-string scalars are converted with ``str()``.

    Returns:
        The cleaned and corrected tweet, or the sentinel string ``"EMPTY"``
        when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        # Empty spreadsheet cells arrive as NaN, which has no .lower()
        if pd.isna(text):
            return "EMPTY"
        text = str(text)

    # str.lower() is locale-independent: it turns 'I' into 'i' and 'İ' into
    # 'i' plus a combining dot. Turkish needs 'I' -> 'ı' and 'İ' -> 'i', so
    # map those two letters explicitly before lowercasing the rest.
    text = text.replace('I', 'ı').replace('İ', 'i').lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    # Keep alphanumeric characters and whitespace only
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace

    corrected_words = []
    for word in text.split():
        if word not in cache:
            suggestions = spell_checker.suggest_for_word(word)
            # Take the first suggestion; keep the word as-is when there is none
            cache[word] = suggestions[0] if suggestions else word
        corrected_words.append(cache[word])
    return ' '.join(corrected_words) if corrected_words else "EMPTY"

# Run the cleaning/spell-correction function over every raw tweet and keep the
# result in a new column next to the original text
data['tweet_cleaned'] = data['tweet'].map(clean_turkish_text_with_cache)

INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 10.18822169303894


2025-01-05 14:03:25,446 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 10.18822169303894



In [None]:
# Tokenize cleaned tweets, skipping rows that were empty after cleaning
sentences = [tweet.split() for tweet in data['tweet_cleaned'] if tweet != "EMPTY"]

# Load pre-trained Word2Vec vectors (binary word2vec format)
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Fine-tune Word2Vec: the vocabulary is the tweet vocabulary extended with
# every word of the pre-trained model
new_model = Word2Vec(vector_size=word2vec_model.vector_size, min_count=1)
new_model.build_vocab(sentences)
new_model.build_vocab([list(word2vec_model.key_to_index.keys())], update=True)

# Seed the new model with the pre-trained vectors. Rows are matched by word,
# not by position: the new vocabulary is ordered differently from the
# pre-trained one and also contains tweet-only words, so the two matrices do
# not line up row for row. Tweet-only words keep their random initialisation.
pretrained_rows = [new_model.wv.key_to_index[word] for word in word2vec_model.index_to_key]
new_model.wv.vectors[pretrained_rows] = word2vec_model.vectors

new_model.train(sentences, total_examples=len(sentences), epochs=10)
new_model.save(fine_tuned_path)

In [None]:
# Preview the first rows to sanity-check the new 'tweet_cleaned' column
data.head()

In [None]:
def text_to_word2vec(text, model, vector_size=300):
    """Embed a text as the mean of the vectors of its known words.

    Args:
        text: Whitespace-separated text; a falsy value or the sentinel
            "EMPTY" yields a zero vector.
        model: Word-vector lookup supporting ``model[word]`` and exposing a
            ``key_to_index`` mapping for membership tests.
        vector_size: Length of the zero vector returned when no word of the
            text is known to ``model``.

    Returns:
        The element-wise mean of the word vectors found, or a float32 zero
        vector of length ``vector_size`` when there are none.
    """
    has_content = bool(text) and text.strip() != "EMPTY"
    known_vectors = []
    if has_content:
        vocabulary = model.key_to_index
        for token in text.split():
            if token in vocabulary:
                known_vectors.append(model[token])
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # Nothing to average: empty text or only out-of-vocabulary words
    return np.zeros(vector_size, dtype=np.float32)

# Feature matrix: one averaged embedding per cleaned tweet, looked up in the
# fine-tuned model's word vectors
X = np.array(
    data['tweet_cleaned']
    .map(lambda tweet: text_to_word2vec(tweet, new_model.wv, vector_size=new_model.vector_size))
    .tolist()
)
# Targets: the raw labels from the 'etiket' column
y = data['etiket'].values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the class labels as consecutive integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-Test Split
# Stratify on the labels so train and test keep the same class proportions;
# a plain random split of imbalanced classes can leave rare classes
# under-represented in the test set. (Requires at least two samples per class.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Resampling strategies to compare, keyed by display name; the "Original"
# entry holds no resampler at all
resampling_methods = dict(
    Original=None,
    Oversampling=RandomOverSampler(random_state=42),
    Undersampling=RandomUnderSampler(random_state=42),
    Combined=SMOTEENN(random_state=42),
)

# Define ML models
# XGBoost's `use_label_encoder` argument is deprecated and recent releases only
# warn that it is unused, so it is not passed; the labels fed to these models
# are integer-encoded beforehand.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, tree_method='hist'),
    "LightGBM": LGBMClassifier(random_state=42),
}

In [None]:
# ANN Model
def build_ann(input_dim):
    """Build and compile a small feed-forward classifier.

    Architecture: two ReLU hidden layers (128 then 64 units), each followed
    by dropout with rate 0.2, and a softmax output with one unit per class
    in the module-level ``label_encoder``.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer, sparse
        categorical cross-entropy loss and the accuracy metric.
    """
    n_classes = len(label_encoder.classes_)

    network = Sequential()
    network.add(Input(shape=(input_dim,)))
    for hidden_units in (128, 64):
        network.add(Dense(hidden_units, activation='relu'))
        network.add(Dropout(0.2))
    network.add(Dense(n_classes, activation='softmax'))

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Train and evaluate models
def _macro_scores(y_true, y_pred):
    """Return accuracy and macro-averaged precision/recall/F1 as a dict.

    ``zero_division=0`` yields the same score scikit-learn falls back to when
    a class is never predicted (or never present), without emitting a warning
    for each such case.
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_true, y_pred, average='macro', zero_division=0),
        "F1-Score": f1_score(y_true, y_pred, average='macro', zero_division=0),
    }


results = []
for res_name, resampler in resampling_methods.items():
    # The "Original" entry has no resampler: train on the split as-is
    if resampler is not None:
        X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)
    else:
        X_resampled, y_resampled = X_train, y_train

    # Classical models: each estimator is refit from scratch for this variant
    for model_name, model in models.items():
        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test)
        results.append({
            "Model": model_name,
            "Resampling": res_name,
            **_macro_scores(y_test, y_pred),
        })

    # ANN
    # Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
    # batch shuffling are reproducible, matching the fixed seed (42) used by
    # every other estimator in this notebook.
    tf.keras.utils.set_random_seed(42)
    ann_model = build_ann(X_resampled.shape[1])
    ann_model.fit(X_resampled, y_resampled, epochs=10, batch_size=32, verbose=0)
    # predict() returns per-class probabilities; argmax picks the class index
    y_pred_ann = np.argmax(ann_model.predict(X_test, verbose=0), axis=1)
    results.append({
        "Model": "ANN",
        "Resampling": res_name,
        **_macro_scores(y_test, y_pred_ann),
    })

In [None]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)
# A bare last expression is rendered by the notebook as a formatted table,
# which is easier to read than the plain-text print() output.
results_df

In [None]:
# SMOTEENN-specific evaluation: refit every classical model on training data
# resampled with SMOTEENN and score it with support-weighted averages
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

sampling_results = []
for model_name, model in models.items():
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # The three remaining metrics share the same call signature
    precision, recall, f1 = (
        scorer(y_test, y_pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )

    sampling_results.append(dict(zip(
        ("Model", "Sampling Type", "Accuracy", "Precision", "Recall", "F1-Score"),
        (model_name, "SMOTEENN", accuracy, precision, recall, f1),
    )))


In [None]:
# Tabulate the SMOTEENN results and print them under their heading
sampling_df = pd.DataFrame(sampling_results)
print("\nDengeleme Yöntemleri ile Karşılaştırma:", sampling_df, sep="\n")

In [None]:
# Visualizations: one grouped bar chart per metric, with one group of bars per
# model and one bar colour per resampling method
for metric in ('Accuracy', 'F1-Score'):
    # Explicit figure/axes objects keep each chart independent of pyplot state
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=results_df, x='Model', y=metric, hue='Resampling', ax=ax)
    ax.set_title(f'Model {metric} with Different Resampling Methods')
    plt.show()

In [None]:
# Save Results
# Write the metrics table to Drive as CSV; index=False leaves the DataFrame's
# row index out of the file.
performance_results_path = '/content/drive/My Drive/DATASET/performance_results.csv'
results_df.to_csv(performance_results_path, index=False)

In [None]:
# Echo the final metrics table under a heading
print("\nPerformance Results:", results_df, sep="\n")