In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import gc

In [None]:
############################ FILE PATHS ########################################
DATA_PATH = "/content/teknofest_preprocessed_data_10fold.csv"

TEST_FOLD = 9
VALID_FOLD = 8

In [None]:
all_df = pd.read_csv(DATA_PATH)

train_df = all_df[~all_df.kfold_10.isin([TEST_FOLD, VALID_FOLD])]
val_df = all_df[all_df.kfold_10.isin([VALID_FOLD])]
test_df = all_df[all_df.kfold_10.isin([TEST_FOLD])]

In [None]:
train_df['is_offensive'] = train_df['is_offensive'].astype(int)
val_df['is_offensive'] = val_df['is_offensive'].astype(int)
test_df['is_offensive'] = test_df['is_offensive'].astype(int)

In [None]:
train_text = train_df['text']
val_text = val_df['text']
test_text = test_df['text']

all_text = pd.concat([train_text, val_text,test_text])

In [None]:
print("TFIDF")
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    norm='l2',
    min_df=0,
    smooth_idf=False,
    max_features=15000)

word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
valid_word_features = word_vectorizer.transform(val_text)

TFIDF


In [None]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    norm='l2',
    min_df=0,
    smooth_idf=False,
    max_features=50000)

char_vectorizer.fit(all_text)

train_char_features = char_vectorizer.transform(train_text)
valid_char_features = char_vectorizer.transform(val_text)

In [None]:
train_features = hstack([train_char_features, train_word_features])
del train_char_features,train_word_features

val_features = hstack([valid_char_features, valid_word_features])
del valid_char_features,valid_word_features

In [None]:
print(train_features.shape)
print(val_features.shape)

(9969, 65000)
(1246, 65000)


In [None]:
label2id = {label:id for id, label in enumerate(train_df['target'].unique())}
id2label = {id:label for label,id in label2id.items()}

train_df['target'] = train_df['target'].map(label2id)
val_df['target'] = val_df['target'].map(label2id)
test_df['target'] = test_df['target'].map(label2id)

In [None]:
val_features, len(val_df['is_offensive'].tolist())

(<1246x65000 sparse matrix of type '<class 'numpy.float64'>'
 	with 194998 stored elements in Compressed Sparse Row format>,
 1246)

In [None]:
train_features, len(train_df['is_offensive'].tolist())

(<9969x65000 sparse matrix of type '<class 'numpy.float64'>'
 	with 1544081 stored elements in Compressed Sparse Row format>,
 9969)

In [None]:
import xgboost as xgb

target_name = "is_offensive"

xgb_params = {
    'eta': 0.3, 
    'max_depth': 5, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'objective':'binary:logistic',
    'num_class':2,
    'seed': 23
          }

# Eğitim sırasında F1 skoru hesaplamak ve eğitimi durdurmak için early stopping kullanın
eval_set = [(train_features, train_df[target_name].tolist()), (val_features, val_df[target_name].tolist())]

model = xgb.XGBClassifier()

model.fit(train_features, 
          train_df[target_name].tolist(), 
          early_stopping_rounds=10,
          eval_set=eval_set)

[0]	validation_0-logloss:0.59388	validation_1-logloss:0.60385
[1]	validation_0-logloss:0.53360	validation_1-logloss:0.54806
[2]	validation_0-logloss:0.48919	validation_1-logloss:0.51042
[3]	validation_0-logloss:0.45438	validation_1-logloss:0.47979
[4]	validation_0-logloss:0.43046	validation_1-logloss:0.45853
[5]	validation_0-logloss:0.40601	validation_1-logloss:0.43888
[6]	validation_0-logloss:0.39079	validation_1-logloss:0.42668
[7]	validation_0-logloss:0.37460	validation_1-logloss:0.41657
[8]	validation_0-logloss:0.36510	validation_1-logloss:0.40836
[9]	validation_0-logloss:0.35238	validation_1-logloss:0.39877
[10]	validation_0-logloss:0.34307	validation_1-logloss:0.39154
[11]	validation_0-logloss:0.33564	validation_1-logloss:0.38514
[12]	validation_0-logloss:0.32733	validation_1-logloss:0.37969
[13]	validation_0-logloss:0.31935	validation_1-logloss:0.37282
[14]	validation_0-logloss:0.31421	validation_1-logloss:0.36739
[15]	validation_0-logloss:0.30603	validation_1-logloss:0.36245
[1

In [None]:
test_word_features = word_vectorizer.transform(test_text)
test_char_features = char_vectorizer.transform(test_text)

test_features = hstack([test_char_features, test_word_features])

In [None]:
predictions = model.predict(test_features)

In [None]:
predictions

array([0, 1, 1, ..., 1, 1, 1])

In [None]:
gold = test_df[target_name]

In [None]:
from sklearn.metrics import f1_score
f1 = f1_score(gold, predictions, average='macro')
f1

0.8337014139837071

In [None]:
from sklearn.metrics import f1_score, classification_report
print(classification_report(gold, predictions))

              precision    recall  f1-score   support

           0       0.82      0.70      0.75       349
           1       0.89      0.94      0.91       897

    accuracy                           0.87      1246
   macro avg       0.85      0.82      0.83      1246
weighted avg       0.87      0.87      0.87      1246

