In [17]:
import pandas as pd

train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

In [18]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

train_df['cleaned_tweet'] = train_df['tweet'].apply(preprocess_text)
test_df['cleaned_tweet'] = test_df['tweet'].apply(preprocess_text)

original_train_df = train_df.copy()

print(train_df.head())
print(test_df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/orkunkinay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/orkunkinay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/orkunkinay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


   id  label                                              tweet  \
0   1      0   @user when a father is dysfunctional and is s...   
1   2      0  @user @user thanks for #lyft credit i can't us...   
2   3      0                                bihday your majesty   
3   4      0  #model   i love u take with u all the time in ...   
4   5      0             factsguide: society now    #motivation   

                                       cleaned_tweet  
0  father dysfunctional selfish drag kid dysfunct...  
1  thanks lyft credit cant use cause dont offer w...  
2                                     bihday majesty  
3                        model love u take u time ur  
4                      factsguide society motivation  
      id                                              tweet  \
0  31963  #studiolife #aislife #requires #passion #dedic...   
1  31964   @user #white #supremacists want everyone to s...   
2  31965  safe ways to heal your #acne!!    #altwaystohe...   
3  31966  is th

In [19]:
# Method 1: Remove outliers based on text length
Q1 = train_df['cleaned_tweet'].apply(len).quantile(0.25)
Q3 = train_df['cleaned_tweet'].apply(len).quantile(0.75)
IQR = Q3 - Q1
outlier_threshold_high = Q3 + 1.5 * IQR

train_df_removed_outliers = train_df[train_df['cleaned_tweet'].apply(len) <= outlier_threshold_high]
print(f"Number of observations after removing outliers: {train_df_removed_outliers.shape[0]}")


Number of observations after removing outliers: 31957


In [22]:
# Method 2: Cap outliers at threshold
outlier_threshold_high = int(outlier_threshold_high)

train_df_capped_outliers = train_df.copy()
train_df_capped_outliers['cleaned_tweet'] = train_df_capped_outliers['cleaned_tweet'].apply(
    lambda x: x if len(x) <= outlier_threshold_high else x[:outlier_threshold_high]
)
print(f"Number of observations after capping outliers: {train_df_capped_outliers.shape[0]}")


Number of observations after capping outliers: 31962


In [23]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**encodings)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

train_features_removed_outliers = get_bert_embeddings(train_df_removed_outliers['cleaned_tweet'].tolist())
train_features_capped_outliers = get_bert_embeddings(train_df_capped_outliers['cleaned_tweet'].tolist())
test_features = get_bert_embeddings(test_df['cleaned_tweet'].tolist())

np.save('train_features_removed_outliers.npy', train_features_removed_outliers)
np.save('train_features_capped_outliers.npy', train_features_capped_outliers)
np.save('test_features.npy', test_features)




In [26]:
def manual_oversample(X, y):
    X_majority = X[y == 0]
    y_majority = y[y == 0]
    X_minority = X[y == 1]
    y_minority = y[y == 1]

    num_samples_to_generate = len(y_majority) - len(y_minority)

    indices = np.random.choice(range(len(X_minority)), size=num_samples_to_generate, replace=True)
    X_oversampled = np.vstack([X_majority, X_minority, X_minority[indices]])
    y_oversampled = np.hstack([y_majority, y_minority, y_minority[indices]])

    return X_oversampled, y_oversampled

X_resampled_removed, y_resampled_removed = manual_oversample(train_features_removed_outliers, train_df_removed_outliers['label'].values)
X_resampled_capped, y_resampled_capped = manual_oversample(train_features_capped_outliers, train_df_capped_outliers['label'].values)

print("Class distribution after manual oversampling (removed outliers):")
print(pd.Series(y_resampled_removed).value_counts())
print("Class distribution after manual oversampling (capped outliers):")
print(pd.Series(y_resampled_capped).value_counts())


Class distribution after manual oversampling (removed outliers):
0    29715
1    29715
dtype: int64
Class distribution after manual oversampling (capped outliers):
0    29720
1    29720
dtype: int64


In [28]:
from sklearn.neighbors import NearestNeighbors

def custom_smote(X, y, k_neighbors=5):
    X_majority = X[y == 0]
    y_majority = y[y == 0]
    X_minority = X[y == 1]
    y_minority = y[y == 1]

    neighbors = NearestNeighbors(n_neighbors=k_neighbors).fit(X_minority)
    num_samples_to_generate = len(y_majority) - len(y_minority)
    X_synthetic = []
    y_synthetic = []

    for _ in range(num_samples_to_generate):
        index = np.random.randint(0, len(X_minority))
        nn_index = neighbors.kneighbors(X_minority[index].reshape(1, -1), return_distance=False).flatten()
        neighbor = X_minority[nn_index[np.random.randint(1, k_neighbors)]]
        diff = neighbor - X_minority[index]
        gap = np.random.rand()
        X_synthetic.append(X_minority[index] + gap * diff)
        y_synthetic.append(1)

    X_smote = np.vstack([X_majority, X_minority, np.array(X_synthetic)])
    y_smote = np.hstack([y_majority, y_minority, np.array(y_synthetic)])

    return X_smote, y_smote

X_smote_removed, y_smote_removed = custom_smote(train_features_removed_outliers, train_df_removed_outliers['label'].values)
X_smote_capped, y_smote_capped = custom_smote(train_features_capped_outliers, train_df_capped_outliers['label'].values)

print("Class distribution after custom SMOTE (removed outliers):")
print(pd.Series(y_smote_removed).value_counts())
print("Class distribution after custom SMOTE (capped outliers):")
print(pd.Series(y_smote_capped).value_counts())


Class distribution after custom SMOTE (removed outliers):
0    29715
1    29715
dtype: int64
Class distribution after custom SMOTE (capped outliers):
0    29720
1    29720
dtype: int64


In [34]:
class HateSpeechDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        return torch.tensor(feature, dtype=torch.float), torch.tensor(label, dtype=torch.long)

class HateSpeechClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(HateSpeechClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def train_model(train_loader, model, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
    return model

def evaluate_model(val_loader, model):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in val_loader:
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    return accuracy

hidden_dim = 128
batch_size = 32
learning_rate = 0.001

X_train, X_val, y_train, y_val = train_test_split(train_features_removed_outliers, train_df_removed_outliers['label'].values, test_size=0.2, random_state=42)
val_dataset = HateSpeechDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train_dataset_base = HateSpeechDataset(X_train, y_train)
train_loader_base = DataLoader(train_dataset_base, batch_size=batch_size, shuffle=True)

train_dataset_ros_removed = HateSpeechDataset(X_resampled_removed, y_resampled_removed)
train_loader_ros_removed = DataLoader(train_dataset_ros_removed, batch_size=batch_size, shuffle=True)
train_dataset_ros_capped = HateSpeechDataset(X_resampled_capped, y_resampled_capped)
train_loader_ros_capped = DataLoader(train_dataset_ros_capped, batch_size=batch_size, shuffle=True)

train_dataset_smote_removed = HateSpeechDataset(X_smote_removed, y_smote_removed)
train_loader_smote_removed = DataLoader(train_dataset_smote_removed, batch_size=batch_size, shuffle=True)
train_dataset_smote_capped = HateSpeechDataset(X_smote_capped, y_smote_capped)
train_loader_smote_capped = DataLoader(train_dataset_smote_capped, batch_size=batch_size, shuffle=True)

model_base = HateSpeechClassifier(input_dim=X_train.shape[1], hidden_dim=hidden_dim, output_dim=2)
model_ros_removed = HateSpeechClassifier(input_dim=X_resampled_removed.shape[1], hidden_dim=hidden_dim, output_dim=2)
model_ros_capped = HateSpeechClassifier(input_dim=X_resampled_capped.shape[1], hidden_dim=hidden_dim, output_dim=2)
model_smote_removed = HateSpeechClassifier(input_dim=X_smote_removed.shape[1], hidden_dim=hidden_dim, output_dim=2)
model_smote_capped = HateSpeechClassifier(input_dim=X_smote_capped.shape[1], hidden_dim=hidden_dim, output_dim=2)
criterion = nn.CrossEntropyLoss()
optimizer_base = optim.Adam(model_base.parameters(), lr=learning_rate)
optimizer_ros_removed = optim.Adam(model_ros_removed.parameters(), lr=learning_rate)
optimizer_ros_capped = optim.Adam(model_ros_capped.parameters(), lr=learning_rate)
optimizer_smote_removed = optim.Adam(model_smote_removed.parameters(), lr=learning_rate)
optimizer_smote_capped = optim.Adam(model_smote_capped.parameters(), lr=learning_rate)


model_base = train_model(train_loader_base, model_base, criterion, optimizer_base, epochs=10)
accuracy_base = evaluate_model(val_loader, model_base)
print(f'Validation Accuracy with Base Data: {accuracy_base * 100:.2f}%')

model_ros_removed = train_model(train_loader_ros_removed, model_ros_removed, criterion, optimizer_ros_removed, epochs=10)
accuracy_ros_removed = evaluate_model(val_loader, model_ros_removed)
print(f'Validation Accuracy after Oversampling (removed outliers): {accuracy_ros_removed * 100:.2f}%')

model_ros_capped = train_model(train_loader_ros_capped, model_ros_capped, criterion, optimizer_ros_capped, epochs=10)
accuracy_ros_capped = evaluate_model(val_loader, model_ros_capped)
print(f'Validation Accuracy after Oversampling (capped outliers): {accuracy_ros_capped * 100:.2f}%')

model_smote_removed = train_model(train_loader_smote_removed, model_smote_removed, criterion, optimizer_smote_removed, epochs=10)
accuracy_smote_removed = evaluate_model(val_loader, model_smote_removed)
print(f'Validation Accuracy after SMOTE (removed outliers): {accuracy_smote_removed * 100:.2f}%')

model_smote_capped = train_model(train_loader_smote_capped, model_smote_capped, criterion, optimizer_smote_capped, epochs=10)
accuracy_smote_capped = evaluate_model(val_loader, model_smote_capped)
print(f'Validation Accuracy after SMOTE (capped outliers): {accuracy_smote_capped * 100:.2f}%')

torch.save(model_ros_removed.state_dict(), 'hate_speech_classifier_ros_removed.pth')
torch.save(model_ros_capped.state_dict(), 'hate_speech_classifier_ros_capped.pth')
torch.save(model_smote_removed.state_dict(), 'hate_speech_classifier_smote_removed.pth')
torch.save(model_smote_capped.state_dict(), 'hate_speech_classifier_smote_capped.pth')


Validation Accuracy with Base Data: 95.06%
Validation Accuracy after Oversampling (removed outliers): 99.48%
Validation Accuracy after Oversampling (capped outliers): 99.78%
Validation Accuracy after SMOTE (removed outliers): 99.30%
Validation Accuracy after SMOTE (capped outliers): 99.22%
