In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

train_df['cleaned_tweet'] = train_df['tweet'].apply(preprocess_text)
test_df['cleaned_tweet'] = test_df['tweet'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/orkunkinay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/orkunkinay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/orkunkinay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
Q1 = train_df['cleaned_tweet'].apply(len).quantile(0.25)
Q3 = train_df['cleaned_tweet'].apply(len).quantile(0.75)
IQR = Q3 - Q1
outlier_threshold_high = Q3 + 1.5 * IQR

outlier_threshold_high = int(outlier_threshold_high)

train_df_capped_outliers = train_df.copy()
train_df_capped_outliers['cleaned_tweet'] = train_df_capped_outliers['cleaned_tweet'].apply(
    lambda x: x if len(x) <= outlier_threshold_high else x[:outlier_threshold_high]
)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**encodings)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

train_features_capped_outliers = get_bert_embeddings(train_df_capped_outliers['cleaned_tweet'].tolist())
test_features = get_bert_embeddings(test_df['cleaned_tweet'].tolist())

np.save('train_features_capped_outliers.npy', train_features_capped_outliers)
np.save('test_features.npy', test_features)



In [7]:
def manual_oversample(X, y):
    X_majority = X[y == 0]
    y_majority = y[y == 0]
    X_minority = X[y == 1]
    y_minority = y[y == 1]

    num_samples_to_generate = len(y_majority) - len(y_minority)

    indices = np.random.choice(range(len(X_minority)), size=num_samples_to_generate, replace=True)
    X_oversampled = np.vstack([X_majority, X_minority, X_minority[indices]])
    y_oversampled = np.hstack([y_majority, y_minority, y_minority[indices]])

    return X_oversampled, y_oversampled

X_resampled_capped, y_resampled_capped = manual_oversample(train_features_capped_outliers, train_df_capped_outliers['label'].values)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled_capped, y_resampled_capped, test_size=0.2, random_state=42)

In [9]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_val)
print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred))

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.
Logistic Regression Accuracy: 0.8925807537012113


In [10]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
print("Random Forest Accuracy:", accuracy_score(y_val, y_pred_rf))

Random Forest Accuracy: 0.9970558546433378


In [11]:
gb = GradientBoostingClassifier(n_estimators=100)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_val)
print("Gradient Boosting Accuracy:", accuracy_score(y_val, y_pred_gb))

Gradient Boosting Accuracy: 0.8940948855989233


In [12]:
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_val)
print("SVM Accuracy:", accuracy_score(y_val, y_pred_svm))

SVM Accuracy: 0.8965343203230148
