In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the NLTK corpora this notebook relies on: 'stopwords' (read through
# `stopwords.words('english')`) and 'wordnet' (presumably the data behind
# `WordNetLemmatizer` -- TODO confirm). The last call is the cell's final
# expression, so its return value is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/mehdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mehdi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Pickle saving helpers

In [2]:
import pickle
def save_data(object_name, object):
    """Pickle `object` to ``./data/pickles/<object_name>.pickle``.

    The target directory is created first when it is missing. Without that,
    the very first run on a fresh checkout fails with FileNotFoundError, and
    `load_data` (which calls this helper after computing a value on a cache
    miss) would lose the freshly computed object.

    Args:
        object_name: File stem of the pickle (no directory, no extension).
        object: The Python object to serialise. The name shadows the builtin
            but is kept so existing keyword callers keep working.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    import os  # local import so the helper does not depend on another cell

    target_path = f'./data/pickles/{object_name}.pickle'
    # exist_ok=True makes repeated saves into the same directory a no-op here.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'wb') as f:
        pickle.dump(object, f)

def load_data(object_name, calculator, *args):
    """Return the cached object named `object_name`, computing it on a miss.

    Looks for ``./data/pickles/<object_name>.pickle``. When the file exists
    its unpickled content is returned and `calculator` is never called. When
    opening/loading raises FileNotFoundError, `calculator(*args)` is called,
    its result is stored through `save_data` and then returned.

    Args:
        object_name: File stem of the cache entry.
        calculator: Callable producing the object when no cache file exists.
        *args: Positional arguments forwarded to `calculator`.

    Note:
        `pickle.load` can execute arbitrary code embedded in the file, so the
        cache directory should only contain pickles produced by this notebook.
    """
    pickle_path = f'./data/pickles/{object_name}.pickle'
    try:
        with open(pickle_path, 'rb') as cache_file:
            cached = pickle.load(cache_file)
    except FileNotFoundError:
        # Cache miss: build the object, persist it for next time, hand it back.
        fresh = calculator(*args)
        save_data(object_name, fresh)
        return fresh
    return cached

In [3]:
PATH = './data/ruddit.csv'
TRAIN_RATIO = 0.75
# NOTE(review): this is the integer 1 and is passed as `test_size` below. The
# recorded output of this cell, (4225, 1408, 1), shows the validation split
# ends up with a single row, so the value acts as a row count rather than a
# ratio -- confirm whether a float such as 0.5 was intended.
TEST_VAL_RATIO = 1


def _as_frame(texts, scores):
    """Build a fresh-indexed frame with 'text' and float32 'score' columns."""
    return pd.DataFrame({
        'text': texts.reset_index(drop=True),
        'score': scores.reset_index(drop=True).astype('float32'),
    })


dataset = pd.read_csv(PATH)

# First cut: train vs. everything else; second cut: test vs. validation.
x_train, x_test_valid, y_train, y_test_valid = train_test_split(
    dataset["comment_text"],
    dataset['offensiveness_score'],
    train_size=TRAIN_RATIO,
    random_state=0,
)
x_test, x_valid, y_test, y_valid = train_test_split(
    x_test_valid,
    y_test_valid,
    test_size=TEST_VAL_RATIO,
    random_state=0,
)

train_raw = _as_frame(x_train, y_train)
test_raw = _as_frame(x_test, y_test)
valid_raw = _as_frame(x_valid, y_valid)

# Only the three *_raw frames are needed from here on.
del x_train, x_test, x_valid, y_train, y_test, y_valid, x_test_valid, y_test_valid
len(train_raw), len(test_raw), len(valid_raw)

(4225, 1408, 1)

Dataset Cleaning

In [4]:
# Shared resources for `preprocess_text`: the English stop-word list as a set
# (membership tests per token) and a single reusable WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
wl = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one comment into a space-separated string of lemmas.

    Steps, in order: every character that is not an ASCII letter becomes a
    space; the text is lower-cased and split on whitespace; tokens found in
    the module-level `stop_words` set are dropped; the remaining tokens are
    lemmatized with the module-level `wl` lemmatizer and joined with single
    spaces.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string (empty when no token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    kept = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        kept.append(wl.lemmatize(token))
    return ' '.join(kept)

def clean_dataset(ds):
    """Replace the 'text' column of `ds` with its preprocessed version.

    The frame is modified in place and also returned, so callers that want to
    keep the raw text must pass a copy.

    Args:
        ds: DataFrame with a 'text' column of strings.

    Returns:
        The same `ds` object, with 'text' run through `preprocess_text`.
    """
    cleaned_text = ds['text'].apply(preprocess_text)
    ds['text'] = cleaned_text
    return ds

# Clean copies of each split; the *_raw frames keep the original text.
train, test, valid = (
    clean_dataset(raw_split.copy())
    for raw_split in (train_raw, test_raw, valid_raw)
)

Bag of words representations

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary on the training split only, then project the test and
# validation splits onto that same vocabulary with `transform`.
#
# Calling `fit_transform` on every split (as this cell used to) builds a new
# vocabulary each time, so column j of the test matrix refers to a different
# word than column j of the training matrix; zero-padding only made the widths
# match. It would also fail outright if an evaluation split had more distinct
# words than the training split (negative pad width).
vectorizer = CountVectorizer()
bow_train = vectorizer.fit_transform(train['text']).toarray()
bow_test = vectorizer.transform(test['text']).toarray()
bow_valid = vectorizer.transform(valid['text']).toarray()

# All three matrices now share the training vocabulary's column layout.
assert bow_train.shape[1] == bow_test.shape[1] == bow_valid.shape[1]

bow_train.shape, bow_test.shape, bow_valid.shape

((4225, 10882), (1408, 10882), (1, 10882))

TFIDF representations

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and IDF weights on the training split only, then
# reuse them for the test and validation splits with `transform`.
#
# Re-fitting on each split (as this cell used to) gives every split its own
# vocabulary and its own IDF weights, so the columns of the evaluation
# matrices did not correspond to the columns a model was trained on;
# zero-padding only equalised the widths.
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(train['text']).toarray()
tfidf_test = vectorizer.transform(test['text']).toarray()
tfidf_valid = vectorizer.transform(valid['text']).toarray()

# All three matrices now share the training vocabulary's column layout.
assert tfidf_train.shape[1] == tfidf_test.shape[1] == tfidf_valid.shape[1]

tfidf_train.shape, tfidf_test.shape, tfidf_valid.shape

((4225, 10882), (1408, 10882), (1, 10882))

Embedding representations

In [7]:
from gensim.models import Word2Vec
import gensim.downloader as api 

# Load three pretrained 300-dimensional embedding models through the pickle
# cache: on a cache miss `load_data` calls `api.load(<gensim name>)` and stores
# the result under the given cache name.
# NOTE(review): the variable and cache entry are called `glove_twitter_model`,
# but the model requested is 'glove-wiki-gigaword-300' -- confirm which one is
# intended before renaming or swapping.
w2vec_google_news_model = load_data('w2vec_google_news_model', api.load, 'word2vec-google-news-300')
glove_twitter_model = load_data('glove_twitter_model', api.load, 'glove-wiki-gigaword-300')
fasttext_wiki_news_model = load_data('fasttext_wiki_news_model', api.load, 'fasttext-wiki-news-subwords-300')


In [43]:
# Whitespace-tokenise the *raw* (uncleaned) comments of the train and test
# splits. Switch the source frames to `train` / `test` to embed cleaned text.
tokenized_train = [comment.split() for comment in train_raw['text']]
tokenized_test = [comment.split() for comment in test_raw['text']]

In [44]:
def vectorize(sentence_tokens, w2v_model, vector_size=300):
    """Average the embedding vectors of the tokens known to `w2v_model`.

    Args:
        sentence_tokens: Iterable of token strings for one document.
        w2v_model: Any mapping-like object supporting `token in model` and
            `model[token]`, the latter returning a vector.
        vector_size: Length of the zero vector returned when no token of the
            document is present in the model.

    Returns:
        The element-wise mean of the known tokens' vectors, or
        `np.zeros(vector_size)` when none of the tokens is known.
    """
    known_vectors = []
    for token in sentence_tokens:
        if token in w2v_model:
            known_vectors.append(w2v_model[token])

    # No in-vocabulary token: fall back to an all-zero document vector.
    if not known_vectors:
        return np.zeros(vector_size)

    return np.array(known_vectors).mean(axis=0)

def _embed_corpus(token_lists, embedding_model):
    """Return one averaged `vectorize` vector per tokenised document."""
    return [vectorize(tokens, embedding_model) for tokens in token_lists]


# One list of document vectors per (embedding model, split) pair.
train_fasttext = _embed_corpus(tokenized_train, fasttext_wiki_news_model)
test_fasttext = _embed_corpus(tokenized_test, fasttext_wiki_news_model)
train_v2w = _embed_corpus(tokenized_train, w2vec_google_news_model)
test_v2w = _embed_corpus(tokenized_test, w2vec_google_news_model)
train_glove = _embed_corpus(tokenized_train, glove_twitter_model)
test_glove = _embed_corpus(tokenized_test, glove_twitter_model)


LLM representations

In [None]:
# from transformers import BertModel, BertTokenizer

# model = BertModel.from_pretrained('bert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# import torch

# def bert_embeddings(ds):
#     # Tokenize the texts
#     tokenized_inputs = tokenizer(list(ds), truncation=True, padding=True, return_tensors="pt")

#     # Generate BERT embeddings
#     with torch.no_grad():
#         model_output = model(**tokenized_inputs)

#     # Extract the embeddings
#     embeddings = model_output.last_hidden_state
    
#     return embeddings.mean(dim=1).numpy()

In [None]:
# train_raw['text']

In [None]:
# train_bert_raw = bert_embeddings(train_raw['text'])
# test_bert_raw = bert_embeddings(test_raw['text'])

In [None]:
# train_bert = bert_embeddings(train['text'])
# test_bert = bert_embeddings(test['text'])

In [None]:
import torch

def bert_text_embeddings(input_text):
    """Embed a single text by averaging the model's first output over dim 1.

    The text is encoded with the notebook-level `tokenizer` (special tokens
    added), wrapped into a batch of one, and passed through the notebook-level
    `model` with gradients disabled.

    NOTE(review): in the visible notebook `tokenizer` and `model` are only
    assigned in commented-out lines, so this function raises NameError unless
    they are defined elsewhere -- confirm before use.

    Args:
        input_text: The text to embed.

    Returns:
        A tensor holding element 0 of the model output averaged over dim 1.
    """
    token_ids = tokenizer.encode(input_text, add_special_tokens=True)
    batch = torch.tensor([token_ids])

    with torch.no_grad():
        # The model output is indexable; element 0 is taken as the last
        # hidden states (as the original variable name suggests).
        hidden_states = model(batch)[0]

    return hidden_states.mean(1)

In [45]:
# Feature sets keyed by representation name. The sparse-count entries hold
# (train, test, valid); the embedding entries only hold (train, test) because
# no validation embeddings are computed above.
datasets = dict(
    bow=(bow_train, bow_test, bow_valid),
    tfidf=(tfidf_train, tfidf_test, tfidf_valid),
    fasttext=(train_fasttext, test_fasttext),
    word2vec=(train_v2w, test_v2w),
    glove=(train_glove, test_glove),
)

In [46]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

# The four regressors compared on each representation; the stochastic ones
# get a fixed random_state so repeated runs give the same model.
linear_reg = LinearRegression()
svr_reg = SVR(kernel='rbf')
mlp_reg = MLPRegressor(max_iter=500, random_state=1)
rf_reg = RandomForestRegressor(random_state=42, n_estimators=100)


In [56]:
# Pick the representation to evaluate; the first two entries of every
# `datasets` value are the train and test features.
model_train, model_test = datasets['glove'][:2]

In [57]:
# Fit every regressor on the selected training features against the
# offensiveness scores of the training split.
train_scores = train['score']
linear_reg.fit(model_train, train_scores)
svr_reg.fit(model_train, train_scores)
mlp_reg.fit(model_train, train_scores)
rf_reg.fit(model_train, train_scores)

In [58]:
# Test-split predictions, one array per fitted regressor.
linear_preds = linear_reg.predict(model_test)
svr_preds = svr_reg.predict(model_test)
mlp_preds = mlp_reg.predict(model_test)
rf_preds = rf_reg.predict(model_test)

In [59]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report
# Mean squared error of each regressor against the test-split scores.
test_scores = test['score']
linear_mse = mean_squared_error(test_scores, linear_preds)
svr_mse = mean_squared_error(test_scores, svr_preds)
mlp_mse = mean_squared_error(test_scores, mlp_preds)
rf_mse = mean_squared_error(test_scores, rf_preds)
print(f'SVR MSE: {svr_mse}, Linear MSE: {linear_mse}, MLP MSE: {mlp_mse}, RF MSE: {rf_mse}')

SVR MSE: 0.07340861310598293, Linear MSE: 0.09466922426057676, MLP MSE: 0.10379193202544125, RF MSE: 0.08534953326234156


In [51]:
# Hand-recorded results of earlier runs, kept as literals so they survive a
# kernel restart.
#
# Timings per embedding for the cleaned-text runs.
# NOTE(review): unit and what exactly was timed are not shown -- presumably
# seconds; TODO confirm.
times_clean = {
    'fasttext': 50.6,
    'word2vec': 48.9,
    'glove': 49.4
}
# Same timings for the raw ("dirty") text runs.
times_dirty = {
    'fasttext': 55.7,
    'word2vec': 54.3,
    'glove': 54.2
}
# Test MSE per embedding and regressor for the cleaned-text runs.
# NOTE(review): keys here are 'fasttext' / 'Word2Vec' / 'Glove', while the
# timing dicts above use 'fasttext' / 'word2vec' / 'glove' -- lookups that
# share a key across these dicts will miss.
results_mse_cleaned = {
    'fasttext': {'SVR': 0.04685816665354699, 'Linear': 0.07273326632161381, 'MLP': 0.05643423252877395, 'RF': 0.06449617951793207},
    'Word2Vec': {'SVR': 0.04816666309223599, 'Linear': 0.07068408441559317, 'MLP': 0.0655118452881107, 'RF': 0.07014625622457209},
    'Glove': {'SVR': 0.05030112790450738, 'Linear': 0.0782962824967885, 'MLP': 0.07334035831215925, 'RF': 0.07286072318841706}
}
# Test MSE per embedding and regressor for the raw-text runs; the 'Glove' row
# equals the values printed by the evaluation cell above.
results_mse_dirty = {
    'fasttext': {'SVR': 0.058821879162422326, 'Linear': 0.08009510252213108, 'MLP': 0.08096923470973573, 'RF': 0.07731245249116338},
    'Word2Vec': {'SVR': 0.05957748697334124, 'Linear': 0.08946030570644542, 'MLP': 0.09698301239891266, 'RF': 0.08042184426460584},
    'Glove': {'SVR': 0.07340861310598293, 'Linear': 0.09466922426057676, 'MLP': 0.10379193202544125, 'RF': 0.08534953326234156}
}