In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /home/mehdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mehdi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
dataset = pd.read_csv('./data/ruddit.csv')
sentences = dataset['comment_text']
x_train, x_test, y_train, y_test = train_test_split(dataset['comment_text'], dataset['offensiveness_score'], train_size=0.75, test_size=0.25, random_state=0)
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

## Data Cleaning

In [None]:

def clean_corpus(corpus):
    tokenized_clean_docs = []
    cleaned_docs = []
    for doc in corpus:
        text_data = re.sub('[^a-zA-Z]', ' ', doc)
        text_data = text_data.lower()
        text_data = text_data.split()
        wl = WordNetLemmatizer()
        # text_data = [wl.lemmatize(word) for word in text_data if not word in set(stopwords.words('english'))]
        text_data = ' '.join(text_data)
        cleaned_docs.append(text_data)
        tokenized_clean_docs.append(word_tokenize(text_data))

    cleaned_docs = pd.Series(cleaned_docs)
    return cleaned_docs, tokenized_clean_docs


In [None]:
x_train_cleaned, x_train_tokenized = clean_corpus(x_train)
x_test_cleaned, x_test_tokenized = clean_corpus(x_test)

## Text Representations

In [None]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer object
vectorizer = CountVectorizer()
# Use the fit_transform method to transform the sentences into a bag of words
bow = vectorizer.fit_transform(cleaned_sentences)
# Print the vocabulary (features) of the bag of words
print(vectorizer.get_feature_names())
# Print the bag of words
print(bow.toarray())
print(bow.shape)  

In [None]:
# N-grams

In [None]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()
# Use the fit_transform method to transform the documents into a TF-IDF matrix
tfidf = vectorizer.fit_transform(sentences)
# Print the vocabulary (features) of the TF-IDF matrix
print(vectorizer.get_feature_names())
# Print the TF-IDF matrix
print(tfidf.shape)

## Word2Vec

Training the dataset for a w2v model using gensim

In [None]:
from gensim.models import Word2Vec
import gensim.downloader as api 

w2v = Word2Vec(tokenized_clean_sentences, min_count=1,vector_size=300)

print(w2v)


Loads gensim pretrained models

In [None]:
print(list(api.info()['models'].keys()))
w2vec_google_news_model = api.load('word2vec-google-news-300')
glove_twitter_model = api.load('glove-twitter-200')
fasttext_wiki_news_model = api.load('fasttext-wiki-news-subwords-300')

Words that are not in the embedding model vocabulary

In [None]:
def words_not_in_vocab(sentences_tokens, w2v_model):
    not_in_words = []
    for sentence_tokens in sentences_tokens:
        for word in sentence_tokens:
            if word not in w2v_model and word not in not_in_words:
                not_in_words.append(word)
    return not_in_words

print(len(words_not_in_vocab(tokenized_clean_sentences, w2vec_google_news_model)))
print(len(words_not_in_vocab(tokenized_clean_sentences, glove_twitter_model)))
print(len(words_not_in_vocab(tokenized_clean_sentences, fasttext_wiki_news_model)))


In [None]:
# TODO: retrain the fasttext model with vocabularies that are not existed

A vectorize method for calculate the w2v of a sentence

In [None]:
def vectorize(sentence_tokens, w2v_model, vector_size=300):
    words_vecs = [w2v_model[word] for word in sentence_tokens if word in w2v_model]
    if len(words_vecs) == 0:
        return np.zeros(vector_size)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [None]:
x_train_vectorized = ([vectorize(tokens, fasttext_wiki_news_model) for tokens in x_train_tokenized])
x_test_vectorized = ([vectorize(tokens, fasttext_wiki_news_model) for tokens in x_test_tokenized])

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

# clf = LogisticRegression()
# clf.fit(x_train_vectorized, y_train)

linear_reg = LinearRegression()
linear_reg.fit(x_train_vectorized, y_train)


svr_reg = SVR(kernel = 'rbf')
svr_reg.fit(x_train_vectorized, y_train)

mlp_reg = MLPRegressor(random_state=1, max_iter=500)
mlp_reg.fit(x_train_vectorized, y_train)

rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(x_train_vectorized, y_train)




In [None]:
# Inference
def model_inference(sentence, model):
    tokens = sentence.split()
    sentence_embedding = vectorize(tokens, fasttext_wiki_news_model)
    return model.predict([sentence_embedding])

In [None]:
# test_pred = clf.predict(x_test_vectorized)
svr_preds = svr_reg.predict(x_test_vectorized)
linear_preds = linear_reg.predict(x_test_vectorized)
mlp_preds = mlp_reg.predict(x_test_vectorized)
rf_preds = rf_reg.predict(x_test_vectorized)


## Evaluation Metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report
SVR_MAE = mean_absolute_error(y_test, svr_preds)
SVR_MSE = mean_squared_error(y_test, svr_preds)
REG_MAE = mean_absolute_error(y_test, linear_preds)
REG_MSE = mean_squared_error(y_test, linear_preds)
MLP_MAE = mean_absolute_error(y_test, mlp_preds)
MLP_MSE = mean_squared_error(y_test, mlp_preds)
RF_MAE = mean_absolute_error(y_test, rf_preds)
RF_MSE = mean_squared_error(y_test, rf_preds)

print(f"SVR MAE score is: {SVR_MAE}")
print(f"SVR MSE score is: {SVR_MSE}")
print(svr_reg.score(x_test_vectorized, y_test))
print(f"REG MAE score is: {REG_MAE}")
print(f"REG MSE score is: {REG_MSE}")
print(linear_reg.score(x_test_vectorized, y_test))
print(f"MLP MAE score is: {MLP_MAE}")
print(f"MLP MSE score is: {MLP_MSE}")
print(mlp_reg.score(x_test_vectorized, y_test))
print(f"RF MAE score is: {RF_MAE}")
print(f"RF MSE score is: {RF_MSE}")
print(rf_reg.score(x_test_vectorized, y_test))


In [None]:
import matplotlib.pyplot as plt
# plt.plot(y_test[:100])
plt.plot(svr_preds[:100])
plt.plot(linear_preds[:100])
plt.plot(mlp_preds[:100])

In [None]:
# model_inference("youre fucking nice ", regressor)
# for index, pred in enumerate(svr_preds):
    # if (pred - y_test[index])**2 > 0.5:
    #     print(x_test[index], pred, y_test[index])

# Dataset Prepreation

In [3]:
dataset = pd.read_csv('./data/ruddit.csv')
x_train, x_test_valid, y_train, y_test_valid = train_test_split(dataset["comment_text"], dataset['offensiveness_score'] , train_size=0.8, test_size=0.2, random_state=0)
x_test, x_valid, y_test, y_valid = train_test_split(x_test_valid, y_test_valid, test_size=0.5, random_state=0)
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
x_valid = x_valid.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)
train = pd.DataFrame({'text': x_train, 'score': y_train})
test = pd.DataFrame({'text': x_test, 'score': y_test})
valid = pd.DataFrame({'text': x_valid, 'score': y_valid})
train['score'] = train['score'].astype('float32')
test['score'] = test['score'].astype('float32')
valid['score'] = valid['score'].astype('float32')
print(valid.dtypes)

text      object
score    float32
dtype: object


## Fine Tuning LLMs

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

BASE_MODEL = "bert-base-cased"
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 2

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
train_encodings = tokenizer(list(train['text']), truncation=True, padding=True)
val_encodings = tokenizer(list(valid['text']), truncation=True, padding=True)
test_encodings = tokenizer(list(test['text']), truncation=True, padding=True)

In [6]:
import torch

class RudditDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = RudditDataset(train_encodings, train['score'])
val_dataset = RudditDataset(val_encodings, valid['score'])
test_dataset = RudditDataset(test_encodings, test['score'])

In [7]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)
    
    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    single_squared_errors = ((logits - labels).flatten()**2).tolist()
    
    # Compute accuracy 
    # Based on the fact that the rounded score = true score only if |single_squared_errors| < 0.5
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
    
    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./models/fine-tuned-regression-1",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

  return torch._C._cuda_getDeviceCount() > 0


In [9]:

import torch

class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [10]:
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

  0%|          | 0/564 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.030178766697645187, 'eval_mse': 0.030178766697645187, 'eval_mae': 0.1319590061903, 'eval_r2': 0.7650108361277528, 'eval_accuracy': 0.9911347517730497, 'eval_runtime': 68.5199, 'eval_samples_per_second': 8.231, 'eval_steps_per_second': 0.525, 'epoch': 1.0}
{'loss': 0.0433, 'learning_rate': 2.269503546099291e-06, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.027695482596755028, 'eval_mse': 0.027695482596755028, 'eval_mae': 0.1253727823495865, 'eval_r2': 0.7843471120679827, 'eval_accuracy': 0.9893617021276596, 'eval_runtime': 78.2377, 'eval_samples_per_second': 7.209, 'eval_steps_per_second': 0.46, 'epoch': 2.0}
{'train_runtime': 4225.7312, 'train_samples_per_second': 2.133, 'train_steps_per_second': 0.133, 'train_loss': 0.041004494360998164, 'epoch': 2.0}


TrainOutput(global_step=564, training_loss=0.041004494360998164, metrics={'train_runtime': 4225.7312, 'train_samples_per_second': 2.133, 'train_steps_per_second': 0.133, 'train_loss': 0.041004494360998164, 'epoch': 2.0})

In [11]:
trainer.eval_dataset=test_dataset
trainer.evaluate()

  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.032084871083498,
 'eval_mse': 0.032084871083498,
 'eval_mae': 0.13935185968875885,
 'eval_r2': 0.7126925165259967,
 'eval_accuracy': 0.9875666074600356,
 'eval_runtime': 77.3045,
 'eval_samples_per_second': 7.283,
 'eval_steps_per_second': 0.466,
 'epoch': 2.0}