# Real-World Data Tweet Analysis

Analysis of Real-World Data (RWD) found in tweets.

In [1]:
import os
import transformers
import datasets
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import xgboost
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## 1. DistilBERT

### 1.1. Task setup

In [3]:
# GLUE-style task identifier; selects the metric here and the label count further down.
task = "cola"
# Checkpoint name used for both the tokenizer and the model to fine-tune.
model_name = "distilbert-base-uncased"
# Per-device training batch size handed to TrainingArguments.
batch_size = 1
metric = load_metric("glue", task)

  metric = load_metric('glue', task)
Downloading builder script: 5.76kB [00:00, ?B/s]                                                                       


### 1.2. Load dataset

In [None]:
# Load the train/test CSV splits with an explicit schema: a string "Message"
# column and a two-class "label" column.
dataset = load_dataset(
    "csv",
    data_files={
        "train": "data/training.csv",
        "test": "data/test.csv",
    },
    features=datasets.Features(
        {
            "Message": datasets.Value("string"),
            "label": datasets.ClassLabel(names=[0, 1]),
        }
    ),
)

In [6]:
def preprocess_function(examples):
    """Tokenize the text column of a batch of examples.

    Reads the column named by the module-level ``text_key`` and passes it to
    the module-level ``tokenizer`` with truncation enabled.

    Args:
        examples: Mapping from column name to values, as supplied by
            ``Dataset.map``.

    Returns:
        Whatever the tokenizer returns for the selected texts.
    """
    texts = examples[text_key]
    return tokenizer(texts, truncation=True)

In [7]:
# Name of the dataset column that holds the text; read by preprocess_function.
text_key = "Message"

In [8]:
# Tokenizer matching the checkpoint selected in the task setup (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

In [9]:
# Inspect the schema of the training split.
dataset["train"].features

{'Message': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=[0, 1], names_file=None, id=None)}

In [None]:
# Tokenize every split in batches; the tokenizer outputs are added as new columns.
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

In [11]:
# Size of the classification head: three classes for the MNLI task variants,
# a single output for STS-B, and two classes for every other task.
# (The prefix was previously misspelled "mlni", so the MNLI branch never matched.)
num_labels = 3 if task.startswith("mnli") else 1 if task == "stsb" else 2

In [None]:
# Local directory where the fine-tuned model is saved and later reloaded from.
modelCheckpoint = f"models/{model_name}"

### 1.3. Finetune a model

In [13]:
# Load the pretrained checkpoint with a sequence-classification head of `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [14]:
# Metric of interest for this task (not referenced by the training arguments below).
metric_name = "f1"

# Fine-tuning hyperparameters; checkpoints and logs go to the "test-glue" directory.
args = TrainingArguments(
    output_dir="test-glue",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    seed=1194,
)

In [15]:
# Rename the raw text column from "Message" to "sentence" in both splits.
# NOTE(review): this reassigns the splits in place, so re-running the cell
# presumably fails once the "Message" column no longer exists.
encoded_dataset['train'] = encoded_dataset['train'].rename_column("Message", "sentence")
encoded_dataset['test'] = encoded_dataset['test'].rename_column("Message", "sentence")

In [16]:
# Inspect the schema of the tokenized training split after the column rename.
encoded_dataset["train"].features

{'sentence': Value(dtype='string', id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': ClassLabel(num_classes=2, names=[0, 1], names_file=None, id=None)}

In [18]:
# Pass the tokenizer so the Trainer pads each batch dynamically. Without it the
# default collator cannot stack variable-length sequences, and training only
# works while the batch size is 1. This also mirrors how the evaluation
# Trainer is constructed later in the notebook.
trainer = Trainer(
    model,
    args,
    train_dataset = encoded_dataset["train"],
    tokenizer = tokenizer,
)

In [19]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
500,0.8099
1000,0.4914
1500,0.2243
2000,0.0717
2500,0.0458


TrainOutput(global_step=2530, training_loss=0.32474020032190054, metrics={'train_runtime': 174.4835, 'train_samples_per_second': 14.5, 'total_flos': 54510751841400.0, 'epoch': 5.0})

In [20]:
# Persist the fine-tuned weights and config to the local checkpoint directory.
model.save_pretrained(modelCheckpoint)

### 1.4. Evaluate a finetuned model

Load the saved fine-tuned model and evaluate it on the test set.

In [21]:
# Reload the fine-tuned model from the local checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(
    modelCheckpoint,
    num_labels=num_labels,
)

In [22]:
# Inference-only Trainer wrapping the reloaded model and the tokenizer.
trainer_test = Trainer(
    model,
    tokenizer=tokenizer,
)

In [23]:
# Trainer.predict returns a (predictions, label_ids, metrics) named tuple; only
# the model outputs are needed here. Reading the field by name avoids assigning
# to `_`, which IPython reserves for the last cell output.
# NOTE(review): despite the name, these are presumably raw logits, not probabilities.
pred_prob = trainer_test.predict(encoded_dataset["test"]).predictions

In [24]:
# Predicted class = index of the highest score along the class axis.
y_pred = np.argmax(
    pred_prob,
    axis=1,
)

In [None]:
def print_evaluation_metrics(label, y_true, y_pred):
    """Print F1, precision and recall for one set of predictions.

    Args:
        label: Name shown in the header line of the report.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The report is written to standard output with four decimals
        per metric.
    """
    # Compute the scores in the same order in which they are reported.
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    report = "{} prediction metrics\nF1: {:.4f}\nPrecision: {:.4f}\nRecall: {:.4f}".format(
        label, f1, precision, recall
    )
    print(report)

In [None]:
# Label the report with the checkpoint that was actually fine-tuned; the
# hard-coded "ALBERT" did not match the configured model name.
print_evaluation_metrics(label = model_name, y_true = encoded_dataset["test"]["label"], y_pred = y_pred)

## 2. TF-IDF vectorization

Vectorize data to use for subsequent models.

In [None]:
# Read the same train/test CSV files into DataFrames for the TF-IDF based models.
df_train = pd.read_csv("data/training.csv")
df_test = pd.read_csv("data/test.csv")

In [None]:
# TF-IDF vectorizer that drops terms appearing in fewer than two documents.
vectorizer = TfidfVectorizer(
    min_df=2,
)

In [None]:
# Fit the TF-IDF vocabulary on the training messages only, then reuse it for the test set.
data_train = vectorizer.fit_transform(df_train["Message"].values)
data_test = vectorizer.transform(df_test["Message"].values)
# Labels must come from the source DataFrames: the TF-IDF outputs are feature
# matrices and have no "label" column to index.
labels_train, labels_test = df_train["label"], df_test["label"]

## 2. SVM

In [4]:
# Support vector classifier with the library's default hyperparameters.
svm = SVC()

In [None]:
# Train the SVM on the TF-IDF training features.
svm.fit(data_train, labels_train)

In [None]:
# Predict labels for the TF-IDF test features.
svm_pred = svm.predict(data_test)

In [None]:
# Threshold the predictions at 0.5 and convert them to a plain list of 0/1 ints.
# NOTE(review): predict() presumably already returns class labels, making this a no-op.
svm_pred_binary = (svm_pred >= 0.5).astype(int).tolist()

In [None]:
# Report F1 / precision / recall for the SVM on the test set.
print_evaluation_metrics("SVM", labels_test, svm_pred_binary)

## 3. Random forest

In [None]:
# 500-tree random forest. The seed is fixed so results are reproducible across
# runs; 1194 is the same seed used for the transformer fine-tuning arguments.
rf = RandomForestClassifier(n_estimators = 500, random_state = 1194)

In [None]:
# Train the random forest on the TF-IDF training features.
rf.fit(data_train, labels_train)

In [None]:
# Predict labels for the TF-IDF test features.
rf_pred = rf.predict(data_test)

In [None]:
# Threshold the predictions at 0.5 and convert them to a plain list of 0/1 ints.
# NOTE(review): predict() presumably already returns class labels, making this a no-op.
rf_pred_binary = (rf_pred >= 0.5).astype(int).tolist()

In [None]:
# Report F1 / precision / recall for the random forest on the test set.
print_evaluation_metrics("Random forest", labels_test, rf_pred_binary)

## 4. XGBoost

In [None]:
# Gradient-boosted tree classifier with the library's default hyperparameters.
xgb = xgboost.XGBClassifier()

In [None]:
# Train XGBoost on the TF-IDF training features.
xgb.fit(data_train, labels_train)

In [None]:
# Predict labels for the TF-IDF test features.
xgb_pred = xgb.predict(data_test)

In [None]:
# Threshold the predictions at 0.5 and convert them to a plain list of 0/1 ints.
# NOTE(review): predict() presumably already returns class labels, making this a no-op.
xgb_pred_binary = (xgb_pred >= 0.5).astype(int).tolist()

In [None]:
# Report F1 / precision / recall for XGBoost on the test set.
print_evaluation_metrics("XGBoost", labels_test, xgb_pred_binary)

## 5. Ensemble

In [None]:
# Majority-vote ensemble of the three classifiers. VotingClassifier expects
# (name, estimator) pairs and must be fitted itself before it can predict:
# it trains its own clones of the estimators, so the individually fitted
# models are not reused.
ensemble = VotingClassifier(
    estimators = [("svm", svm), ("rf", rf), ("xgb", xgb)]
).fit(data_train, labels_train)

In [None]:
# Predict labels for the TF-IDF test features with the voting ensemble.
ens_pred = ensemble.predict(data_test)

In [None]:
# Report F1 / precision / recall for the ensemble on the test set.
print_evaluation_metrics("Ensemble", labels_test, ens_pred)