In [1]:
!pip install transformers



In [61]:
from transformers import AutoTokenizer, AutoModel, AdamW
import torch
from torch import nn
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np
import re

DATASET_NAME = "dataset_4.csv"
MODEL_NAME = "bert_2.pt"

device = torch.device("cuda")

In [55]:
print("Reading data.")
full_dataset = pd.read_csv(DATASET_NAME, ).dropna()  # .sample(5000)
full_dataset = full_dataset[["content", "trump"]].reset_index()
dataset = full_dataset.sample(40000).copy()

share_trump = dataset["trump"].sum() / dataset.shape[0]

Reading data.


In [56]:
test_index = full_dataset.apply(lambda x: x['index'] not in dataset.index, axis=1)
test_dataset = full_dataset[test_index]

In [57]:
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-cased",
    model_max_length=280,
    tokenize_chinese_chars=False,
)
bert = AutoModel.from_pretrained("distilbert-base-cased")

In [58]:
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        # dropout layer
        self.dropout = nn.Dropout(0.1)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layer 1
        self.fc1 = nn.Linear(768, 28)
        # dense layer 2
        self.fc2 = nn.Linear(28, 2)
        # Output layer
        self.fc3 = nn.Linear(2, 2)
        # softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask):
        # pass the inputs to the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        cls_hs = self.bert(input_ids, attention_mask=attention_mask)[0][:, 0, :]
        # First hidden layer
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        # Second layer
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc3(x)
        # apply softmax activation
        x = self.softmax(x)
        return x

model = BERT_Arch(bert)
model.to(device)

BERT_Arch(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=

In [59]:
optimizer = AdamW(model.parameters(), lr=1e-5)
class_weights = compute_class_weight(
    "balanced", np.unique(dataset["trump"]), dataset["trump"]
)
weights = torch.tensor(class_weights, dtype=torch.float)
weights = weights.to(device)
cross_entropy = nn.NLLLoss(weight=weights)

epochs = 10

X_train = dataset["content"]
y_train = dataset["trump"]

batch_size = 64

In [62]:
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    total_preds = []

    n = X_train.shape[0]
    a = np.linspace(0, n - 1, n, dtype=int)
    batch_indexes = [
        a[i * batch_size : (i + 1) * batch_size] for i in range(int(n / batch_size) + 1)
    ]

    # iterate over batches
    for step, batch in enumerate(batch_indexes):
        if step%50==0: print("  Batch {:>5,}  of  {:>5,}.".format(step + 1, len(batch_indexes)))
        if len(batch) > 0:
            toks = tokenizer(
                X_train.iloc[batch].tolist(),
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=280,
            )
            labels = torch.tensor(y_train.iloc[batch].to_numpy())
            labels = labels.to(device)

            model.zero_grad()
            preds = model(**toks)
            preds = preds.to(device)
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(batch_indexes)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds

In [None]:
%%time
print("Start training.")
for i in range(epochs):
    print(f"Epoch {i+1}/{epochs}")
    train_loss, _ = train()
    print(train_loss)

print("Saving model.")
# Set nb of jobs to 1 for streamlit-compatibility
torch.save(model.state_dict(), MODEL_NAME)

Start training.
Epoch 0
  Batch     1  of    626.
  Batch    51  of    626.
  Batch   101  of    626.
  Batch   151  of    626.
  Batch   201  of    626.
  Batch   251  of    626.
  Batch   301  of    626.
  Batch   351  of    626.
  Batch   401  of    626.
  Batch   451  of    626.
  Batch   501  of    626.
  Batch   551  of    626.
  Batch   601  of    626.
0.3461304670586563
Epoch 1
  Batch     1  of    626.
  Batch    51  of    626.
  Batch   101  of    626.
  Batch   151  of    626.
  Batch   201  of    626.
  Batch   251  of    626.
  Batch   301  of    626.
  Batch   351  of    626.
  Batch   401  of    626.
  Batch   451  of    626.
  Batch   501  of    626.
  Batch   551  of    626.
  Batch   601  of    626.
0.21209123499167803
Epoch 2
  Batch     1  of    626.
  Batch    51  of    626.
  Batch   101  of    626.
  Batch   151  of    626.
  Batch   201  of    626.
  Batch   251  of    626.
  Batch   301  of    626.
  Batch   351  of    626.
  Batch   401  of    626.
  Batch   4

In [None]:
small_test = test_dataset.sample(1000)
X_test = small_test['content']
y_test = small_test['trump'].to_numpy()

test_tok = tokenizer(  
                X_test.tolist(),
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=280,
            )

In [None]:
# get predictions for test data
with torch.no_grad():
  preds = model(**test_tok)
  preds = preds.detach().cpu().numpy()

In [None]:
preds = np.argmax(preds, axis = 1)
print(classification_report(y_test, preds))

In [13]:
# 1,000 datapoints
# 16 => 0.68 1-accuracy
# 24 => 0.80 1-accuracy 0.91 weighted average F1
# 28 => 0.81 1-accuracy 0.92 weighted average F1
# 32 => 0.77 1-accuracy  0.91 weighted average F1
# 64 => 0.66 1-accuracy 0.90 weighted average F1

# 20,000 datapoints 
# 28 => 0.94 1-accuracy, 0.96 weighted average F1

# 40,000 datapoints
# 28,2 => 

In [54]:
adversarial = [
               "A ray of light seemed to pierce through that dimly lit drawing room of hers. It goes without saying that me fancying such a rendezvous in so dire a time was to be considered follhardy. DEMOCRATS ! WIN ! SLEEPY JOE ! CROOKED HILLARY",
               "SUCH A BIG DICK ! I do believe china sucks",
               "My dick is so huge that China could see it from space! Big balls to make America great again!",
               "Alicia Corbelle est une grosse salope ! AMERICA WINS WHEN SHE CUCKS SLEEPY JOE !",
               "Gregoire Canlorbe fucked me in the ass this morning ! Hope China doesn't find out !",
               "Julie is on a fast track to presidency ! Great Woman ! China will bite the dust !",
               "Nicolas Ov is gay and his algorithms know it!",
               "Julie Gahinet is Fake News.",
               "Winning against weak Sleepy Joe is easy. Democrats are stupid losers. Fake news from the deep state and Julie Gahinet are lying!",
               "I love Bananas! Great fruit, very smart!",
               "Sleepy Joe will destroy our country. VOTE FOR ME!",
               "Sleepy Joe is a nigger loving democrat!!!",
               "I love Democrats!",
               "I AM A DEMOCRAT! HOPE THEY WIN!",
               "BLACK LIVES MATTER!"
]

with torch.no_grad():
  preds = model(**tokenizer(
                adversarial,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=280,
            ))
  preds = preds.detach().cpu().numpy()
  preds = [round(x, 3) for x in np.exp(preds[:,1])]
  for txt, pred in zip(adversarial, preds):
      print(pred, txt)

0.002 A ray of light seemed to pierce through that dimly lit drawing room of hers. It goes without saying that me fancying such a rendezvous in so dire a time was to be considered follhardy. DEMOCRATS ! WIN ! SLEEPY JOE ! CROOKED HILLARY
0.0 SUCH A BIG DICK ! I do believe china sucks
0.954 My dick is so huge that China could see it from space! Big balls to make America great again!
0.0 Alicia Corbelle est une grosse salope ! AMERICA WINS WHEN SHE CUCKS SLEEPY JOE !
0.001 Gregoire Canlorbe fucked me in the ass this morning ! Hope China doesn't find out !
1.0 Julie is on a fast track to presidency ! Great Woman ! China will bite the dust !
0.001 Nicolas Ov is gay and his algorithms know it!
0.001 Julie Gahinet is Fake News.
0.999 Winning against weak Sleepy Joe is easy. Democrats are stupid losers. Fake news from the deep state and Julie Gahinet are lying!
0.001 I love Bananas! Great fruit, very smart!
1.0 Sleepy Joe will destroy our country. VOTE FOR ME!
0.954 Sleepy Joe is a nigger lov