In [1]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re
import pickle

required = {'spacy', 'transformers'}
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed

if missing:
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *missing], stdout=subprocess.DEVNULL)

import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.svm import LinearSVC
import torch
import transformers
import gzip
# this will set the device on which to train
#device = torch.device("cpu")
# if using collab, set your runtime to use GPU and use the line below
device = torch.device("cuda:0")

In [2]:
from transformers import DistilBertModel, DistilBertTokenizer
MODEL_NAME = 'distilbert-base-uncased'
# Load pre-trained model
distil_model = DistilBertModel.from_pretrained(MODEL_NAME)
# Load pre-trained model tokenizer (vocabulary)
distil_tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [3]:
with open('clean_tweets_20k.pkl', 'rb') as f:
    df_tweet = pickle.load(f)

In [4]:

X = df_tweet['text']
df_tweet['Target'] = df_tweet['Target'].replace(4,1)
y = df_tweet['Target']
print(y.value_counts())

1    10076
0     9924
Name: Target, dtype: int64


In [5]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.3, random_state=53)
print(X_train.shape, X_test.shape)

(14000,) (6000,)


In [6]:
print('GPU active', torch.cuda.is_available())


GPU active True


In [7]:
# pass model to GPU
distil_model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [8]:
# Professor code
def create_bert_vector(textlist):
  %%time
  start = 0
  batch_size = 10
  batches = list(range(batch_size, len(textlist), batch_size))+[len(textlist)]
  doc_rep_collector = []
  for b in batches:
      tokens = distil_tokenizer.batch_encode_plus(
          textlist[start:b],
          pad_to_max_length=True, 
          return_tensors="pt",
          max_length=512)
      start = b
      tokens.to(device)
      outputs = distil_model(**tokens)
      # taking the representation of the 'CLS' token (doc-level embedding)
      o = outputs[0][:,0].cpu().detach().numpy()
      doc_rep_collector.append(o)

  # stack into array
  doc_rep_collector = np.concatenate(doc_rep_collector)
  # to minimize size, can store as 16-bit float
  doc_rep_collector = doc_rep_collector.astype('float16')
  # additionally, will store as gzip (pandas can handle this)
  return doc_rep_collector

In [9]:
%%time
vec_bert_train  = create_bert_vector(X_train)
pickle.dump(vec_bert_train, gzip.open('tweet_bert_train.pkl.gz', 'wb'))
vec_bert_test  = create_bert_vector(X_test)
pickle.dump(vec_bert_test, gzip.open('tweet_bert_test.pkl.gz', 'wb'))

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.87 µs


Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 21 µs


Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (

CPU times: user 2min 15s, sys: 1min 6s, total: 3min 21s
Wall time: 3min 21s


In [10]:
# using BERT representations for prediction
bert_train = pd.read_pickle('tweet_bert_train.pkl.gz')
print(bert_train.shape)
bert_test = pd.read_pickle('tweet_bert_test.pkl.gz')
print(bert_test.shape)

(14000, 768)
(6000, 768)


In [11]:
svc = LinearSVC()
svc.fit(bert_train, y_train)

print('Train accuracy',accuracy_score(y_train, svc.predict(bert_train)))
print('Test accuracy',accuracy_score(y_test, svc.predict(bert_test)))

Train accuracy 0.8069285714285714
Test accuracy 0.7721666666666667


