In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Installs a blanket "ignore" filter: every Python warning category is
# suppressed from this point on.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
!pip install pandas



In [3]:
# Read the tab-separated SST2 training file straight from GitHub.
# header=None: the file has no header row, so columns get integer labels.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    sep='\t',
    header=None,
)

In [4]:
# Work on the first 2000 rows only (positional slice of the full frame).
batch_1 = df.iloc[:2000]

In [5]:
# Choose the model class, its matching tokenizer class, and the checkpoint name.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-uncased'

# Instantiate the pretrained tokenizer and model for that checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
# Encode every sentence in column 0 into a list of token ids.
# add_special_tokens=True asks the tokenizer to include its special tokens
# in each encoded sequence.
tokenized = batch_1[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

In [19]:
# Number of space-separated pieces in the first sentence (row 0, column 0).
len(batch_1.loc[0, 0].split(' '))

18

In [21]:
# Number of token ids produced for the first sentence.
len(tokenized.loc[0])

20

In [22]:
# Length of the longest token-id list (0 when there are no rows).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every list with 0 up to max_len so the rows form a rectangular array.
# NOTE(review): assumes the tokenizer's padding id is 0 — confirm against
# tokenizer.pad_token_id.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [25]:
# Sanity check: (rows, columns) of the working subset.
batch_1.shape

(2000, 2)

In [24]:
# Sanity check: (number of sentences, padded sequence length).
padded.shape

(2000, 59)

In [26]:
# Build a 0/1 mask with the same shape as `padded`: 1 where the id is
# non-zero, 0 where it is zero (the value used for padding).
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 59)

In [32]:
# Number of rows in the mask (one per sentence).
attention_mask.shape[0]

2000

In [34]:
# Convert the padded ids and the mask into tensors for the model.
# Note: `attention_mask` is rebound here from a NumPy array to a tensor.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Run the forward pass with gradient tracking disabled.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [36]:
# Take the first element of the model output, keep only position 0 of every
# sequence, and convert the result to a NumPy array.
features = last_hidden_states[0][:, 0].numpy()

In [39]:
# Display the feature vector of the first sentence.
# NOTE(review): this prints the whole vector; a short slice would keep the
# notebook output compact.
features[0]

array([-5.56649387e-01, -3.31292242e-01, -2.22805753e-01, -1.23546854e-01,
        2.67467313e-02, -3.40102315e-01, -1.21638425e-01,  2.96148419e-01,
       -1.12602049e-02, -3.23891133e-01, -1.11983083e-02, -4.61216383e-02,
        3.53096366e-01,  7.90403962e-01,  6.15052991e-02, -8.23150650e-02,
       -5.70958257e-01, -7.66607150e-02,  4.80745524e-01, -2.88600057e-01,
        1.65010020e-02, -1.70571834e-01, -7.84544423e-02,  7.63593540e-02,
       -3.45684052e-01,  2.02781618e-01,  1.76606234e-02, -7.37734661e-02,
        8.51634666e-02,  4.82495397e-01, -1.21689007e-01,  2.04684377e-01,
       -6.75895363e-02, -3.02422702e-01,  3.44691426e-01, -3.71168554e-01,
        1.63762137e-01, -1.84958622e-01, -5.54547489e-01, -3.43958624e-02,
        1.14215344e-01,  3.06466937e-01,  5.11534870e-01, -4.99668151e-01,
       -5.03433943e-01, -1.93499506e-01, -2.37992311e+00, -1.96781904e-01,
       -2.20870271e-01, -3.66642475e-01,  6.59026392e-03, -1.69750124e-01,
        3.46067071e-01, -

In [38]:
# Sanity check: (number of sentences, feature dimension).
features.shape

(2000, 768)

In [35]:
# Display the raw model output object for inspection.
# NOTE(review): the repr is very large; consider showing only its shape.
last_hidden_states

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5566, -0.3313, -0.2228,  ..., -0.2279,  0.6319,  0.2431],
         [-0.6148,  0.4600, -0.2836,  ..., -0.2024,  0.7702, -0.2371],
         [ 0.1241,  0.5771,  0.3027,  ..., -0.3502,  0.2735, -0.8222],
         ...,
         [ 0.1559,  0.2020,  0.2522,  ..., -0.3594,  0.3242, -0.2912],
         [ 0.2129,  0.1774,  0.1802,  ..., -0.4015,  0.4105, -0.3502],
         [ 0.1550,  0.0942,  0.2884,  ..., -0.4045,  0.0152, -0.1894]],

        [[-0.2879, -0.1429, -0.0686,  ..., -0.3169,  0.1846,  0.3199],
         [ 0.0502,  0.2413, -0.1760,  ..., -0.1800, -0.1737, -0.0019],
         [ 0.4134, -0.2540, -0.0710,  ...,  0.0981, -0.2784,  0.0393],
         ...,
         [ 0.0964, -0.1410,  0.1165,  ...,  0.0558,  0.0646,  0.0469],
         [ 0.1954, -0.1857,  0.1544,  ...,  0.2717,  0.0327,  0.1801],
         [-0.1666, -0.2294,  0.3169,  ...,  0.1869, -0.2137, -0.0562]],

        [[-0.1865,  0.3023, -0.1851,  ..., -0.3349,  