In [2]:
!pip install transformers



In [3]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')
import kagglehub
from kagglehub import KaggleDatasetAdapter

In [4]:
df = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)

df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df['text'] = df['text'].str.replace(r'https?://\S+|www\.\S+', '', regex=True)
df['text'] = df['text'].str.replace(r'[@#]\w+', '', regex=True)
df['text'] = df['text'].str.lower()

In [5]:
batch_1 = df[:7000]

In [6]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [7]:
tokenized = batch_1['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [8]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [9]:
np.array(padded).shape

(7000, 94)

In [10]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(7000, 94)

In [None]:
# With a CPU, it takes about 4 minutes

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
labels = batch_1[1]

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [None]:
# https://huggingface.co/transformers/model_doc/distilbert.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
new_input_ids = torch.tensor(tokenizer.encode("I hate Asians", add_special_tokens=True)).unsqueeze(0)
new_outputs = model(new_input_ids)
new_last_hidden_states = [new_outputs[0].detach().numpy()[0][0]]
lr_clf.predict_proba(new_last_hidden_states)