In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [15]:
df = pd.read_csv('../final_racism/final_modularized.csv')

tweets = df['text'].values
classes = df['is_racism'].values

In [17]:
df['is_racism'].value_counts()

0    123
1    118
Name: is_racism, dtype: int64

In [43]:
from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel  # or BertModel, for BERT without pretraining heads

model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-large-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased', do_lower_case=False)


# For DistilBERT:
# model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'neuralmind/bert-large-portuguese-cased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
# tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
# model = model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForPreTraining were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
tokenized = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [45]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [46]:
np.array(padded).shape

(241, 119)

In [47]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(241, 119)

In [48]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [49]:
features = last_hidden_states[0][:,0,:].numpy()

In [50]:
labels = df['is_racism']

In [51]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [52]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression()

In [60]:
lr_clf.score(test_features, test_labels)

0.7540983606557377

In [61]:
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

results = cross_val_predict(lr_clf, train_features, train_labels, cv=10)
print(metrics.accuracy_score(train_labels, results))

0.65


In [41]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.528 (+/- 0.00)
