In [1]:
%load_ext autoreload
%autoreload 2
%cd ..

%matplotlib inline

/home/mbrzozowski/projects/media_monitoring/roberta_for_longer_texts


In [2]:
import pandas as pd
import numpy as np

from config import VISIBLE_GPUS

import os
os.environ["CUDA_VISIBLE_DEVICES"]= VISIBLE_GPUS
import torch

from sklearn.model_selection import train_test_split
from lib.main import BERTClassificationModelWithPooling

## Load data - sample of IMDB reviews in english

In [3]:
SAMPLE_DATA_PATH = 'test/sample_data/sample_data_eng.csv'

In [4]:
# Loading data for tests
df = pd.read_csv(SAMPLE_DATA_PATH)

texts = df['sentence'].tolist() # list of texts
labels = df['target'].tolist() # list of 0/1 labels


In [5]:
df

Unnamed: 0,sentence,target
0,I saw this movie not knowing anything about it...,0
1,"OK, don't let my summary fool you. This movie ...",0
2,"This should be re-titled ""The Curious Case Of ...",0
3,Those 2 points are dedicated the reasonable pe...,0
4,Following the success of the (awful) Gilligan'...,0
...,...,...
1995,"What if Marylin Monroe, Albert Einstein, Joe D...",1
1996,Such a film of beauty that it's hard to descri...,1
1997,I saw this movie with my friend and we couldnt...,1
1998,This is the best piece of film ever created It...,1


## Divide to train and test sets

In [6]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Fit and predict methods

## Fit the model

In [7]:
# Loading model
model = BERTClassificationModelWithPooling()
# Fitting a model to training data for 5 epochs
model.fit(X_train,y_train,epochs = 5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (1071 > 512). Running this sequence through the model wil

Epoch: 0, Train accuracy: 0.7825, Train loss: 0.4733248513750732
Epoch: 1, Train accuracy: 0.936875, Train loss: 0.2021335910446942
Epoch: 2, Train accuracy: 0.963125, Train loss: 0.11570522608701139
Epoch: 3, Train accuracy: 0.98375, Train loss: 0.07096196901286021
Epoch: 4, Train accuracy: 0.986875, Train loss: 0.0511996485106647


## Get predictions

In [8]:
# Predicted probability for test set
preds = model.predict(X_test)

In [9]:
preds

[0.9981957077980042,
 0.0014405354158952832,
 0.9900174736976624,
 0.0012359063839539886,
 0.9980940222740173,
 0.9989404678344727,
 0.007049195002764463,
 0.9982878565788269,
 0.0022877631708979607,
 0.42834532260894775,
 0.0010692067444324493,
 0.9988347887992859,
 0.9978082776069641,
 0.012943921610713005,
 0.0023247345816344023,
 0.001353072002530098,
 0.0012805807637050748,
 0.18986642360687256,
 0.9973364472389221,
 0.5747166872024536,
 0.007323339115828276,
 0.9988862872123718,
 0.996353268623352,
 0.5548359751701355,
 0.9965223073959351,
 0.002497371518984437,
 0.5350319147109985,
 0.00200158660300076,
 0.0015786592848598957,
 0.0015121130272746086,
 0.0050605591386556625,
 0.004486889112740755,
 0.0011420610826462507,
 0.0025090016424655914,
 0.9987766146659851,
 0.0015395948430523276,
 0.5616910457611084,
 0.9986227750778198,
 0.9977723360061646,
 0.004975558258593082,
 0.7612091898918152,
 0.999107301235199,
 0.0021477288100868464,
 0.00209933053702116,
 0.9860948324203491,


## Calculate model accuracy on the test data

In [10]:
predicted_classes = (np.array(preds) >= 0.5)
accurate = sum(predicted_classes == np.array(y_test).astype(bool))
accuracy = accurate/len(y_test)

print(f'Test accuracy: {accuracy}')

Test accuracy: 0.8825
