In [13]:
import pandas as pd
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset


In [14]:
df = pd.read_csv("./sst_dataset/SST2-Data/SST2-Data/stanfordSentimentTreebank/stanfordSentimentTreebank/datasetSentences.txt", sep="\t", index_col=0)

dictionary = pd.read_csv("./sst_dataset/SST2-Data/SST2-Data/stanfordSentimentTreebank/stanfordSentimentTreebank/dictionary.txt", sep="|", index_col=1, header=None)

label = pd.read_csv("./sst_dataset/SST2-Data/SST2-Data/stanfordSentimentTreebank/stanfordSentimentTreebank/sentiment_labels.txt", sep="|", index_col=0)

In [15]:
df

Unnamed: 0_level_0,sentence
sentence_index,Unnamed: 1_level_1
1,The Rock is destined to be the 21st Century 's...
2,The gorgeously elaborate continuation of `` Th...
3,Effective but too-tepid biopic
4,If you sometimes like to go to the movies to h...
5,"Emerges as something rare , an issue movie tha..."
...,...
11851,A real snooze .
11852,No surprises .
11853,We 've seen the hippie-turned-yuppie plot befo...
11854,Her fans walked out muttering words like `` ho...


In [16]:
label

Unnamed: 0_level_0,sentiment values
phrase ids,Unnamed: 1_level_1
0,0.50000
1,0.50000
2,0.44444
3,0.50000
4,0.42708
...,...
239227,0.36111
239228,0.38889
239229,0.33333
239230,0.88889


In [17]:
dictionary = dictionary.sort_index()
dictionary

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
0,!
1,'
2,' (
3,' ( the cockettes
4,' ( the cockettes )
...,...
239227,your standard Hollywood bio-pic
239228,your typical ` fish out of water ' story
239229,zero .
239230,zippy jazzy score


In [18]:
data = pd.merge(dictionary, label, left_index=True, right_index=True)
data

Unnamed: 0_level_0,0,sentiment values
1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,!,0.50000
1,',0.50000
2,' (,0.44444
3,' ( the cockettes,0.50000
4,' ( the cockettes ),0.42708
...,...,...
239227,your standard Hollywood bio-pic,0.36111
239228,your typical ` fish out of water ' story,0.38889
239229,zero .,0.33333
239230,zippy jazzy score,0.88889


In [19]:
# transform it to a dictionary with 'label and 'text'
data = data.reset_index()
data = data.rename(columns={ 0:'text', 'sentiment values':'label'})
data = data.drop(1, axis=1)
data = data[['label', 'text']]
data

Unnamed: 0,label,text
0,0.50000,!
1,0.50000,'
2,0.44444,' (
3,0.50000,' ( the cockettes
4,0.42708,' ( the cockettes )
...,...,...
239227,0.36111,your standard Hollywood bio-pic
239228,0.38889,your typical ` fish out of water ' story
239229,0.33333,zero .
239230,0.88889,zippy jazzy score


In [20]:
#change the label to 5 classes 0-0.2 for 0, 0.2-0.4 for 1, 0.4-0.6 for 2, 0.6-0.8 for 3, 0.8-1 for 4
data['label'] = pd.cut(data['label'], bins=[0, 0.2, 0.4, 0.6, 0.8, 1], labels=[0, 1, 2, 3, 4])

In [22]:
#split the dataset into train, test and validation
train, test = train_test_split(data, test_size=0.2, random_state=42)
train= Dataset.from_dict(train)
test = Dataset.from_dict(test)

In [23]:
#create a dictionary with train and test
dataset = {"train": train, "test": test}
dataset["train"][100]

{'label': 3.0,
 'text': "Such master screenwriting comes courtesy of John Pogue , the Yale grad who previously gave us '' The Skulls '' and last year 's '' Rollerball . '' Enough said"}

In [33]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the function to each dataset
tokenized_datasets = {name: ds.map(tokenize_function, batched=True) for name, ds in dataset.items()}

tokenized_train_datasets = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))
tokenized_test_datasets = tokenized_datasets["test"].shuffle(seed=42).select(range(5000))

tokenized_train_datasets.set_format("torch")
tokenized_test_datasets.set_format("torch")

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at C:\Users\phi-h/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/resolve

Map:   0%|          | 0/191385 [00:00<?, ? examples/s]

Map:   0%|          | 0/47847 [00:00<?, ? examples/s]

In [34]:
tokenized_train_datasets.shape

(10000, 5)

In [35]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at C:\Users\phi-h/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 1

In [36]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [37]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [38]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=tokenized_train_datasets, eval_dataset=tokenized_test_datasets, compute_metrics=compute_metrics
)

In [39]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3750


KeyboardInterrupt: 

In [None]:
predictions = trainer.predict(tokenized_test_datasets)

results = pd.DataFrame(
    {
        'Predicted label': predictions.predictions.argmax(-1),
        'True label': tokenized_test_datasets['label']
    })
results