In [1]:
import pandas as pd
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset


In [2]:
df = pd.read_csv("./sst_dataset/SST2-Data/SST2-Data/stanfordSentimentTreebank/stanfordSentimentTreebank/datasetSentences.txt", sep="\t", index_col=0)

dictionary = pd.read_csv("./sst_dataset/SST2-Data/SST2-Data/stanfordSentimentTreebank/stanfordSentimentTreebank/dictionary.txt", sep="|", index_col=1, header=None)

label = pd.read_csv("./sst_dataset/SST2-Data/SST2-Data/stanfordSentimentTreebank/stanfordSentimentTreebank/sentiment_labels.txt", sep="|", index_col=0)

In [3]:
df

Unnamed: 0_level_0,sentence
sentence_index,Unnamed: 1_level_1
1,The Rock is destined to be the 21st Century 's...
2,The gorgeously elaborate continuation of `` Th...
3,Effective but too-tepid biopic
4,If you sometimes like to go to the movies to h...
5,"Emerges as something rare , an issue movie tha..."
...,...
11851,A real snooze .
11852,No surprises .
11853,We 've seen the hippie-turned-yuppie plot befo...
11854,Her fans walked out muttering words like `` ho...


In [4]:
label

Unnamed: 0_level_0,sentiment values
phrase ids,Unnamed: 1_level_1
0,0.50000
1,0.50000
2,0.44444
3,0.50000
4,0.42708
...,...
239227,0.36111
239228,0.38889
239229,0.33333
239230,0.88889


In [5]:
dictionary = dictionary.sort_index()
dictionary

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
0,!
1,'
2,' (
3,' ( the cockettes
4,' ( the cockettes )
...,...
239227,your standard Hollywood bio-pic
239228,your typical ` fish out of water ' story
239229,zero .
239230,zippy jazzy score


In [6]:
data = pd.merge(dictionary, label, left_index=True, right_index=True)
data

Unnamed: 0_level_0,0,sentiment values
1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,!,0.50000
1,',0.50000
2,' (,0.44444
3,' ( the cockettes,0.50000
4,' ( the cockettes ),0.42708
...,...,...
239227,your standard Hollywood bio-pic,0.36111
239228,your typical ` fish out of water ' story,0.38889
239229,zero .,0.33333
239230,zippy jazzy score,0.88889


In [7]:
# transform it to a dictionary with 'label and 'text'
data = data.reset_index()
data = data.rename(columns={ 0:'text', 'sentiment values':'label'})
data = data.drop(1, axis=1)
data = data[['label', 'text']]
data

Unnamed: 0,label,text
0,0.50000,!
1,0.50000,'
2,0.44444,' (
3,0.50000,' ( the cockettes
4,0.42708,' ( the cockettes )
...,...,...
239227,0.36111,your standard Hollywood bio-pic
239228,0.38889,your typical ` fish out of water ' story
239229,0.33333,zero .
239230,0.88889,zippy jazzy score


In [8]:
#split the dataset into train, test and validation
train, test = train_test_split(data, test_size=0.2, random_state=42)
train= Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
train[100]

{'label': 0.75,
 'text': "Such master screenwriting comes courtesy of John Pogue , the Yale grad who previously gave us '' The Skulls '' and last year 's '' Rollerball . '' Enough said",
 '__index_level_0__': 109252}

In [9]:
#create a dictionary with train and test
dataset = {"train": train, "test": test}
dataset["train"][100]

{'label': 0.75,
 'text': "Such master screenwriting comes courtesy of John Pogue , the Yale grad who previously gave us '' The Skulls '' and last year 's '' Rollerball . '' Enough said",
 '__index_level_0__': 109252}

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the function to each dataset
tokenized_datasets = {name: ds.map(tokenize_function, batched=True) for name, ds in dataset.items()}

tokenized_train_datasets = tokenized_datasets["train"].shuffle(seed=42).select(range(len(train)))
tokenized_test_datasets = tokenized_datasets["test"].shuffle(seed=42).select(range(len(test)))

Map:   0%|          | 0/191385 [00:00<?, ? examples/s]

Map:   0%|          | 0/47847 [00:00<?, ? examples/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [None]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=tokenized_train_datasets, eval_dataset=tokenized_test_datasets, compute_metrics=compute_metrics
)

In [None]:
trainer.train()