Uvozimo vse potrebne knjižnice:

In [None]:
!pip install transformers datasets sklearn numpy torch torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Ustvarjanje *dataset*-a za učenje modela

In [None]:
from datasets import load_dataset, load_metric

Naložimo csv datoteki, ki smo ju pripravili na koncu preprocesiranja: eno v slovenskem in drugo v angleškem jeziku. Iz obeh odstranimo prvo vrstico, ki vsebuje imeni stolpcev, ter na novo ustvarjeni datoteki shranimo.

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving sentTweetsEng.csv to sentTweetsEng (1).csv
Saving sentTweetsSlo.csv to sentTweetsSlo (1).csv
User uploaded file "sentTweetsEng.csv" with length 1063871 bytes
User uploaded file "sentTweetsSlo.csv" with length 3524151 bytes


In [None]:
with open("sentTweetsEng.csv",'r') as f:
    with open("sentTweetsEngHeadless.csv",'w') as f1:
        next(f) # skip header line
        for line in f:
            f1.write(line)

In [None]:
files.download('sentTweetsEngHeadless.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
with open("sentTweetsSlo.csv",'r') as f:
    with open("sentTweetsSloHeadless.csv",'w') as f1:
        next(f) # skip header line
        for line in f:
            f1.write(line)

In [None]:
files.download('sentTweetsSloHeadless.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Ustvarimo instanco *dataset*, pri čemer definiramo, da je zbirka za učenje modela slovenska zbirka, zbirka za testiranje pa angleška.

In [None]:
dataset = load_dataset(
    'csv',
    data_files={
        'train': 'sentTweetsSloHeadless.csv',
        'test': 'sentTweetsEngHeadless.csv'
    },
    column_names = ['sentence', 'label']
)



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6c52764aaa390bf8/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6c52764aaa390bf8/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Uvozimo že predefinirane parametre GLUE ter tokenizer tipa [CroCloEngual BERT](https://huggingface.co/EMBEDDIA/crosloengual-bert).

In [None]:
metric = load_metric('glue', 'sst2')

### Tokeniziranje podatkov v zbirki

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    'EMBEDDIA/crosloengual-bert',
    use_fast=True
)

In [None]:
tokenizer(['hello', 'world'])

{'input_ids': [[103, 17592, 1169, 104], [103, 2329, 104]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1]]}

Tekstovni zapis oznake sentimentov spremenimo v števke: *2* za pozitiven sentiment, *1* za nevtralen in *0* za negativen. Določimo tudi maksimalno dolžino twittov, tj. 512 znakov.

In [None]:
label2id = {'Positive': 2, 'Neutral': 1, 'Negative': 0}
id2label = ['Negative', 'Neutral', 'Positive']

In [None]:
def preprocess(examples):
  result = tokenizer(examples['sentence'], truncation=True, max_length=512)
  result['label'] = [label2id[l] for l in examples['label']]
  return result

In [None]:
encoded_dataset = dataset.map(preprocess, batched=True, load_from_cache_file=False)

  0%|          | 0/43 [00:00<?, ?ba/s]

  0%|          | 0/17 [00:00<?, ?ba/s]

### Učenje modela
Uvozimo model strojnega učenja, ki je istega tipa kot tokenizer (CroSloEngual BERT), definiramo argumente učenja modela ter ustvarimo novo instanco *trainerja*.

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    'EMBEDDIA/crosloengual-bert',
    num_labels=3
)

Some weights of the model checkpoint at EMBEDDIA/crosloengual-bert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model chec

In [None]:
args = TrainingArguments(
    "tweet-sentiment",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=0.1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    )

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=None,
    )

In [None]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

Sprožimo učenje modela na podatkovni zbirki, ki se izvaja 4 epohe.

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 42737
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 268


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=268, training_loss=1.11281335175927, metrics={'train_runtime': 44.1382, 'train_samples_per_second': 96.825, 'train_steps_per_second': 6.072, 'total_flos': 84615630682464.0, 'train_loss': 1.11281335175927, 'epoch': 0.1})

In [None]:
eval_results = trainer.evaluate()
print(eval_results)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 16712
  Batch size = 16


{'eval_loss': 1.0616443157196045, 'eval_accuracy': 0.46834609861177595, 'eval_runtime': 40.9749, 'eval_samples_per_second': 407.859, 'eval_steps_per_second': 25.503, 'epoch': 0.1}


Shranimo in naložimo ustvarjen model tipa CroSloEngual BERT.

In [None]:
trainer.save_model(output_dir='C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model')

Saving model checkpoint to C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model
Configuration saved in C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model/config.json
Model weights saved in C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model/pytorch_model.bin
tokenizer config file saved in C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model/tokenizer_config.json
Special tokens file saved in C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model/special_tokens_map.json


In [None]:
model = AutoModelForSequenceClassification.from_pretrained('C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model')

loading configuration file C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model/config.json
Model config BertConfig {
  "_name_or_path": "C:/Users/gogi1/Desktop/diploma/model/Sentiment/tweet-sentiment-model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": t

### Analiza sentimenta na podatkovni zbirki parlamentarnih debat
Naložimo podatkovno zbirko z zapisi parlamentarnih debat ter poženemo analizo sentimenta s pomočjo ravnokar ustvarjenega modela.

In [None]:
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving dataframe.csv to dataframe (1).csv
User uploaded file "dataframe.csv" with length 92383142 bytes


In [None]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
import csv

corpus = []
text = []

with open('dataframe.csv', 'r') as f:
  lineReader = csv.reader(f, delimiter=',', quotechar="\"")
  for row in lineReader:
    if row:
      #print(row)
      text.append(row[0])
      corpus.append({'text': row[0], 'datum': row[1]})

In [None]:
len(corpus)

75123

In [None]:
res = []

inputs = tokenizer(text, padding='longest', return_tensors="pt")
outputs = model(**inputs)
probs = outputs[0].detach().numpy()
for i in range(len(text)):
    #print(corpus[i],'\t', id2label[np.argmax(probs[i])])
    res.append(id2label[np.argmax(probs[i])])