In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [2]:
import csv
import re
data = []

with open('SentiNews_paragraph-level.txt', 'r') as f:
  lineReader = csv.reader(f, delimiter=',', quotechar="\"")
  for row in lineReader:
    if row:
      row = ' '.join(row)
      elementi = row.split('\t')
      sentence = elementi[2]
      sentiment = elementi[11]
      sentiment = re.sub(r'[^A-Z\Č\Š\Ž\Ća-z\č\š\ž\ć\.,!?]+', "", sentiment)
      data.append({'text': sentence, 'sent': sentiment})

In [3]:
data = data[1:]

In [4]:
import pandas as pd
df = pd.DataFrame(data, index=None, columns=['text', 'sent'])

In [5]:
df['sent'].value_counts()

neutral     40358
negative    18268
positive    10781
Name: sent, dtype: int64

In [6]:
df.sample(15)

Unnamed: 0,text,sent
58248,Dolgovi do davčne uprave še niso poravnani se...,neutral
56815,DZ je z 42 glasovi za dal soglasje k prodaji 2...,negative
5681,Prav zato smo na Cekinu za vas pregledali ponu...,neutral
12226,GZS,neutral
19536,katja.svensek@dnevnik.si,neutral
49103,Vzpostavitev novega sistema cestninjenja je nu...,neutral
42345,Nikkei11. 204,neutral
50511,Konec septembra je bilo v celotni skupini 4.79...,negative
67799,Špehar strokovnjak za Facebook,neutral
35241,Dolar še naprej pada,neutral


In [7]:
df['text'] = df['text'].apply(lambda x: ' '.join(x.lower().split()))

In [8]:
df['text'] = df['text'].apply(lambda x: ' '.join(x.split()))

In [9]:
import re
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w ]+', "", x))

In [10]:
df['text'] = df['text'].apply(lambda x: re.sub("\d+", "", x))

In [11]:
df.shape

(69407, 2)

In [12]:
df = df.dropna()

In [13]:
df.shape

(69407, 2)

In [14]:
train = df[:25000]

In [15]:
test = df[25001:35715]

In [16]:
train.shape

(25000, 2)

In [17]:
test.shape

(10714, 2)

In [18]:
#from google.colab import files
train.to_csv('sentTrain.csv', encoding = 'utf-8-sig', index=False) 
test.to_csv('sentTest.csv', encoding = 'utf-8-sig', index=False)

In [19]:
!pip install transformers datasets sklearn numpy torch torchvision



In [20]:
from datasets import load_dataset, load_metric



In [21]:
with open("sentTrain.csv",'r') as f:
    with open("sentTrainHeadless.csv",'w') as f1:
        next(f) # skip header line
        for line in f:
          f1.write(line)

In [22]:
with open("sentTest.csv",'r') as f:
    with open("sentTestHeadless.csv",'w') as f1:
        next(f) # skip header line
        for line in f:
          f1.write(line)

In [23]:
with open("dataParlamentS.csv",'r') as f:
    with open("dataParlamentSheadless.csv",'w') as f1:
        next(f) # skip header line
        for line in f:
            f1.write(line)

In [24]:
dataset = load_dataset(
    'csv',
    data_files={
        'train': 'sentTrainHeadless.csv',
        'validation': 'dataParlamentSheadless.csv',
        'test': 'sentTestHeadless.csv'
    },
    column_names = ['sentence', 'label']
)

Using custom data configuration default-b87ec910fe917df9


Downloading and preparing dataset csv/default to /home/ncirar/.cache/huggingface/datasets/csv/default-b87ec910fe917df9/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/ncirar/.cache/huggingface/datasets/csv/default-b87ec910fe917df9/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 25000
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 72
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 10714
    })
})

In [26]:
metric = load_metric('glue', 'sst2')

In [27]:
from transformers import AutoTokenizer

In [28]:
tokenizer = AutoTokenizer.from_pretrained(
    'EMBEDDIA/crosloengual-bert',
    use_fast=True
)

In [29]:
tokenizer(['hello', 'world'])

{'input_ids': [[103, 17592, 1169, 104], [103, 2329, 104]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1]]}

In [30]:
label2id = {'positive': 2, 'neutral': 1, 'negative': 0}
id2label = ['negative', 'neutral', 'positive']

In [31]:
def preprocess(examples):
  result = tokenizer(examples['sentence'], truncation=True, max_length=512)
  result['label'] = [label2id[l] for l in examples['label']]
  return result

In [32]:
encoded_dataset = dataset.map(preprocess, batched=True, load_from_cache_file=False)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [33]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    validation: Dataset({
        features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 72
    })
    test: Dataset({
        features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10714
    })
})

In [34]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

2022-08-29 10:55:18.173941: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [35]:
model = AutoModelForSequenceClassification.from_pretrained(
    'EMBEDDIA/crosloengual-bert',
    num_labels=3
)

Some weights of the model checkpoint at EMBEDDIA/crosloengual-bert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model chec

In [40]:
args = TrainingArguments(
    "tweet-sentiment",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3.0,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [41]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [42]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=None,
    )

In [43]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4689


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5877,0.876286,0.555556
2,0.4376,1.387938,0.555556
3,0.3125,1.696784,0.555556


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 72
  Batch size = 16
Saving model checkpoint to tweet-sentiment/checkpoint-1563
Configuration saved in tweet-sentiment/checkpoint-1563/config.json
Model weights saved in tweet-sentiment/checkpoint-1563/pytorch_model.bin
tokenizer config file saved in tweet-sentiment/checkpoint-1563/tokenizer_config.json
Special tokens file saved in tweet-sentiment/checkpoint-1563/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****

TrainOutput(global_step=4689, training_loss=0.4483892807296457, metrics={'train_runtime': 938.7008, 'train_samples_per_second': 79.898, 'train_steps_per_second': 4.995, 'total_flos': 4779697790711088.0, 'train_loss': 0.4483892807296457, 'epoch': 3.0})

In [44]:
eval_results = trainer.evaluate()
print(eval_results)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 72
  Batch size = 16


{'eval_loss': 0.8762858510017395, 'eval_accuracy': 0.5555555555555556, 'eval_runtime': 1.0884, 'eval_samples_per_second': 66.152, 'eval_steps_per_second': 4.594, 'epoch': 3.0}


In [45]:
trainer.save_model(output_dir='tweet-sentiment-model')

Saving model checkpoint to tweet-sentiment-model
Configuration saved in tweet-sentiment-model/config.json
Model weights saved in tweet-sentiment-model/pytorch_model.bin
tokenizer config file saved in tweet-sentiment-model/tokenizer_config.json
Special tokens file saved in tweet-sentiment-model/special_tokens_map.json


In [46]:
model = AutoModelForSequenceClassification.from_pretrained('tweet-sentiment-model')

loading configuration file tweet-sentiment-model/config.json
Model config BertConfig {
  "_name_or_path": "tweet-sentiment-model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 49601
}

loading weights file tweet-sentiment-model/pytorch_model.bin
All

In [47]:
examples=['Rezultati za prejšnje leto so res pohvale vredni.',
          'Najlepša hvala za pomoč, zelo sem hvaležen.',
          'Neumni politiki nimajo pojma.', 
          'Če me ne pustiš pri miru, te bom udaril!',
          'Jutri bo deževalo.',
          'Ne maram mleka.',
          'Sovražim ponedeljke.',
          'Lansko poročilo kaže res dobre rezultate, super.',
          'Veselimo se sodelovanja z vami.',
          'Nemški ovčar je vrsta psa.'
          'Včeraj sem videl čudovito mavrico, kar me je zelo osrečilo.',
          'Oblaki so prekrili nebo',
          'Lahko nadaljujete z govorom.',
          'Takoj prenehajte, drugače dobite opomin.']

inputs = tokenizer(examples, padding='longest', return_tensors="pt")
outputs = model(**inputs)
probs = outputs[0].detach().numpy()
for i in range(len(examples)):
    print(examples[i],'\t', id2label[np.argmax(probs[i])])

Rezultati za prejšnje leto so res pohvale vredni. 	 positive
Najlepša hvala za pomoč, zelo sem hvaležen. 	 positive
Neumni politiki nimajo pojma. 	 negative
Če me ne pustiš pri miru, te bom udaril! 	 negative
Jutri bo deževalo. 	 negative
Ne maram mleka. 	 neutral
Sovražim ponedeljke. 	 negative
Lansko poročilo kaže res dobre rezultate, super. 	 positive
Veselimo se sodelovanja z vami. 	 positive
Nemški ovčar je vrsta psa.Včeraj sem videl čudovito mavrico, kar me je zelo osrečilo. 	 positive
Oblaki so prekrili nebo 	 neutral
Lahko nadaljujete z govorom. 	 neutral
Takoj prenehajte, drugače dobite opomin. 	 neutral


In [48]:
import csv

corpus = []
transkript = []
dvajset = []
devetnajst = []
osemnajst = []
sedemnajst = []
sestnajst = []

i=0
with open('dataframe.csv', 'r') as f:
  lineReader = csv.reader(f, delimiter=',', quotechar="\"")
  next(f)
  for row in lineReader:
    transkript.append(row[0])
    t = row[1].split('-')
    leto = t[0]
    mesec = t[1]
    corpus.append({'text': row[0], 'leto': leto, 'mesec': mesec})
    if leto == '2020':
        dvajset.append({'text': row[0], 'leto': leto, 'mesec': mesec})
    elif leto == '2019':
        devetnajst.append({'text': row[0], 'leto': leto, 'mesec': mesec})
    elif leto == '2018':
        osemnajst.append({'text': row[0], 'leto': leto, 'mesec': mesec})
    elif leto == '2017':
        sedemnajst.append({'text': row[0], 'leto': leto, 'mesec': mesec})
    elif leto == '2016':
        sestnajst.append({'text': row[0], 'leto': leto, 'mesec': mesec})
    else:
        continue

In [49]:
import pandas as pd
df = pd.DataFrame(corpus, index=None, columns=['text', 'leto', 'mesec'])

In [50]:
df.sample(5)

Unnamed: 0,text,leto,mesec
27512,hvala lepa. tudi v mojem imenu lep pozdrav del...,2016,6
56070,"hvala lepa. želite odgovoriti? če ne, dajem be...",2018,12
21372,spoštovane kolegice poslanke in kolegi poslanc...,2016,3
55024,danes se nam obeta še ena razprava v nizu tist...,2018,12
40662,mag. marko pogačnik bo postavil poslansko vpra...,2017,6


In [51]:
df['leto'].value_counts()

2015    15671
2016    14724
2017    13571
2019    11877
2018     8515
2020     6601
2014     4163
Name: leto, dtype: int64

In [52]:
dva = pd.DataFrame(dvajset, index=None, columns=['text', 'leto', 'mesec'])
devet = pd.DataFrame(devetnajst, index=None, columns=['text', 'leto', 'mesec'])
osem = pd.DataFrame(osemnajst, index=None, columns=['text', 'leto', 'mesec'])
sedem = pd.DataFrame(sedemnajst, index=None, columns=['text', 'leto', 'mesec'])
sest = pd.DataFrame(sestnajst, index=None, columns=['text', 'leto', 'mesec'])

In [53]:
sest['mesec'].value_counts()

03    2176
11    2061
06    1532
12    1452
05    1445
09    1290
07    1120
04    1061
10    1049
01     794
02     744
Name: mesec, dtype: int64

In [54]:
def normalizeMonths(leto):
    jan = leto[leto['mesec']=='01']
    feb = leto[leto['mesec']=='02']
    mar = leto[leto['mesec']=='03']
    apr = leto[leto['mesec']=='04']
    maj = leto[leto['mesec']=='05']
    jun = leto[leto['mesec']=='06']
    jul = leto[leto['mesec']=='07']
    avg = leto[leto['mesec']=='08']
    sep = leto[leto['mesec']=='09']
    okt = leto[leto['mesec']=='10']
    nov = leto[leto['mesec']=='11']
    dec = leto[leto['mesec']=='12']
    
    if len(jan) >= 50:
        jan = jan.sample(50)
    if len(feb) >= 50:
        feb = feb.sample(50)
    if len(mar) >= 50:
        mar = mar.sample(50)
    if len(apr) >= 50:
        apr = apr.sample(50)
    if len(maj) >= 50:
        maj = maj.sample(50)
    if len(jun) >= 50:
        jun = jun.sample(50)
    if len(jul) >= 50:
        jul = jul.sample(50)
    if len(avg) >= 50:
        avg = avg.sample(50)
    if len(sep) >= 50:
        sep = sep.sample(50)
    if len(okt) >= 50:
        okt = okt.sample(50)
    if len(nov) >= 50:
        nov = nov.sample(50)
    if len(dec) >= 50:
        dec = dec.sample(50)
    
    meseci = [jan, feb, mar, apr, maj, jun, jul, avg, sep, okt, nov, dec]
    
    leto = pd.concat(meseci)

    return leto

In [55]:
sest = normalizeMonths(sest)
sedem = normalizeMonths(sedem)
osem = normalizeMonths(osem)
devet = normalizeMonths(devet)
dva = normalizeMonths(dva)

In [56]:
dva['mesec'].value_counts()

01    50
03    50
04    50
05    50
06    50
07    50
Name: mesec, dtype: int64

In [57]:
sest['mesec'].value_counts()

01    50
02    50
03    50
04    50
05    50
06    50
07    50
09    50
10    50
11    50
12    50
Name: mesec, dtype: int64

In [58]:
def getResults(leto):
    tekst = list(leto['text'])
    examples = tekst
    results = []
    
    inputs = tokenizer(examples, padding='longest', return_tensors="pt", max_length=512, truncation=True)
    outputs = model(**inputs)
    probs = outputs[0].detach().numpy()
    for i in range(len(examples)):
        results.append(id2label[np.argmax(probs[i])])
    
    leto['sent'] = results

In [59]:
getResults(sest)

In [60]:
sest['sent'].value_counts()

neutral     517
negative     33
Name: sent, dtype: int64

In [61]:
getResults(sedem)
sedem['sent'].value_counts()

neutral     501
negative     47
positive      2
Name: sent, dtype: int64

In [62]:
getResults(osem)
osem['sent'].value_counts()

neutral     545
negative     47
positive      1
Name: sent, dtype: int64

In [63]:
getResults(devet)
devet['sent'].value_counts()

neutral     490
negative     38
positive      1
Name: sent, dtype: int64

In [64]:
getResults(dva)
dva['sent'].value_counts()

neutral     284
negative     16
Name: sent, dtype: int64

In [65]:
sest.to_csv('dvaSestnajst-crosloengualBERT.csv', encoding = 'utf-8-sig', index=False)

In [66]:
sedem.to_csv('dvaSedemnajst-crosloengualBERT.csv', encoding = 'utf-8-sig', index=False)

In [67]:
osem.to_csv('dvaOsemnajst-crosloengualBERT.csv', encoding = 'utf-8-sig', index=False)

In [68]:
devet.to_csv('dvaDevetnajst-crosloengualBERT.csv', encoding = 'utf-8-sig', index=False)

In [69]:
dva.to_csv('dvaDvajset-crosloengualBERT.csv', encoding = 'utf-8-sig', index=False)