# Imports and metrics

In [None]:
!pip install datasets
!pip install transformers

import csv 
import numpy as np # numpy
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split # split for validation sets
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score # metrics
import transformers as tr # transformers
from transformers import ElectraForPreTraining, AutoTokenizer # tokenizers
from datasets import load_metric, load_dataset # utility functions
import torch
import re





In [None]:
def compute_metrics(p):
    """Score class predictions against their reference labels.

    Args:
        p: A two-element sequence ``(scores, labels)``. ``scores`` is reduced
            with ``np.argmax`` along axis 1 to obtain one predicted class per
            row; ``labels`` holds the reference classes.

    Returns:
        dict: The keys "accuracy", "precision", "recall" and "f1", each mapped
        to the value of the matching scikit-learn metric called with its
        default settings.
    """
    scores, references = p
    predicted = np.argmax(scores, axis=1)

    # Name -> metric function; the insertion order is the order of the result keys.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=references, y_pred=predicted)
        for name, scorer in scorers.items()
    }

# Load data






In [None]:
from sklearn.utils import shuffle

# Read the training split; later cells use its 'text' and 'label' columns.
train_data = pd.read_csv('train_data_csfd.csv')
# Shuffle the rows with a fixed seed so that a fresh Restart & Run All yields
# the same row order (an unseeded shuffle changes the data on every run).
train_data = shuffle(train_data, random_state=69)
# Preview the first rows of the shuffled frame.
train_data.head()

Unnamed: 0,text,label
24669,Tyhle filmy jsou mnohem lepší než ty z 21. sto...,1
5879,Interstate 60 vytváří koncept samostatného svě...,1
9553,Jestli je nějaký pořad který si zaslouží odpad...,0
3785,Dvojka už byla také řádně vyvařená a přesto se...,0
18870,Nevtipný komik. Děsivá kombinace 0%,0


In [None]:
# Pull the texts and their labels out of the frame as two parallel Python lists.
x_train_data, y_train_data = (
    list(train_data[column]) for column in ('text', 'label')
)

# Optional subsampling (disabled): keep only the first 20000 examples.
# x_train_data = x_train_data[:20000]
# y_train_data = y_train_data[:20000]

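- Uncomment the two slicing lines at the end of the cell above to train on only the first 20,000 examples (change the number to use a different subset size).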

In [None]:
# Test split, read from its own CSV file.
test_data = pd.read_csv('test_data_csfd.csv')

# Unpack the 'text' and 'label' columns into two parallel Python lists.
x_test, y_test = (list(test_data[column]) for column in ('text', 'label'))

# Tokenize data

In [None]:
# Hold out 20 % of the training examples for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    x_train_data,
    y_train_data,
    test_size=0.20,
    random_state=69,
)

tokenizer = AutoTokenizer.from_pretrained("Seznam/small-e-czech")

# All three splits are tokenized with identical settings: truncation enabled,
# padding enabled, and a maximum length of 50.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=50)
    for texts in (x_train, x_val, x_test)
)

loading configuration file https://huggingface.co/Seznam/small-e-czech/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/5757afa75f84a9d8eae0cab2cc71caa7fa734ce29a3d1460d8ff138877a6272c.b4542522909d7c3467d6bb947216c79cef9ed619c6fb7c1b732d20e768f0c674
Model config ElectraConfig {
  "_name_or_path": "Seznam/small-e-czech",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_versi

In [None]:
class PrepareDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Each item is a dict holding one tensor per encoding field plus a
    'labels' tensor for the example at the requested index.
    """

    def __init__(self, encodings, labels):
        """Store the inputs without copying them.

        Args:
            encodings: Mapping (anything with ``.items()``) from field name to
                a per-example sequence of values.
            labels: Sequence of labels, indexed the same way as the encodings.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for the example at position ``idx``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored under the 'labels' key, after the encoding fields.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a PrepareDataset.
train_dataset, val_dataset, test_dataset = (
    PrepareDataset(encodings, labels)
    for encodings, labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
)



# Small-e-czech

## 1. Experiment

In [None]:
from transformers import Trainer, TrainingArguments, ElectraForSequenceClassification, AutoModelForSequenceClassification

# Experiment 1: one epoch, batch size 1, 2500 warmup steps, evaluation every
# 4000 steps.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="steps",     # evaluate every `eval_steps` steps
    eval_steps=4000,
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=1,    # batch size for evaluation
    warmup_steps=2500,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    #load_best_model_at_end=True,
)

# Start from the pretrained checkpoint, configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained("Seznam/small-e-czech", num_labels=2)

trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    # Evaluate during training on the validation split. The test split is
    # reserved for trainer.predict(test_dataset), so the reported test metrics
    # come from data that played no part in training-time evaluation.
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/Seznam/small-e-czech/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/5757afa75f84a9d8eae0cab2cc71caa7fa734ce29a3d1460d8ff138877a6272c.b4542522909d7c3467d6bb947216c79cef9ed619c6fb7c1b732d20e768f0c674
Model config ElectraConfig {
  "_name_or_path": "Seznam/small-e-czech",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hid

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
4000,0.8731,0.838835,0.8234,0.803605,0.856,0.828975
8000,0.7242,0.979783,0.798,0.944511,0.6332,0.758142
12000,0.6409,0.736621,0.8585,0.863812,0.8512,0.857459
16000,0.7992,0.729353,0.863,0.86242,0.8638,0.86311


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3

TrainOutput(global_step=16000, training_loss=0.7725331554412842, metrics={'train_runtime': 1865.2655, 'train_samples_per_second': 8.578, 'train_steps_per_second': 8.578, 'total_flos': 45968188800000.0, 'train_loss': 0.7725331554412842, 'epoch': 1.0})

In [None]:
# Run the current trainer over the test split; the returned prediction output
# is displayed as the cell result.
trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 10000
  Batch size = 1


PredictionOutput(predictions=array([[-3.3468058,  3.6649668],
       [ 3.0012815, -3.4552958],
       [ 2.9640367, -3.4117818],
       ...,
       [ 2.426494 , -2.7394288],
       [-3.307341 ,  3.6172903],
       [-2.2921898,  2.4988637]], dtype=float32), label_ids=array([1, 0, 0, ..., 1, 1, 1]), metrics={'test_loss': 0.7293527126312256, 'test_accuracy': 0.863, 'test_precision': 0.8624201277955271, 'test_recall': 0.8638, 'test_f1': 0.8631095123900879, 'test_runtime': 142.7206, 'test_samples_per_second': 70.067, 'test_steps_per_second': 70.067})

## 2. Experiment

In [None]:
from transformers import Trainer, TrainingArguments, ElectraForSequenceClassification, AutoModelForSequenceClassification

# Experiment 2: one epoch, batch size 1, 5000 warmup steps, evaluation every
# 10000 steps.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="steps",     # evaluate every `eval_steps` steps
    eval_steps=10000,
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=1,    # batch size for evaluation
    warmup_steps=5000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    #load_best_model_at_end=True,
)

# Start from the pretrained checkpoint, configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained("Seznam/small-e-czech", num_labels=2)

trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    # Evaluate during training on the validation split. The test split is
    # reserved for trainer.predict(test_dataset), so the reported test metrics
    # come from data that played no part in training-time evaluation.
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Run the current trainer over the test split; the returned prediction output
# is displayed as the cell result.
trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 10000
  Batch size = 1


PredictionOutput(predictions=array([[ 3.7413886, -3.9276087],
       [ 2.7627022, -2.8717153],
       [ 3.763007 , -3.9434583],
       ...,
       [ 3.7326827, -3.9196742],
       [ 3.7301762, -3.901807 ],
       [-2.7818615,  2.990993 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 1]), metrics={'test_loss': 0.6246399879455566, 'test_accuracy': 0.8908, 'test_precision': 0.8710596278009874, 'test_recall': 0.9174, 'test_f1': 0.8936294564582116, 'test_runtime': 144.2598, 'test_samples_per_second': 69.319, 'test_steps_per_second': 69.319})

## 3. Experiment

In [None]:
from transformers import Trainer, TrainingArguments, ElectraForSequenceClassification, AutoModelForSequenceClassification

# Experiment 3: three epochs, batch size 16, 500 warmup steps, evaluation
# every 500 steps, best checkpoint reloaded when training ends.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="steps",     # evaluate every `eval_steps` steps
    eval_steps=500,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,     # reload the best-evaluated checkpoint after training
)

# Start from the pretrained checkpoint, configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained("Seznam/small-e-czech", num_labels=2)

trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    # With load_best_model_at_end=True the evaluation results decide which
    # checkpoint is kept, so evaluation must use the validation split.
    # Selecting the checkpoint on the test split would make the later
    # trainer.predict(test_dataset) metrics optimistically biased.
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/Seznam/small-e-czech/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/5757afa75f84a9d8eae0cab2cc71caa7fa734ce29a3d1460d8ff138877a6272c.b4542522909d7c3467d6bb947216c79cef9ed619c6fb7c1b732d20e768f0c674
Model config ElectraConfig {
  "_name_or_path": "Seznam/small-e-czech",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hid

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.5901,0.431672,0.8162,0.784046,0.8728,0.826046
1000,0.3859,0.338738,0.8546,0.822716,0.904,0.861445
1500,0.3282,0.315029,0.8697,0.847136,0.9022,0.873801
2000,0.3194,0.280348,0.8802,0.87187,0.8914,0.881527
2500,0.3034,0.261613,0.8878,0.872885,0.9078,0.89
3000,0.2324,0.299452,0.8835,0.851384,0.9292,0.888591
3500,0.2305,0.298726,0.8913,0.864678,0.9278,0.895128
4000,0.2199,0.313763,0.8868,0.848092,0.9424,0.892762
4500,0.2316,0.30236,0.8823,0.834588,0.9536,0.890133
5000,0.2161,0.267262,0.8955,0.866815,0.9346,0.899432


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_m

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.5901,0.431672,0.8162,0.784046,0.8728,0.826046
1000,0.3859,0.338738,0.8546,0.822716,0.904,0.861445
1500,0.3282,0.315029,0.8697,0.847136,0.9022,0.873801
2000,0.3194,0.280348,0.8802,0.87187,0.8914,0.881527
2500,0.3034,0.261613,0.8878,0.872885,0.9078,0.89
3000,0.2324,0.299452,0.8835,0.851384,0.9292,0.888591
3500,0.2305,0.298726,0.8913,0.864678,0.9278,0.895128
4000,0.2199,0.313763,0.8868,0.848092,0.9424,0.892762
4500,0.2316,0.30236,0.8823,0.834588,0.9536,0.890133
5000,0.2161,0.267262,0.8955,0.866815,0.9346,0.899432


Saving model checkpoint to ./results/checkpoint-5500
Configuration saved in ./results/checkpoint-5500/config.json
Model weights saved in ./results/checkpoint-5500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-6000
Configuration saved in ./results/checkpoint-6000/config.json
Model weights saved in ./results/checkpoint-6000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-6500
Configuration saved in ./results/checkpoint-6500/config.json
Model weights saved in ./results/checkpoint-6500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-7000
Configuration saved in ./results/checkpoint-7000/config.json
Model weights saved in ./results/checkpoint-7000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batc

TrainOutput(global_step=7593, training_loss=0.25824740864645584, metrics={'train_runtime': 1385.3549, 'train_samples_per_second': 87.682, 'train_steps_per_second': 5.481, 'total_flos': 348984743346000.0, 'train_loss': 0.25824740864645584, 'epoch': 3.0})

In [None]:
# Run the current trainer over the test split; the returned prediction output
# is displayed as the cell result.
trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 10000
  Batch size = 16


PredictionOutput(predictions=array([[-2.1884203 ,  2.3354883 ],
       [ 1.9350162 , -2.2224708 ],
       [ 2.1401632 , -2.44996   ],
       ...,
       [-0.09943626,  0.11052746],
       [-1.0771567 ,  1.1682254 ],
       [-0.67679715,  0.7417249 ]], dtype=float32), label_ids=array([1, 0, 0, ..., 1, 1, 1]), metrics={'test_loss': 0.26161304116249084, 'test_accuracy': 0.8878, 'test_precision': 0.8728846153846154, 'test_recall': 0.9078, 'test_f1': 0.89, 'test_runtime': 22.2729, 'test_samples_per_second': 448.976, 'test_steps_per_second': 28.061})

## 4. Experiment

In [None]:
from transformers import Trainer, TrainingArguments, ElectraForSequenceClassification, AutoModelForSequenceClassification

# Experiment 4: three epochs, batch size 32, 500 warmup steps, evaluation
# every 500 steps, best checkpoint reloaded when training ends.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="steps",     # evaluate every `eval_steps` steps
    eval_steps=500,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,     # reload the best-evaluated checkpoint after training
)

# Start from the pretrained checkpoint, configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained("Seznam/small-e-czech", num_labels=2)

trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    # With load_best_model_at_end=True the evaluation results decide which
    # checkpoint is kept, so evaluation must use the validation split.
    # Selecting the checkpoint on the test split would make the later
    # trainer.predict(test_dataset) metrics optimistically biased.
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/Seznam/small-e-czech/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/5757afa75f84a9d8eae0cab2cc71caa7fa734ce29a3d1460d8ff138877a6272c.b4542522909d7c3467d6bb947216c79cef9ed619c6fb7c1b732d20e768f0c674
Model config ElectraConfig {
  "_name_or_path": "Seznam/small-e-czech",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hid

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.562,0.412734,0.8229,0.772122,0.9162,0.838013
1000,0.3504,0.306464,0.8672,0.839121,0.9086,0.872479
1500,0.2882,0.324021,0.8645,0.815148,0.9428,0.874339
2000,0.2414,0.258393,0.89,0.865579,0.9234,0.893555
2500,0.2327,0.251818,0.8955,0.875166,0.9226,0.898257
3000,0.1881,0.316629,0.8837,0.840582,0.947,0.890624
3500,0.1759,0.306419,0.8814,0.835385,0.95,0.889014


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_m

TrainOutput(global_step=3798, training_loss=0.28147689614439086, metrics={'train_runtime': 855.7392, 'train_samples_per_second': 141.947, 'train_steps_per_second': 4.438, 'total_flos': 348984743346000.0, 'train_loss': 0.28147689614439086, 'epoch': 3.0})

In [None]:
# Run the current trainer over the test split; the returned prediction output
# is displayed as the cell result.
trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 10000
  Batch size = 32


PredictionOutput(predictions=array([[-1.9429344 ,  2.6439602 ],
       [ 1.7200942 , -2.2398076 ],
       [ 2.251006  , -2.9004934 ],
       ...,
       [ 0.34885728, -0.45480007],
       [-1.3957477 ,  1.9016438 ],
       [-1.2087216 ,  1.6491921 ]], dtype=float32), label_ids=array([1, 0, 0, ..., 1, 1, 1]), metrics={'test_loss': 0.2518179714679718, 'test_accuracy': 0.8955, 'test_precision': 0.8751660026560425, 'test_recall': 0.9226, 'test_f1': 0.8982572290916171, 'test_runtime': 17.6342, 'test_samples_per_second': 567.078, 'test_steps_per_second': 17.75})

# Save model

In [None]:
# Serialize the whole model object to the file 'modelfirstCZ'.
# NOTE(review): torch.save pickles the object it is given, so reading this file
# back presumably needs the same model classes importable and should only be
# done with trusted files — confirm before sharing the artifact.
torch.save(obj=model, f='modelfirstCZ')

# Testing on my own data

In [None]:
# Testing it on my own data
# Read one example per line. pd.read_fwf is meant for fixed-width columns and
# splits lines at whitespace positions shared by all rows, so taking df[0]
# kept only the leading part of longer lines; `delimeter` was also a
# misspelled keyword. Reading the lines directly keeps each sentence whole.
with open('my_test.txt', encoding='utf-8') as handle:
    lines = [line.strip() for line in handle if line.strip()]
df = pd.DataFrame({0: lines})

# Every sentence starts out with label 1 ...
my_data = pd.DataFrame({'text': df[0], 'label': 1})
# ... and rows 1 and 2 are relabelled 0. A single .loc assignment writes into
# my_data itself rather than into a temporary slice, which avoids the
# SettingWithCopyWarning. .loc label slices include both ends, so 1:2 covers
# the same rows as the positional slice [1:3].
my_data.loc[1:2, 'label'] = 0
my_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,text,label
0,Film byl skvělý.,1
1,"Nedoporučuji, nestalo to za nic, ztráta času.",0
2,Strašně mě bavilo koukat na 2 hodiny o ničem.,0
3,"Bylo to jako mé studium na gymnáziu, plné výzev,",1


In [None]:
# Split the hand-written examples into parallel lists of texts and labels.
my_data_x, my_label_y = (list(my_data[column]) for column in ('text', 'label'))

In [None]:
# Tokenize the hand-written examples, truncating to and padding up to a
# length of 50 (padding="max_length"), then wrap them with their labels.
my_encodings = tokenizer(
    my_data_x,
    truncation=True,
    padding="max_length",
    max_length=50,
)
my_dataset = PrepareDataset(encodings=my_encodings, labels=my_label_y)

In [None]:
# Run the current trainer's model over the hand-written examples.
outputs = trainer.predict(my_dataset)
# Predicted class = index of the largest score in each row of the predictions.
y_pred = outputs.predictions.argmax(axis=1)
# Only a cell's last expression is displayed, so the former bare `outputs`
# line in the middle of the cell showed nothing and has been removed.
y_pred


***** Running Prediction *****
  Num examples = 4
  Batch size = 1


array([1, 0, 0, 1])