In [1]:
import pandas as pd
import numpy as np
import json
import os

import torch
from torch.utils.data import TensorDataset
from torch.optim import AdamW

from tqdm.notebook import tqdm

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup # AdamW (deprec)


# 1. Fine-tune rubert-tiny2 for binary classification (whether the post expresses any value)

batch_size = 8, epochs = 5

In [2]:
# pretrained checkpoint that is fine-tuned below
BASE_BERT = 'cointegrated/rubert-tiny2'

# class balance: the 'balanced' weight of the majority class is divided by this factor
major_class_exrta_w = 1.2

# data loading: every text is padded/truncated to max_length tokens (int(2048*0.75) == 1536)
max_length = int(2048*0.75)
batch_size = 8

# number of training epochs (also used to size the LR schedule)
epochs = 5

# seed for the random / numpy / torch generators
seed_val = 42

# output directory for checkpoints and the exported model; the name encodes the hyperparameters
bert_path = f'./temp/models/v2_{batch_size}_{max_length}_{major_class_exrta_w}/'

###################
# index of the CUDA device to use when a GPU is available (GPU #0)
device_id = '0'

In [65]:
# Create the output directory tree: the model directory and its 'epochs'
# sub-directory for per-epoch checkpoints, including any missing parents.
# exist_ok=True keeps the cell safe to re-run, while real failures
# (permissions, invalid path) now raise instead of being silently ignored
# and surfacing much later when the first checkpoint is saved.
os.makedirs(os.path.join(bert_path, 'epochs'), exist_ok=True)

In [68]:
# Use the configured CUDA device when a GPU is visible, otherwise fall back to the CPU.
device_name = f"cuda:{device_id}" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda', index=3)

In [None]:
# The annotated dataset is not public: the path below is intentionally empty
# and has to be filled in with a '|'-separated CSV containing the columns
# 'text' and 'final_label'.
df = (
    pd.read_csv('', sep="|", encoding='utf-8')[['text', 'final_label']]
    .rename(columns={'final_label': 'label'})
)
print (df.shape[0])
df.head()

5035


Unnamed: 0,text,label
0,"Думаете, что умеете пользоваться фотошопом?...",0.0
1,...Самое страшное - это когда ты стоишь под х...,1.0
2,Друзья мои! Поддержим дочку моей подруги! Про...,1.0
3,"Мой новый дневник, читаем, коментим :)",0.0
4,РУССКИЙ КРЫМ - МИФ для быдла! (о чем молчат ...,0.0


In [3]:
# labels are read from the CSV as floats (0.0 / 1.0 in the preview above); cast them to int
df.label = df.label.astype('int')

In [4]:
# Merge the 'spam' class into the 'doesn't reflect' class: label 3 becomes label 0.
df['label'] = df['label'].replace(3, 0)
df['label'].value_counts()


0    3301
1    1734
Name: label, dtype: int64

## TRAIN

In [72]:
possible_labels = df.label.unique()

# Map every distinct label to a consecutive index, in order of first appearance.
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
label_dict

{0: 0, 1: 1}

In [73]:
# re-encode the labels as the consecutive class indices stored in label_dict
df['label'] = df.label.replace(label_dict)

In [76]:
from sklearn.utils.class_weight import compute_class_weight
# 'balanced' class weights. np.unique returns the classes sorted, so class_w[i]
# is guaranteed to be the weight of class index i — the layout the weighted
# CrossEntropyLoss expects. df.label.unique() is ordered by first appearance
# and only matched that layout by accident of the row order.
class_w = compute_class_weight('balanced', classes=np.unique(df.label), y=df.label)

# The majority class has the smallest balanced weight; shrink it further by the
# configured factor to shift the balance towards the minority class.
major_class_idx = np.argmin(class_w)
class_w[major_class_idx] = class_w[major_class_idx] / major_class_exrta_w
class_w

array([0.63553974, 1.45184544])

In [77]:
# features (raw post text) and targets (encoded labels)
X, Y = df.text, df.label
X.shape, Y.shape

((5035,), (5035,))

In [78]:
# split into train and test
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so both parts keep the original class ratio.
features, targets = X.values, Y.values
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=.20,
    stratify=targets,
    random_state=87,
)

In [79]:
# sanity check: sizes of both splits and their sum (should equal the number of rows in df)
X_train.shape , X_val.shape , X_train.shape[0] + X_val.shape[0]

((4028,), (1007,), 5035)

In [80]:
%%time


tokenizer = BertTokenizer.from_pretrained(BASE_BERT, do_lower_case=False)

# Both splits are encoded with exactly the same settings: every text is
# truncated / padded to max_length tokens and returned as PyTorch tensors
# (input ids + attention mask, no token type ids).
encode_kwargs = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    padding='max_length',  # could also be set to True
    truncation=True,
    max_length=max_length,
    return_tensors='pt',
)

encoded_data_train = tokenizer.batch_encode_plus(X_train, **encode_kwargs)
encoded_data_val = tokenizer.batch_encode_plus(X_val, **encode_kwargs)

CPU times: user 4.74 s, sys: 128 ms, total: 4.87 s
Wall time: 5.6 s


In [81]:
# Unpack the tokenizer output and pair it with the labels of each split.
input_ids_train, attention_masks_train = (
    encoded_data_train[key] for key in ('input_ids', 'attention_mask')
)
labels_train = torch.tensor(y_train)

input_ids_val, attention_masks_val = (
    encoded_data_val[key] for key in ('input_ids', 'attention_mask')
)
labels_val = torch.tensor(y_val)

# Each dataset item is the triple (input_ids, attention_mask, label).
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [82]:
# Pretrained encoder with a classification head that has one output per
# entry of label_dict; attentions and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    BASE_BERT,
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elemen

In [83]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler



# Training batches are drawn in random order, validation batches in dataset order.
train_sampler = RandomSampler(dataset_train)
val_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)
dataloader_validation = DataLoader(dataset_val, sampler=val_sampler, batch_size=batch_size)

In [84]:
# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# Linear schedule without warm-up, sized for one scheduler step per training batch.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [85]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the (weighted, macro) F1 scores of the arg-max predictions.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: array of true class indices.

    Returns:
        Tuple ``(weighted_f1, macro_f1)``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    weighted_f1 = f1_score(true_classes, predicted_classes, average='weighted')
    macro_f1 = f1_score(true_classes, predicted_classes, average='macro')
    return weighted_f1, macro_f1

def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, how many of its samples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: array of true class indices.

    Class indices are translated back to the original labels through the
    module-level ``label_dict``.
    """
    index_to_label = {index: original for original, index in label_dict.items()}

    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()

    for label in np.unique(true_classes):
        # predictions made for the samples that truly belong to this class
        class_predictions = predicted_classes[true_classes == label]
        correct = len(class_predictions[class_predictions == label])
        total = len(true_classes[true_classes == label])
        print(f'Class: {index_to_label[label]}')
        print(f'Accuracy: {correct}/{total} = {correct/total}\n')

In [86]:
import random
# Seed every random number generator in use (python, numpy, torch CPU, torch CUDA).
# NOTE(review): this cell runs after the model was instantiated, so the random
# initialisation of the classification head is not covered by this seed.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [87]:
def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` without gradient tracking.

    Args:
        dataloader_val: iterable of batches whose first three elements are
            (input_ids, attention_mask, labels).

    Returns:
        Tuple ``(average_loss, predictions, true_vals)``: the mean of the
        per-batch losses reported by the model itself, the concatenated
        logits and the concatenated labels (both as numpy arrays).
    """
    model.eval()

    running_loss = 0
    collected_logits, collected_labels = [], []

    for batch in dataloader_val:
        input_ids, attention_mask, labels = batch[:3]
        inputs = {
            'input_ids': input_ids.to(device),
            'attention_mask': attention_mask.to(device),
            'labels': labels.to(device),
        }

        with torch.no_grad():
            outputs = model(**inputs)

        # the model returns its own loss first and the logits second
        loss, logits = outputs[0], outputs[1]
        running_loss += loss.item()

        collected_logits.append(logits.detach().cpu().numpy())
        collected_labels.append(inputs['labels'].cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(collected_logits, axis=0)
    true_vals = np.concatenate(collected_labels, axis=0)

    return loss_val_avg, predictions, true_vals

In [88]:
# save
def save_checpoint(model, optimizer, output_model):
    """Write the state dicts of ``model`` and ``optimizer`` to ``output_model``.

    The file holds a dict with the keys ``'model_state_dict'`` and
    ``'optimizer_state_dict'``.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, output_model)

# save(model, optimizer)

In [89]:
# Class-weighted cross-entropy: class_w[i] is the weight of class index i.
criterion = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(class_w), reduction = 'mean').to(device)

# Per-epoch metrics, keyed by the 0-based epoch number. The checkpoint of
# epoch N is written to '<bert_path>epochs/N', so the same key addresses both.
scores = {}

# Iterate over the configured number of epochs. The LR scheduler was sized
# with `epochs`, so the loop must use the same value rather than a literal.
for epoch in tqdm(range(epochs)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        # batch is the (input_ids, attention_mask, labels) triple of the TensorDataset
        inputs = {'input_ids':      batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'labels':         batch[2].to(device),
                 }

        outputs = model(**inputs)
        logits = outputs['logits']

        # Optimise the class-weighted loss, not the unweighted loss the model computes itself.
        loss = criterion(logits, inputs['labels'])
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # The loss is already averaged over the batch (reduction='mean').
        # len(batch) is the number of tensors in the triple (3), not the
        # batch size, so dividing by it only distorted the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # NOTE(review): evaluate() reports the model's own (unweighted) loss, so the
    # validation loss is not on the same scale as the weighted training loss.
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1_w, val_f1_macro  = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted/Macro): {(val_f1_w, val_f1_macro)}')
    accuracy_per_class(predictions, true_vals)

    scores[epoch] = {'Training loss':loss_train_avg,
                     'Validation loss':val_loss,
                     'F1 Score (Weighted/Macro)':(val_f1_w, val_f1_macro)
                    }

    # save checkpoint
    save_checpoint(model, optimizer, bert_path+f'epochs/{epoch}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 0:   0%|          | 0/504 [00:00<?, ?it/s]


Epoch 0
Training loss: 0.6455677735900122
Validation loss: 0.6234295635469376
F1 Score (Weighted/Macro): (0.656251957711443, 0.6468783185467086)
Class: 0
Accuracy: 370/660 = 0.5606060606060606

Class: 1
Accuracy: 284/347 = 0.8184438040345822



Epoch 1:   0%|          | 0/504 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.5627760663037262
Validation loss: 0.5686113704291601
F1 Score (Weighted/Macro): (0.7084040459663059, 0.6914517516401745)
Class: 0
Accuracy: 442/660 = 0.6696969696969697

Class: 1
Accuracy: 264/347 = 0.760806916426513



Epoch 2:   0%|          | 0/504 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.5233887325794924
Validation loss: 0.556405167849291
F1 Score (Weighted/Macro): (0.7316920650851454, 0.7120496307531641)
Class: 0
Accuracy: 476/660 = 0.7212121212121212

Class: 1
Accuracy: 255/347 = 0.7348703170028819



Epoch 3:   0%|          | 0/504 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.5005263998690579
Validation loss: 0.563113215660292
F1 Score (Weighted/Macro): (0.7297478046872117, 0.7099630338745637)
Class: 0
Accuracy: 475/660 = 0.7196969696969697

Class: 1
Accuracy: 254/347 = 0.7319884726224783



Epoch 4:   0%|          | 0/504 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.4895340929340039
Validation loss: 0.5619315552333045
F1 Score (Weighted/Macro): (0.7324875421414411, 0.7124270358566647)
Class: 0
Accuracy: 479/660 = 0.7257575757575757

Class: 1
Accuracy: 253/347 = 0.729106628242075



In [None]:
# Persist the per-epoch metrics next to the checkpoints.
with open(bert_path+f'epochs/scores.json', 'w') as f:
    json.dump(scores, f)

# Pick the epoch with the lowest validation loss. `scores` and the checkpoint
# files share the same 0-based epoch keys, so the key is used directly.
# The previous `np.argmin(...) + 1` pointed one epoch past the best one and
# at a non-existent file whenever the last epoch was the best.
best_epoch = min(scores, key=lambda e: scores[e]['Validation loss'])
print(f'best_epoch #{best_epoch}')

# Restore the best weights (and the matching optimizer state), then export
# the model and tokenizer in the Hugging Face format.
checkpoint = torch.load(bert_path+f'epochs/{best_epoch}', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
model.save_pretrained(bert_path)
tokenizer.save_pretrained(bert_path)

In [102]:
#! SAVE MODEL !
# Export the current model and tokenizer to the output directory, then show the path.
for artifact in (model, tokenizer):
    artifact.save_pretrained(bert_path)
bert_path

'./temp/models/v2_8_1536_1.2/'

In [None]:
# re-run validation with the currently loaded weights and print the per-class accuracy
# (the average loss returned first is not needed here)
_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

## export

In [None]:
# directory the fine-tuned model is loaded from for the embedding export
# NOTE(review): differs from bert_path used for saving above — presumably the files were copied here; confirm
rc_path = '../models/fine-tuned_rubert-tiny2/v2_8_1536_1.2/'

In [13]:
from transformers import BertModel
from transformers import BertTokenizer

In [16]:
# Load the fine-tuned checkpoint as a bare BertModel and its tokenizer.
# NOTE: this rebinds `model` and `tokenizer`, replacing the objects used during training.
model = BertModel.from_pretrained(rc_path)
tokenizer = BertTokenizer.from_pretrained(rc_path)

Some weights of the model checkpoint at ../../TopicModel/Distilbert_classification/models/v2_8_1536_1.2/ were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Take embeddings from fine-tuned rubert-tiny2

In [17]:
# model used by embed_bert_cls below to produce the embeddings
model_emb = model.base_model

In [18]:
def embed_bert_cls(text, model, tokenizer):
    """Return the L2-normalised embedding of the first token of ``text``.

    Args:
        text: input passed to ``tokenizer`` (padded and truncated).
        model: encoder exposing ``.device`` and returning an object with
            ``last_hidden_state``.
        tokenizer: callable producing a mapping of PyTorch tensors.

    Returns:
        1-D numpy array: the normalised first-token vector of the first
        sequence in the batch.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # move every tokenizer tensor to the device the model lives on
    on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state
    # position 0 of every sequence, scaled to unit length
    first_token_vectors = torch.nn.functional.normalize(hidden_states[:, 0, :])
    return first_token_vectors[0].cpu().numpy()

In [19]:
%%time
from tqdm.notebook import tqdm
# One embedding per training text, in the row order of df.
# (The unused counter `it` was dropped and the append loop replaced by a comprehension.)
emb_list = [embed_bert_cls(s, model_emb, tokenizer) for s in tqdm(df.text.values)]

  0%|          | 0/5035 [00:00<?, ?it/s]

CPU times: user 55min 26s, sys: 1min 32s, total: 56min 58s
Wall time: 1min 24s


In [20]:
# Embeddings for the train dataset: one row per text, one column per embedding dimension.
labels = df["label"]

emb_list_ = list(map(np.asarray, emb_list))
df_emb_ = pd.DataFrame(emb_list_)
df_emb_

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,-0.018464,-0.025443,-0.005412,-0.067210,-0.008098,-0.042848,0.048566,0.038053,-0.051750,0.016248,...,0.013072,-0.137592,-0.063010,-0.102285,-0.017897,0.058684,-0.000319,0.078772,-0.005704,0.041218
1,0.087185,-0.013556,0.034466,-0.041398,-0.009335,0.038518,0.072062,-0.126953,0.072556,-0.010305,...,-0.009245,0.031439,0.044652,0.031633,-0.081807,-0.021026,-0.011231,0.007678,-0.057428,0.005134
2,0.121493,0.024607,-0.038873,-0.034820,-0.033760,-0.023758,0.055283,-0.082305,-0.030336,-0.063620,...,0.074011,0.065398,0.025788,0.005126,0.049230,-0.076790,0.114930,0.083010,0.026723,-0.053610
3,-0.020010,0.041242,-0.016424,-0.036992,-0.039034,0.003855,0.005490,0.042822,-0.038598,-0.078856,...,-0.016536,-0.062215,-0.011497,-0.130887,0.024420,0.049948,-0.068521,-0.050156,0.030229,0.039134
4,0.026711,-0.061871,0.018863,-0.026486,0.046739,-0.060128,0.027036,0.004546,-0.053569,0.005665,...,-0.000268,-0.096092,-0.059798,-0.059040,0.025562,0.083468,0.007707,0.097353,0.100729,-0.048607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5030,0.090720,0.068379,0.007549,-0.037703,0.008094,-0.022078,0.057416,0.074114,0.012745,0.036362,...,0.039732,-0.056965,-0.118393,0.032156,0.018497,0.021067,0.037554,0.101765,0.023785,0.004840
5031,0.143055,0.031883,0.000566,-0.107946,-0.004803,-0.010487,0.043309,0.042991,-0.025998,0.029961,...,0.030708,0.031169,0.063458,-0.007555,-0.065752,0.068371,-0.064800,0.034165,0.053517,-0.010068
5032,0.124135,0.004982,-0.001954,-0.043581,-0.027300,-0.040936,0.006083,-0.125258,-0.009013,-0.044418,...,0.051877,0.048784,0.106491,0.048471,-0.019057,-0.011693,0.046124,-0.033521,0.045135,-0.047027
5033,0.050855,0.037297,0.013623,-0.040512,-0.012929,-0.042989,0.068927,0.009016,0.003902,0.060874,...,0.009632,-0.044231,0.059576,-0.017738,-0.072191,-0.035146,-0.112178,0.042907,-0.006640,-0.064849


In [None]:
# Load test dataset

# The test set is not public either: fill in the path to a '|'-separated CSV
# with the columns 'text' and 'final_label'.
test_data = (
    pd.read_csv('', sep="|", encoding='utf-8')[['text', 'final_label']]
    .rename(columns={'final_label': 'label_test'})
)
print (test_data.shape[0])
test_data.head()

804


Unnamed: 0,text,label_test
0,"- интересный новый сервис, где можно оставить...",1
1,чет как-то нерадостно все это...особо на фоне...,0
2,#Repost with . ・・・ жаль что быстро убежала!!!#...,0
3,#hellomyearth #дорогажизни #разорванноекольцо,0
4,#ВтандемеСМамой#кактампробка#😁,0


In [22]:
# Same class merge as for the training data: label 3 becomes label 0.
test_data['label_test'] = test_data['label_test'].replace(3, 0)
test_data['label_test'].value_counts()

0    532
1    272
Name: label_test, dtype: int64

In [23]:
%%time
from tqdm.notebook import tqdm
# One embedding per test text, in the row order of test_data.
# (The unused counter `it` was dropped and the append loop replaced by a comprehension.)
emb_list_test = [embed_bert_cls(s, model_emb, tokenizer) for s in tqdm(test_data.text.values)]

  0%|          | 0/804 [00:00<?, ?it/s]

CPU times: user 8min 20s, sys: 17.4 s, total: 8min 37s
Wall time: 12.9 s


In [None]:
# Embeddings for the test dataset: one row per text, one column per embedding dimension.
labels_test = test_data['label_test']

emb_list_test_ = list(map(np.asarray, emb_list_test))
df_emb_test_ = pd.DataFrame(emb_list_test_)
df_emb_test_

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,-0.047982,-0.007989,0.019273,-0.064445,-0.047470,0.079242,-0.094084,0.037180,0.049691,-0.039148,...,0.050658,-0.002591,-0.030989,0.011156,-0.025440,0.075424,0.062284,0.021909,0.038568,-0.067602
1,0.048708,0.038768,0.008993,-0.043480,-0.011525,-0.074801,0.068448,-0.029761,0.038171,-0.086254,...,0.015159,-0.086703,0.058923,-0.156870,0.034332,0.029744,-0.051479,0.020975,-0.005021,-0.045287
2,0.046934,-0.007776,-0.034438,-0.060966,0.001300,-0.036679,0.076556,-0.046489,-0.026749,-0.019772,...,0.004219,-0.030804,-0.031761,-0.056468,0.068399,0.162177,-0.056490,0.092742,0.047726,-0.021333
3,-0.044973,-0.031754,0.013514,-0.088200,0.005767,0.006182,0.086622,0.048924,-0.041500,-0.050908,...,-0.058033,-0.037874,-0.066198,-0.072266,0.083422,0.040231,-0.079380,0.028450,0.032081,0.027410
4,0.019812,0.003509,0.010675,-0.081504,-0.021689,-0.016962,0.031124,0.072162,-0.058702,-0.041161,...,-0.032649,-0.088169,-0.050347,-0.059466,0.042321,0.087201,-0.081285,0.027309,0.088323,-0.029691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799,0.101509,-0.086583,0.037290,-0.097883,-0.067217,-0.012579,0.081658,0.014439,-0.060402,-0.037441,...,-0.020561,-0.054927,0.003964,-0.058199,0.079025,0.088163,0.000104,0.089325,0.038812,0.045627
800,0.015678,-0.010740,-0.001362,-0.025951,-0.016946,0.006844,-0.015210,0.062379,0.035687,-0.057801,...,-0.027969,-0.004778,-0.050198,-0.040739,0.081978,0.084395,0.013094,0.002331,0.059930,-0.015127
801,0.017429,-0.018824,0.004258,-0.043123,0.043978,-0.012925,-0.038624,0.043852,-0.117673,0.061700,...,0.001252,-0.078440,-0.072294,-0.057007,0.068025,0.063575,-0.018018,0.005897,0.067275,-0.090942
802,-0.004265,0.044574,0.046853,-0.093753,-0.026876,0.010413,-0.004832,0.034466,-0.071080,0.073750,...,0.043476,-0.076574,0.039202,0.005576,-0.015504,0.040497,-0.113724,0.067692,0.015220,-0.046457


# SVM

In [25]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report

import numpy as np
# 'balanced' class weights for the SVM; classes come from np.unique, so
# class_weight[i] belongs to the i-th smallest label
class_weight = compute_class_weight(
    class_weight='balanced', classes=np.unique(labels), y=labels)
class_weight

array([0.76264768, 1.45184544])

In [26]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Wider grid explored earlier:
#   C in [0.1, 1, 10, 100], gamma in [1, 0.1, 0.01, 0.001], kernel in ['rbf', 'poly', 'sigmoid']
# Only the selected combination is kept here.
parameteres = {'C': [1], 'gamma': [1], 'kernel': ['poly']}

# class index -> weight, i.e. {0: class_weight[0], 1: class_weight[1]}
svc_class_weight = dict(enumerate(class_weight))
svc = SVC(class_weight=svc_class_weight, probability=True)

clf = GridSearchCV(svc, param_grid=parameteres, cv=5, scoring='f1_macro')
clf.fit(df_emb_, labels)

In [27]:
# Report the winning parameter set and its cross-validated macro-F1.
print(f"Best parameters from gridsearch: {clf.best_params_}")
print(f"CV score={clf.best_score_:0.3f}")
cv_results = clf.cv_results_


Best parameters from gridsearch: {'C': 1, 'gamma': 1, 'kernel': 'poly'}
CV score=0.759


In [28]:
# per-class probabilities for the test embeddings; columns 0 and 1 are split into
# 'predict_0' / 'predict_1' further below
y_predict=clf.predict_proba(df_emb_test_)

In [29]:
import itertools
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def TestCutoff(df):
    """Scan probability cut-offs and keep the one with the best macro F1.

    For every cut-off in ``np.arange(0.005, 0.901, 0.005)`` the column
    ``df['predict_1']`` is thresholded into 0/1 predictions and compared
    with ``df['label_test']``; per-class and macro precision / recall / F1
    are printed for each cut-off.

    Args:
        df: DataFrame with the columns ``'predict_1'`` and ``'label_test'``.

    Returns:
        The same ``df`` object, modified in place: a ``'predict'`` column holds
        the predictions of the cut-off with the highest macro F1 (the first
        one on ties).
    """
    cut_off_list_ = np.arange(0.005, 0.901, 0.005)

    macro_f1_per_cutoff = []
    second_class_f1_per_cutoff = []
    second_class_recall_per_cutoff = []
    predictions_per_cutoff = []

    for cut_off in cut_off_list_:
        predicted = np.where(df['predict_1']>cut_off, 1, 0)
        predictions_per_cutoff.append(predicted)
        print (cut_off)

        # per-class metrics; index 1 is the second label in sorted order
        precision, recall, f1score = precision_recall_fscore_support(df['label_test'], predicted)[:3]
        print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')
        second_class_f1_per_cutoff.append(f1score[1])
        second_class_recall_per_cutoff.append(recall[1])

        print ("macro:")
        precision, recall, f1score_macro = precision_recall_fscore_support(df['label_test'], predicted, average='macro')[:3]
        print(f'precision: {precision}, recall: {recall}, f1score_macro: {f1score_macro}')
        macro_f1_per_cutoff.append(f1score_macro)
        print (" ")

    # first cut-off reaching the maximal macro F1
    best_position = macro_f1_per_cutoff.index(max(macro_f1_per_cutoff))
    print ("macro", macro_f1_per_cutoff[best_position])
    print ("F1-valued:", second_class_f1_per_cutoff[best_position])
    print ("recall-valued:", second_class_recall_per_cutoff[best_position])
    print (cut_off_list_[best_position])

    df['predict'] = predictions_per_cutoff[best_position]

    return (df)
#         
    

In [None]:
# Split the predicted probabilities into one list per class.
predictions0 = [res[0] for res in y_predict]
predictions1 = [res[1] for res in y_predict]

test_data['predict_0'] = predictions0
test_data['predict_1'] = predictions1

# NOTE(review): the cut-off is selected on the same test set that is used for
# the metrics reported below, so those metrics are presumably optimistic —
# consider tuning the cut-off on a validation split instead.
test_data_predict = TestCutoff(test_data)

best cut-off = 0.42

In [31]:
from sklearn.metrics import precision_recall_fscore_support
# per-class precision / recall / F1 of the predictions stored in test_data['predict'] by TestCutoff
precision, recall, f1score = precision_recall_fscore_support(test_data.label_test, test_data['predict'])[:3]

print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')

precision: [0.875      0.78461538], recall: [0.89473684 0.75      ], f1score: [0.88475836 0.76691729]


In [32]:
# macro-averaged precision / recall / F1 for the same predictions
precision, recall, f1score = precision_recall_fscore_support(test_data.label_test, test_data['predict'], average='macro')[:3]

print(f'precision: {precision}, recall: {recall}, f1score_macro: {f1score}')

precision: 0.8298076923076922, recall: 0.8223684210526316, f1score_macro: 0.8258378287726752


In [34]:
# Save the best estimator found by the grid search.
# NOTE(review): the cut-off selected by TestCutoff is not stored with the model;
# the file name says 'pre-trained' although the embeddings come from the fine-tuned encoder — confirm naming.
import joblib

joblib.dump(clf.best_estimator_, '../models/SVC_for_binary_classification_pre-trained_rubert-tiny2_based.pkl')

['../models/SVC_for_binary_classification_pre-trained_rubert-tiny2_based.pkl']