# Extract embeddings from SBERT and train SVM, LogitBoost, and LogitRegression

In [120]:
from transformers import AutoTokenizer, AutoModel
import torch

#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_mt_nlu_ru")
model = AutoModel.from_pretrained("sberbank-ai/sbert_large_mt_nlu_ru")

Load train dataset

In [1]:
import pandas as pd
fold='../data/data_for_binary_classification/'
file_='chatGPT_3_instr0_withEx_temp0_train_all_updated.csv'

df=pd.read_csv(fold+file_, sep="|", encoding ='utf-8')[['text', 'final_label']] #'label_crowd', 'gpt_result', 
df=df.rename(columns={'final_label':'label'})
print (df.shape[0])
df.head()

5035


Unnamed: 0,text,label
0,"Думаете, что умеете пользоваться фотошопом?...",0.0
1,...Самое страшное - это когда ты стоишь под х...,1.0
2,Друзья мои! Поддержим дочку моей подруги! Про...,1.0
3,"Мой новый дневник, читаем, коментим :)",0.0
4,РУССКИЙ КРЫМ - МИФ для быдла! (о чем молчат ...,0.0


In [2]:
df.label = df.label.astype('int')
df.label=df.label.replace(3, 0)
df.label.value_counts()

0    3301
1    1734
Name: label, dtype: int64

In [3]:
possible_labels = df.label.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict


{0: 0, 1: 1}

In [4]:
df['label'] = df.label.replace(label_dict)

In [126]:
from tqdm.notebook import tqdm as tqdm_n
import numpy as np

def mean_pooling(model_output, attention_mask, norm=True):
    token_embeddings = model_output[0]  #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum([1]), min=1e-9)
    sums = sum_embeddings / sum_mask
    if norm:
        sums = torch.nn.functional.normalize(sums)
    return sums


def embed_bert_pytorch(text, model, tokenizer, emb_type=['cls','mean']):
    t = tokenizer(text, padding=True, truncation=True, max_length=24, return_tensors='pt')
    t = {k: v.to(model.device) for k, v in t.items()}

    with torch.inference_mode():    
        model_output = model(**t)
    
    res_dict = {}
   
    if 'cls' in emb_type:
        e1 = torch.nn.functional.normalize(model_output.last_hidden_state[:, 0, :])
        res_dict['cls'] = e1[0].cpu().numpy()

    if 'mean' in emb_type:
        e2 = mean_pooling(model_output, t['attention_mask'])
        res_dict['mean'] = e2[0].cpu().numpy()
        
        
    return res_dict

def get_emb(embedder, data):
    embs = [embedder(x) for x in tqdm_n(data)] # tqdm
    emb = {}
    for k in embs[0].keys():
        emb[k] = np.stack([row[k] for row in embs])
    return emb

In [127]:
%%time
embs = get_emb(lambda x: embed_bert_pytorch(x, model, tokenizer), df.text.values)

  0%|          | 0/5035 [00:00<?, ?it/s]

CPU times: user 3h 58min 55s, sys: 52.3 s, total: 3h 59min 47s
Wall time: 5min


In [128]:
embs['cls'].shape, embs['mean'].shape

((5035, 1024), (5035, 1024))

In [129]:
embs['cls'][0], embs['mean'][0]

(array([ 0.00929506, -0.01398701,  0.00787013, ..., -0.00742487,
         0.02980503,  0.00831383], dtype=float32),
 array([ 0.00592028, -0.00051701,  0.02079129, ..., -0.00939083,
         0.04377579,  0.00758594], dtype=float32))

In [131]:
from sklearn.model_selection import train_test_split

X_cls_train, X_cls_test, y_train, y_test = train_test_split(
    embs['cls'], df.label, test_size=0.20, random_state=42
)

X_mean_train, X_mean_test, _, _ = train_test_split(
    embs['mean'], df.label, test_size=0.20, random_state=42
)

In [132]:
from sklearn.linear_model import LogisticRegressionCV

Cs = np.logspace(-5, 5, 20)

In [133]:
%%time
clf_cls  = LogisticRegressionCV(Cs=Cs, max_iter=1_000, n_jobs=1, verbose=0).fit(X_cls_train, y_train)
clf_mean = LogisticRegressionCV(Cs=Cs, max_iter=1_000, n_jobs=1, verbose=0).fit(X_mean_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CPU times: user 15min 59s, sys: 25min 32s, total: 41min 32s
Wall time: 39.1 s


In [134]:
import math
from sklearn.metrics import fbeta_score, roc_auc_score, precision_score, recall_score

def get_metrics(y, preds, beta = 5):
    round3 = lambda x: round(x*1000)/1000
    pre = precision_score(y, preds, zero_division=0)
    rec = recall_score(y, preds, zero_division=0)
    pro = (1-min(pre,0.9999999999999))/(1-min(rec,0.9999999999999))
    pro_loss = math.log(abs(beta - pro)+1)
    return {'fbeta':round3(fbeta_score(y, preds, beta=beta, zero_division=0)), 
            'f1':round3(fbeta_score(y, preds, beta=1, zero_division=0)), 
            'roc_auc':round3(roc_auc_score(y, preds)), 
            'precision':round3(pre),
            'recall':round3(rec),
            'pro': round3(pro),
            'pro_loss':pro_loss, 
            'TP': ((y == 1) & (preds == 1)).sum(),
            'TN': ((y == 0) & (preds == 0)).sum(),
            'FP': ((y == 0) & (preds == 1)).sum(),
            'FN': ((y == 1) & (preds == 0)).sum(),
             }

In [135]:
from sklearn.metrics import f1_score

f1_score(y_test, clf_cls.predict_proba(X_cls_test)[:,1] > 0.5)

0.6298003072196622

In [136]:
get_metrics(y_test, clf_cls.predict_proba(X_cls_test)[:,1] > 0.44, beta=1)

{'fbeta': 0.636,
 'f1': 0.636,
 'roc_auc': 0.717,
 'precision': 0.658,
 'recall': 0.616,
 'pro': 0.892,
 'pro_loss': 0.10263123063578138,
 'TP': 225,
 'TN': 525,
 'FP': 117,
 'FN': 140}

In [137]:
get_metrics(y_test, clf_mean.predict_proba(X_mean_test)[:,1] > 0.434, beta=1)

{'fbeta': 0.639,
 'f1': 0.639,
 'roc_auc': 0.718,
 'precision': 0.651,
 'recall': 0.627,
 'pro': 0.938,
 'pro_loss': 0.06032967426580218,
 'TP': 229,
 'TN': 519,
 'FP': 123,
 'FN': 136}

In [6]:
# Load trial dataset
file_test='all_merged_temp0_instr0_withEx_test_final_label.csv'

test_data=pd.read_csv(fold+file_test, sep="|", encoding ='utf-8')[['text', 'final_label']]
test_data=test_data.rename(columns={'final_label':'label_test'})
print (test_data.shape[0])
test_data.label_test=test_data.label_test.astype(int)
test_data.label_test=test_data.label_test.replace(3, 0)
test_data.head()

804


Unnamed: 0,text,label_test
0,"- интересный новый сервис, где можно оставить...",1
1,чет как-то нерадостно все это...особо на фоне...,0
2,#Repost with . ・・・ жаль что быстро убежала!!!#...,0
3,#hellomyearth #дорогажизни #разорванноекольцо,0
4,#ВтандемеСМамой#кактампробка#😁,0


In [7]:
test_data.label_test.value_counts()

0    532
1    272
Name: label_test, dtype: int64

In [151]:
%%time
emb_test = get_emb(lambda x: embed_bert_pytorch(x, model, tokenizer), test_data.text.values)

  0%|          | 0/804 [00:00<?, ?it/s]

CPU times: user 36min 6s, sys: 11 s, total: 36min 17s
Wall time: 45.4 s


In [140]:
embs['cls']

array([[ 0.00929506, -0.01398701,  0.00787013, ..., -0.00742487,
         0.02980503,  0.00831383],
       [-0.04644227, -0.02437294,  0.02493586, ...,  0.04479778,
        -0.02900108, -0.00685201],
       [-0.05861808, -0.00421491, -0.05079638, ...,  0.00123103,
        -0.02409819,  0.03700725],
       ...,
       [-0.03123325,  0.01808644, -0.02161152, ..., -0.00243144,
        -0.00893479, -0.00128236],
       [ 0.02588368,  0.0146559 , -0.03030781, ...,  0.00612109,
        -0.04469183,  0.02309735],
       [-0.00969478, -0.01412101, -0.01880374, ...,  0.01690224,
        -0.01349046,  0.00146537]], dtype=float32)

In [152]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report

import numpy as np
class_weight = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight

array([0.7574276 , 1.47114682])

# SVM

In [153]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# parameteres = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
parameteres = {'C': [1], 'gamma': [1],'kernel': ['poly']}
clf = GridSearchCV(SVC(class_weight={0:class_weight[0], 1:class_weight[1]}, probability=True), param_grid=parameteres , cv=10, scoring='f1_macro')
clf.fit(embs['cls'], df.label)

In [154]:
y_predict=clf.predict_proba(emb_test['cls'])

In [155]:
import itertools
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

def TestCutoff(df):

    cut_off_list_=np.arange(0.005, 0.901, 0.005)
#     cut_off_list = itertools.chain([0.01], cut_off_list_)
    
    f1score_macro_list=[]
    f1score_list=[]
    recall_list=[]
    
    predict_list_list=[[]]
    for i, cut_off in enumerate(cut_off_list_):
        predict_list=np.where(df['predict_1']>cut_off, 1, 0)
        predict_list_list.append(predict_list)
        print (cut_off)
        precision, recall, f1score = precision_recall_fscore_support(df['label_test'], predict_list)[:3]
        print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')
        f1score_list.append(f1score[1])
        recall_list.append(recall[1])
        print ("macro:")
        precision, recall, f1score_macro = precision_recall_fscore_support(df['label_test'], predict_list, average='macro')[:3]
        print(f'precision: {precision}, recall: {recall}, f1score_macro: {f1score_macro}')
        f1score_macro_list.append(f1score_macro)
        print (" ")
    
    max_ind=f1score_macro_list.index(max(f1score_macro_list))   
    print ("macro", f1score_macro_list[max_ind])
    print ("F1-valued:", f1score_list[max_ind])
    print ("recall-valued:", recall_list[max_ind])
    print (cut_off_list_[max_ind])
    
    df['predict']=predict_list_list[max_ind+1]
        
    return (df)
#         
    

In [None]:
predictions0=[]
predictions1=[]


for res in y_predict:
    predictions0.append(res[0])
    predictions1.append(res[1])

    
test_data['predict_0']=predictions0
test_data['predict_1']=predictions1

test_data_predict=TestCutoff(test_data)

In [157]:
from sklearn.metrics import precision_recall_fscore_support
precision, recall, f1score = precision_recall_fscore_support(test_data.label_test, test_data['predict'])[:3]

print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')

precision: [0.84424779 0.76987448], recall: [0.89661654 0.67647059], f1score: [0.86964448 0.72015656]


In [158]:
precision, recall, f1score = precision_recall_fscore_support(test_data.label_test, test_data['predict'], average='macro')[:3]

print(f'precision: {precision}, recall: {recall}, f1score_macro: {f1score}')

precision: 0.8070611322990335, recall: 0.7865435647943388, f1score_macro: 0.7949005203659865


# LogitBoost

In [159]:
from logitboost import LogitBoost
from sklearn.model_selection import GridSearchCV

In [160]:
steps = [('LogitBoost', LogitBoost())]
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps) # define the pipeline object.

parameteres = {'LogitBoost__n_estimators':range(10,100,10)}


grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)


In [161]:
emb_test['cls'].shape

(804, 1024)

In [162]:
grid.fit(embs['cls'], df.label)

In [163]:
print("Best parameters from gridsearch: {}".format(grid.best_params_))
print("CV score=%0.3f" % grid.best_score_)
cv_results = grid.cv_results_
print(cv_results)

Best parameters from gridsearch: {'LogitBoost__n_estimators': 50}
CV score=0.714
{'mean_fit_time': array([ 3.73461342,  7.4055469 , 10.97259603, 14.7951643 , 19.09631443,
       22.78958521, 25.92236528, 30.04713492, 32.93792868]), 'std_fit_time': array([0.06701074, 0.16823047, 0.20264273, 0.35731802, 0.60867683,
       0.5151093 , 0.40347909, 0.57965765, 0.52679355]), 'mean_score_time': array([0.00888124, 0.0151794 , 0.02140398, 0.02689648, 0.03915634,
       0.04470406, 0.05560169, 0.06649499, 0.06598163]), 'std_score_time': array([0.00147477, 0.00141773, 0.00223581, 0.00042711, 0.00533535,
       0.0096013 , 0.00942776, 0.01159643, 0.0103816 ]), 'param_LogitBoost__n_estimators': masked_array(data=[10, 20, 30, 40, 50, 60, 70, 80, 90],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'params': [{'LogitBoost__n_estimators': 10}, {'LogitBoost__n_estimators': 20}, {'LogitBoost__n_estima

In [164]:
y_pred = grid.predict_proba(emb_test['cls'])

In [None]:
predictions0=[]
predictions1=[]


for res in y_pred:
    predictions0.append(res[0])
    predictions1.append(res[1])

    
test_data['predict_0']=predictions0
test_data['predict_1']=predictions1

test_data_predict=TestCutoff(test_data)

In [166]:
from sklearn.metrics import precision_recall_fscore_support
precision, recall, f1score = precision_recall_fscore_support(test_data.label_test, test_data['predict'])[:3]

print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')

precision: [0.84115523 0.736     ], recall: [0.87593985 0.67647059], f1score: [0.85819521 0.70498084]


In [167]:
precision, recall, f1score = precision_recall_fscore_support(test_data.label_test, test_data['predict'], average='macro')[:3]

print(f'precision: {precision}, recall: {recall}, f1score_macro: {f1score}')

precision: 0.7885776173285198, recall: 0.7762052189296771, f1score_macro: 0.7815880273491247


# Logit Regression

In [168]:
from sklearn.linear_model import LogisticRegression

In [169]:
import numpy as np
class_weight = compute_class_weight(
    class_weight='balanced', classes=np.unique(df.label), y=df.label)
class_weight

array([0.76264768, 1.45184544])

In [170]:
parameters = {'C': np.linspace(0.0001, 10, 50), "penalty":["l1","l2"]}  #high C means "Trust this training data a lot", while a low value says "This data may not be fully representative of the real world data, so if it's telling you to make a parameter really large, don't listen to it"
grid_search = GridSearchCV(LogisticRegression(class_weight={0:class_weight[0], 1:class_weight[1]}), parameters, cv=5, scoring='f1_macro')
grid_search.fit(embs['cls'], df.label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [171]:
y_pred = grid_search.predict_proba(emb_test['cls'])

In [None]:
predictions0=[]
predictions1=[]


for res in y_pred:
    predictions0.append(res[0])
    predictions1.append(res[1])

    
test_data['predict_0']=predictions0
test_data['predict_1']=predictions1

test_data_predict=TestCutoff(test_data)

In [175]:
from sklearn.metrics import precision_recall_fscore_support
precision, recall, f1score = precision_recall_fscore_support(test_data.label_test, test_data['predict'])[:3]

print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')

precision: [0.86440678 0.73260073], recall: [0.86278195 0.73529412], f1score: [0.8635936  0.73394495]


In [176]:
precision, recall, f1score = precision_recall_fscore_support(test_data.label_test, test_data['predict'], average='macro')[:3]

print(f'precision: {precision}, recall: {recall}, f1score_macro: {f1score}')

precision: 0.7985037561308748, recall: 0.7990380362671385, f1score_macro: 0.7987692785693943
