In [None]:
pip install sentence_transformers


In [None]:
import os
import calendar

import pandas as pd
import numpy as np 
import gensim
import nltk
import spacy

from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

%matplotlib inline

In [None]:
df_train  = pd.read_csv('/kaggle/input/propaganda-detection-our-data/fake_detection_df_train.csv')
df_test  = pd.read_csv("/kaggle/input/propaganda-detection-our-data/fake_detection_df_test.csv")
df_val  = pd.read_csv("/kaggle/input/propaganda-detection-our-data/fake_detection_df_val.csv")

In [None]:
df_train

In [None]:
df_train["label"].value_counts()

True     544
False    336
Name: label, dtype: int64

In [None]:
from nltk.corpus import stopwords
import re

nltk.download('stopwords')
stop_words = stopwords.words('russian')

def preprocess(text, join_back=True):
    text =  re.sub(r'\n', '', text)
    text = re.sub(r'[^\w\s]','', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = text.lower()

    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words:
            result.append(token)
    if join_back:
        result = " ".join(result)
    return result


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df_train["text_clean"] = df_train["text"].apply(preprocess)
df_test["text_clean"] = df_test["text"].apply(preprocess)
df_val["text_clean"] = df_val["text"].apply(preprocess)

In [None]:
df_train = pd.concat([df_train, df_val], axis=0)
df_train

Unnamed: 0,id,text,label,text_clean
0,00000_03514,Чернигов прилет во многоэтажку. Говорят русска...,True,чернигов прилет многоэтажку говорят русская ра...
1,00002_06059,Председатель Следственного комитета РФ Алексан...,True,председатель следственного комитета рф алексан...
2,00003_08645,Все сейчас массово хотят уехать со Львова.,True,массово хотят уехать львова
3,00004_00901,«К военным подошли бабушки и попросили убрать ...,True,военным подошли бабушки попросили убрать аллею...
4,00006_06251,"С уважение отношусь к Лобаеву, но Владислав, е...",True,уважение отношусь лобаеву владислав вопросы пр...
...,...,...,...,...
122,08848_08668,Принято решение отключать РФ от SWIFT - МИД Ук...,True,принято решение отключать рф swift мид украины...
123,08855_04438,"В ДНР заявили, что украинские силовики обстрел...",True,днр заявили украинские силовики обстреляли сел...
124,08873_06191,Франция намерена принять меры по борьбе с расп...,False,франция намерена принять меры борьбе российски...
125,08878_04343,Макрон созывает на 19:00 мск Совет обороны стр...,False,макрон созывает мск совет обороны страны ситуа...


In [None]:
import warnings

In [None]:
def testing(scaler, classifier):
  model = Pipeline([
        ("scaler", scaler),
        ("classifier", classifier)
    ])
  model.fit(df_train["text_clean"], df_train["label"])
  y_pred = model.predict_proba(df_test["text_clean"])[:,1]
  print("FINAL TESTING")
  print("ROC AUC = ",roc_auc_score(df_test["label"].values,  y_pred))
  print("accuracy :" ,accuracy_score(df_test["label"].values, model.predict(df_test["text_clean"])))

In [None]:
def testing_embs(scaler, classifier):
  model = Pipeline([
        ("scaler", scaler),
        ("classifier", classifier)
    ])
  model.fit(train_embs, df_train["label"])
  y_pred = model.predict_proba(test_embs)[:,1]
  print("FINAL TESTING")
  print("ROC AUC = ",roc_auc_score(df_test["label"].values,  y_pred))
  print("accuracy :" ,accuracy_score(df_test["label"].values, model.predict(test_embs)))

In [None]:

!python -m spacy download ru_core_news_lg

 

In [None]:
import numpy as np
import spacy
from sklearn.base import BaseEstimator, TransformerMixin

spacy_nlp = spacy.load('ru_core_news_lg')
# spacy_doc = spacy_nlp(df['comment_text_clean'][0])

class SpacyEmbeddings(TransformerMixin,BaseEstimator):
    def __init__(self, model=spacy_nlp):
        self.model = model

    def fit(self,X,y=None):
        return self

    def transform(self,X):
        data = list(spacy_nlp(doc).vector.reshape(1,-1) for doc in X)
        return np.concatenate(data)

In [None]:
train_embs = SpacyEmbeddings().fit_transform(df_train["text_clean"].to_list())
test_embs = SpacyEmbeddings().fit_transform(df_test["text_clean"].to_list())

# **Log Regression**

In [None]:
from sklearn.model_selection import GridSearchCV

def search_hyper_p_embs_regr(scaler, classifier):

  model = Pipeline([
          ("scaler", scaler),
          ("classifier", classifier)
      ])

  # print(model.get_params().keys())



  params={
      'classifier__C':[0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10],
      'classifier__max_iter':[1000, 10000],
      'classifier__penalty':['l1',  'l2'],
      'classifier__solver': ['lbfgs', 'newton-cg']
  }
    


  grid_search = GridSearchCV(model,
                          param_grid=params,
                          cv=5,
                          scoring='roc_auc'
                          #  verbose=1, 
                          )
  with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search.fit(train_embs, df_train["label"])
  return grid_search.best_params_, grid_search.best_score_

In [None]:
def search_hyper_p_regr(vectorizer, classifier):

  model = Pipeline([
          ("vectorizer", vectorizer),
          ("classifier", classifier)
      ])
  
  params={
      'classifier__C':[0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10],
      'classifier__max_iter':[1000, 10000],
      'classifier__penalty':['l1',  'l2'],
      'classifier__solver': ['lbfgs', 'newton-cg']
  }
  

  grid_search = GridSearchCV(model,
                          param_grid=params,
                          cv=5,
                          verbose=1, scoring='roc_auc',
                          )
  with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

    grid_search.fit(df_train["text_clean"], df_train["label"])

  return grid_search.best_params_, grid_search.best_score_

## BOW

In [None]:
parametrs, acc = search_hyper_p_regr(CountVectorizer(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 
# testing(all_models, max_iter=1000)

testing(CountVectorizer(), LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))



Fitting 5 folds for each of 80 candidates, totalling 400 fits

{'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.724014894347594

FINAL TESTING

ROC AUC =  0.7362880337931489

accuracy : 0.6370967741935484


## TF-IDF 

In [None]:
parametrs, acc = search_hyper_p_regr(TfidfVectorizer(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing(CountVectorizer(), LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))


Fitting 5 folds for each of 80 candidates, totalling 400 fits

{'classifier__C': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.6990285948085322

<class 'float'>

FINAL TESTING

ROC AUC =  0.7267177084020857

accuracy : 0.5604838709677419


## SPACY 

In [None]:
parametrs, acc = search_hyper_p_embs_regr(StandardScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(StandardScaler(), LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))

{'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.6848682436292485

FINAL TESTING

ROC AUC =  0.7325919081248762

accuracy : 0.6370967741935484


In [None]:
parametrs, acc = search_hyper_p_embs_regr(MinMaxScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(MinMaxScaler(), LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))

{'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'} 0.6847322309421889

FINAL TESTING

ROC AUC =  0.7295558048973665

accuracy : 0.6491935483870968







## Sentence Transformers

**distiluse-base-multilingual-cased-v1**

In [None]:
import torch
# torch.cuda.is_available()
     
from sentence_transformers import SentenceTransformer

sent_tr = SentenceTransformer('distiluse-base-multilingual-cased-v1',device="cpu")

train_embs = sent_tr.encode(df_train["text_clean"].to_list())
test_embs = sent_tr.encode(df_test["text_clean"].to_list())

In [None]:
parametrs, acc = search_hyper_p_embs_regr(StandardScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(StandardScaler(), LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))

{'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.7511882477454633
FINAL TESTING
ROC AUC =  0.7792554946868193
accuracy : 0.6693548387096774


In [None]:
parametrs, acc = search_hyper_p_embs_regr(MinMaxScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(MinMaxScaler(), LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))

{'classifier__C': 0.1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'} 0.7550270026613353

FINAL TESTING

ROC AUC =  0.7772094251204541

accuracy : 0.6774193548387096


another model  - sberbank-ai/sbert_large_nlu_ru

In [None]:
import torch
# torch.cuda.is_available()
     
from sentence_transformers import SentenceTransformer

sent_tr = SentenceTransformer('sberbank-ai/sbert_large_nlu_ru',device="cuda")

train_embs = sent_tr.encode(df_train["text_clean"].to_list())
test_embs = sent_tr.encode(df_test["text_clean"].to_list())

In [None]:
parametrs, acc = search_hyper_p_embs_regr(StandardScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(StandardScaler(), LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))

{'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.7711695279253854

FINAL TESTING

ROC AUC =  0.7890898290541879

accuracy : 0.7137096774193549


**DeepPavlov/rubert-base-cased**

In [None]:
import torch
# torch.cuda.is_available()
     
from sentence_transformers import SentenceTransformer

sent_tr = SentenceTransformer('DeepPavlov/rubert-base-cased',device="cuda")

train_embs = sent_tr.encode(df_train["text_clean"].to_list())
test_embs = sent_tr.encode(df_test["text_clean"].to_list())

In [None]:
parametrs, acc = search_hyper_p_embs_regr(StandardScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(StandardScaler(), LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))

{'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.7514657464776311

FINAL TESTING

ROC AUC =  0.8137746683387235

accuracy : 0.7419354838709677


# **NAIVE BAYES**

In [None]:
from sklearn.model_selection import GridSearchCV

def search_hyper_p_embs_bayes(scaler, classifier):

  model = Pipeline([
          ("scaler", scaler),
          ("classifier", classifier)
      ])

  # print(model.get_params().keys())

  params={
      'classifier__alpha':[0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10]
  }

  grid_search = GridSearchCV(model,
                          param_grid=params,
                          cv=5,
                          scoring='roc_auc'
                          #  verbose=1, 
                          )
  with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search.fit(train_embs, df_train["label"])
  return grid_search.best_params_, grid_search.best_score_

In [None]:
def search_hyper_p_bayes(vectorizer, classifier):

  model = Pipeline([
          ("vectorizer", vectorizer),
          ("classifier", classifier)
      ])
    
  params={
      'classifier__alpha':[0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10]
  }


  grid_search = GridSearchCV(model,
                          param_grid=params,
                          cv=5,
                          verbose=1, scoring='roc_auc',
                          )
  with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

    grid_search.fit(df_train["text_clean"], df_train["label"])

  return grid_search.best_params_, grid_search.best_score_

## BOW

In [None]:
parametrs, acc = search_hyper_p_bayes(CountVectorizer(), MultinomialNB())
print(parametrs, acc)
param_1,  = parametrs.values() 

testing(CountVectorizer(), MultinomialNB(alpha = param_1))



Fitting 5 folds for each of 10 candidates, totalling 50 fits

{'classifier__alpha': 4} 0.7302758931253667

FINAL TESTING

ROC AUC =  0.7277407431852683

accuracy : 0.5967741935483871


## TF-IDF


In [None]:
parametrs, acc = search_hyper_p_bayes(TfidfVectorizer(), MultinomialNB())
print(parametrs, acc)
param_1,  = parametrs.values() 

testing(CountVectorizer(), MultinomialNB(alpha = param_1))



Fitting 5 folds for each of 10 candidates, totalling 50 fits

{'classifier__alpha': 2} 0.7061650284286429

FINAL TESTING

ROC AUC =  0.7191604514553495

accuracy : 0.6290322580645161


## SPACY 

In [None]:
parametrs, acc = search_hyper_p_embs_bayes(MinMaxScaler(), MultinomialNB())
print(parametrs, acc)
param_1 , = parametrs.values() 

testing_embs(MinMaxScaler(), MultinomialNB(alpha=param_1))

{'classifier__alpha': 2} 0.657255162247168

FINAL TESTING

ROC AUC =  0.735628011352386

accuracy : 0.5806451612903226


## Sentence Transformers

rubert-base-cased

In [None]:
parametrs, acc = search_hyper_p_embs_bayes(MinMaxScaler(), MultinomialNB())
print(parametrs, acc)
param_1 , = parametrs.values() 

testing_embs(MinMaxScaler(), MultinomialNB(alpha= param_1))

{'classifier__alpha': 0.0001} 0.7260365576698344

FINAL TESTING

ROC AUC =  0.7763513959474622

accuracy : 0.7419354838709677


distiluse-base-multilingual-cased-v1

In [None]:
parametrs, acc = search_hyper_p_embs_bayes(MinMaxScaler(), MultinomialNB())
print(parametrs, acc)
param_1 , = parametrs.values() 

testing_embs(MinMaxScaler(), MultinomialNB(alpha= param_1))

{'classifier__alpha': 5} 0.7332948795627308

FINAL TESTING

ROC AUC =  0.7323939013926473

accuracy : 0.6491935483870968


# **GradientBoostingClassifier**

In [None]:
from sklearn.model_selection import GridSearchCV

def search_hyper_p_embs_gboost(scaler, classifier):

  model = Pipeline([
          ("scaler", scaler),
          ("classifier", classifier)
      ])

  # print(model.get_params().keys())



  params={
      'classifier__loss':['log_loss', 'deviance', 'exponential'],
      'classifier__learning_rate':[0.0001, 0.001, 0.01],
      'classifier__n_estimators':[100, 1000]
  }
    


  grid_search = GridSearchCV(model,
                          param_grid=params,
                          cv=5,
                          scoring='roc_auc'
                          #  verbose=1, 
                          )
  with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search.fit(train_embs, df_train["label"])
  return grid_search.best_params_, grid_search.best_score_

In [None]:
def search_hyper_p_gboost(vectorizer, classifier):

  model = Pipeline([
          ("vectorizer", vectorizer),
          ("classifier", classifier)
      ])
  
  params={
      'classifier__loss':['log_loss', 'deviance', 'exponential'],
      'classifier__learning_rate':[0.0001, 0.001, 0.01, 0.1, 1],
      'classifier__n_estimators':[10, 100, 1000, 10000]
  }
  

  grid_search = GridSearchCV(model,
                          param_grid=params,
                          cv=5,
                          verbose=1, scoring='roc_auc',
                          )
  with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

    grid_search.fit(df_train["text_clean"], df_train["label"])

  return grid_search.best_params_, grid_search.best_score_

## Sentence Transformers

rubert-base-cased

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
parametrs, acc = search_hyper_p_embs_gboost(StandardScaler(), GradientBoostingClassifier())
print(parametrs, acc)
param_1 , param_2, param_3 = parametrs.values() 



{'classifier__learning_rate': 0.01, 'classifier__loss': 'deviance', 'classifier__n_estimators': 1000} 0.7216369721958856


In [None]:
testing_embs(StandardScaler(), GradientBoostingClassifier(loss=param_2, learning_rate=param_1, n_estimators=param_3))

FINAL TESTING
ROC AUC =  0.777407431852683
accuracy : 0.6895161290322581


# **SVM**

In [None]:
from sklearn.model_selection import GridSearchCV

def search_hyper_p_embs_SVM(scaler, classifier):

  model = Pipeline([
          ("scaler", scaler),
          ("classifier", classifier)
      ])

  # print(model.get_params().keys())



  params={
      'classifier__C':[0.1, 1, 10],
      'classifier__kernel':['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
      'classifier__degree':[2, 3, 4, 5],
      'classifier__gamma':['scale', 'auto']
  }
    


  grid_search = GridSearchCV(model,
                          param_grid=params,
                          cv=5,
                          scoring='roc_auc'
                          #  verbose=1, 
                          )
  with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search.fit(train_embs, df_train["label"])
  return grid_search.best_params_, grid_search.best_score_

In [None]:
def search_hyper_p_SVM(vectorizer, classifier):

  model = Pipeline([
          ("vectorizer", vectorizer),
          ("classifier", classifier)
      ])
  
  params={
       'classifier__C':[0.1, 1, 10],
      'classifier__kernel':['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
      'classifier__degree':[2, 3, 4, 5]
  }
  

  grid_search = GridSearchCV(model,
                          param_grid=params,
                          cv=5,
                          verbose=1, scoring='roc_auc',
                          )
  with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

    grid_search.fit(df_train["text_clean"], df_train["label"])

  return grid_search.best_params_, grid_search.best_score_

## **BOW**

In [None]:
parametrs, acc = search_hyper_p_SVM(CountVectorizer(), SVC())
print(parametrs, acc)
param_1, param_2, param_3 = parametrs.values() 

testing(CountVectorizer(), SVC(C = param_1, degree= param_2, kernel=param_3, probability=True))

Fitting 5 folds for each of 60 candidates, totalling 300 fits
{'classifier__C': 1, 'classifier__degree': 2, 'classifier__kernel': 'rbf'} 0.7139053998632946
FINAL TESTING
ROC AUC =  0.7387961190680483
accuracy : 0.6854838709677419


## **TF-IDF**

In [None]:
parametrs, acc = search_hyper_p_SVM(TfidfVectorizer(), SVC())
print(parametrs, acc)
param_1, param_2, param_3 = parametrs.values() 

testing(TfidfVectorizer(), SVC(C = param_1, degree= param_2, kernel=param_3, probability=True))

Fitting 5 folds for each of 60 candidates, totalling 300 fits
{'classifier__C': 1, 'classifier__degree': 2, 'classifier__kernel': 'linear'} 0.6884823319295306
FINAL TESTING
ROC AUC =  0.6836842452643389
accuracy : 0.6612903225806451


## **Sentence Transformers**

rubert-base-cased

In [None]:
from sklearn.svm import SVC


In [None]:
parametrs, acc = search_hyper_p_embs_SVM(StandardScaler(), SVC())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(StandardScaler(), SVC(C=param_1, degree=param_2, gamma=param_3, kernel = param_4,probability=True))

{'classifier__C': 1, 'classifier__degree': 2, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'} 0.7478529138094504
FINAL TESTING
ROC AUC =  0.8172727872747673
accuracy : 0.7379032258064516


In [None]:
# another preprocess without gensim 
parametrs, acc = search_hyper_p_embs_SVM(StandardScaler(), SVC())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(StandardScaler(), SVC(C=param_1, degree=param_2, gamma=param_3, kernel = param_4,probability=True))

{'classifier__C': 1, 'classifier__degree': 2, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'} 0.748324803210372
FINAL TESTING
ROC AUC =  0.8080324731040855
accuracy : 0.7258064516129032


sbert_large_mt_nlu_ru

In [None]:
parametrs, acc = search_hyper_p_embs_SVM(StandardScaler(), SVC())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(StandardScaler(), SVC(C=param_1, degree=param_2, gamma=param_3, kernel = param_4,probability=True))

{'classifier__C': 1, 'classifier__degree': 2, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'} 0.7844463122615923

FINAL TESTING

ROC AUC =  0.8040723384595075

accuracy : 0.7096774193548387

distiluse-base-multilingual-cased-v1

In [None]:
parametrs, acc = search_hyper_p_embs_SVM(StandardScaler(), SVC())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

testing_embs(StandardScaler(), SVC(C=param_1, degree=param_2, gamma=param_3, kernel = param_4,probability=True))

{'classifier__C': 1, 'classifier__degree': 2, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'} 0.7410965713403745

FINAL TESTING

ROC AUC =  0.7725232657910369

accuracy : 0.6854838709677419

# **1. Combination of Sentence Transformers and Logistic Regression gives better results than others algorithms**

**distiluse-base-multilingual-cased-v1**

result only on train data

In [None]:
import pickle

parametrs, acc = search_hyper_p_embs_regr(StandardScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

model = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier",  LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))
    ])
model.fit(train_embs, df_train["label"])

name = 'regression_transf_standard.pkl'
pickle.dump(model, open(name, 'wb'))


{'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.7571054341830454


In [None]:
import pickle

name = 'regression_transf_standard.pkl'

loaded_model = pickle.load(open(name, 'rb'))
y_pred = loaded_model.predict_proba(test_embs)[:,1]
print("FINAL TESTING")
print("ROC AUC = ",roc_auc_score(df_test["label"].values,  y_pred))
print("accuracy :" ,accuracy_score(df_test["label"].values, loaded_model.predict(test_embs)))

FINAL TESTING

ROC AUC =  0.7733152927199525

accuracy : 0.6895161290322581


result on train + val dataset

In [None]:
import pickle

parametrs, acc = search_hyper_p_embs_regr(StandardScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

model = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier",  LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver='lbfgs'))
    ])
model.fit(train_embs, df_train["label"])

name = 'regression_transf_standard.pkl'
pickle.dump(model, open(name, 'wb'))


{'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.7528724979604436


In [None]:
import pickle

name = 'regression_transf_standard.pkl'

loaded_model = pickle.load(open(name, 'rb'))
y_pred = loaded_model.predict_proba(test_embs)[:,1]
print("FINAL TESTING")
print("ROC AUC = ",roc_auc_score(df_test["label"].values,  y_pred))
print("accuracy :" ,accuracy_score(df_test["label"].values, loaded_model.predict(test_embs)))

FINAL TESTING

ROC AUC =  0.7768794139000726

accuracy : 0.6895161290322581


**rubert-base-cased**

In [None]:
import pickle

parametrs, acc = search_hyper_p_embs_regr(StandardScaler(), LogisticRegression())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

model = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier",  LogisticRegression(C=param_1, max_iter=param_2, penalty=param_3, solver=param_4))
    ])
model.fit(train_embs, df_train["label"])

name = 'regression_transf_standard_2.pkl'
pickle.dump(model, open(name, 'wb'))

import pickle

name = 'regression_transf_standard_2.pkl'

loaded_model = pickle.load(open(name, 'rb'))
y_pred = loaded_model.predict_proba(test_embs)[:,1]
print("FINAL TESTING")
print("ROC AUC = ",roc_auc_score(df_test["label"].values,  y_pred))
print("accuracy :" ,accuracy_score(df_test["label"].values, loaded_model.predict(test_embs)))


{'classifier__C': 0.001, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'} 0.7514657464776311

FINAL TESTING

ROC AUC =  0.8137746683387235

accuracy : 0.7419354838709677

# **2**. **Combination of Sentence Transformers and SVM gives the best result**

DeepPavlov/rubert-base-cased

In [None]:
import pickle

parametrs, acc = search_hyper_p_embs_SVM(StandardScaler(), SVC())
print(parametrs, acc)
param_1 , param_2, param_3, param_4 = parametrs.values() 

model = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier",  SVC(C=param_1, degree=param_2, gamma=param_3, kernel = param_4,probability=True))
    ])
model.fit(train_embs, df_train["label"])

name = 'svm_transf_standard.pkl'
pickle.dump(model, open(name, 'wb'))


{'classifier__C': 1, 'classifier__degree': 2, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'} 0.7478529138094504


In [None]:
import pickle

name = 'svm_transf_standard.pkl'

loaded_model = pickle.load(open(name, 'rb'))
y_pred = loaded_model.predict_proba(test_embs)[:,1]
print("FINAL TESTING")
print("ROC AUC = ",roc_auc_score(df_test["label"].values,  y_pred))
print("accuracy :" ,accuracy_score(df_test["label"].values, loaded_model.predict(test_embs)))

FINAL TESTING
ROC AUC =  0.8174047917629199
accuracy : 0.7379032258064516
