In [1]:
pip install sentence_transformers


Installing collected packages: tokenizers, sentencepiece, huggingface-hub, transformers, sentence_transformers
Successfully installed huggingface-hub-0.12.1 sentence_transformers-2.2.2 sentencepiece-0.1.97 tokenizers-0.13.2 transformers-4.26.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import calendar

import pandas as pd
import numpy as np 
import gensim
import nltk

from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

%matplotlib inline

In [4]:
df_train  = pd.read_csv('/content/drive/MyDrive/dataset_labeled/fake_detection_df_train.csv')
df_test  = pd.read_csv("/content/drive/MyDrive/dataset_labeled/fake_detection_df_test.csv")
df_val  = pd.read_csv("/content/drive/MyDrive/dataset_labeled/fake_detection_df_val.csv")

In [5]:
from nltk.corpus import stopwords
import re

nltk.download('stopwords')
stop_words = stopwords.words('russian')

def preprocess(text, join_back=True):
    text =  re.sub(r'\n', '', text)
    text = re.sub(r'[^\w\s]','', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = text.lower()

    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words:
            result.append(token)
    if join_back:
        result = " ".join(result)
    return result


df_train["text_clean"] = df_train["text"].apply(preprocess)
df_test["text_clean"] = df_test["text"].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
import torch
# torch.cuda.is_available()
     
from sentence_transformers import SentenceTransformer

sent_tr = SentenceTransformer('distiluse-base-multilingual-cased-v1',device="cuda")

train_embs = sent_tr.encode(df_train["text_clean"].to_list())
test_embs = sent_tr.encode(df_test["text_clean"].to_list())

Downloading (…)5f450/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading (…)966465f450/README.md:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading (…)6465f450/config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5f450/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading (…)966465f450/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)465f450/modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [7]:
df_not_labeled = pd.read_csv("/content/drive/MyDrive/dataset_labeled/text_table_processed.csv")

# df_not_labeled["Text_processed"].isnull().sum()

df_not_labeled = df_not_labeled[df_not_labeled['Text_processed'].notna()]
# df_not_labeled.shape

df_not_labeled = df_not_labeled[df_not_labeled["lang"] == 'ru']
df_not_labeled = df_not_labeled.iloc[:df_not_labeled.shape[0]//2]

df_not_labeled['Text_processed_clean'] = df_not_labeled['Text_processed'].apply(preprocess)
df_not_labeled.to_csv("Not_labeled_Cleaned_ds.csv")


In [8]:
df_not_labeled = pd.read_csv("/content/drive/MyDrive/dataset_labeled/Not_labeled_Cleaned_ds.csv")

In [9]:
df_not_labeled['Text_processed_clean'].shape[0]

517769

In [10]:
# downloading embs, unlabeled data
for_labeling_embs = np.load(r'/content/drive/MyDrive/dataset_labeled/embeddings.npy')


In [11]:
rounds = 3
for rnd in range(rounds):
  if rnd == 0:
    model = Pipeline([
                  ("scaler", StandardScaler()),
                  ("classifier", LogisticRegression(max_iter=1000, C=0.001, penalty='l2', solver='lbfgs'))
              ])

    model.fit(train_embs, df_train["label"])

  else:
    tr = np.concatenate((train_embs, for_labeling_embs[predictions[0]]), axis=0) 
    model = Pipeline([
                  ("scaler", StandardScaler()),
                  ("classifier", LogisticRegression(max_iter=1000, C=0.001, penalty='l2', solver='lbfgs'))
              ])

    model.fit(tr, dataset["label"])

  predictions = np.where((model.predict_proba(for_labeling_embs) > 0.7) == True)
  new_df = df_not_labeled.iloc[predictions[0]]
  new_df["label"] = predictions[1].astype(bool)
  new_df = new_df[["Text_processed", "label", "Text_processed_clean"]]
  new_df.rename(columns={"Text_processed": 'text', 'Text_processed_clean':'text_clean'}, inplace=True)


  dataset = pd.concat([df_train.iloc[:, 1:], new_df])

  print(f"--- round: {rnd} ---")
  print("FINAL TESTING ")
  y_pred_test = model.predict_proba(test_embs)[:,1]
  print("ROC AUC = ",roc_auc_score(df_test["label"].values,  y_pred_test))
  print("accuracy :" ,accuracy_score(df_test["label"].values, model.predict(test_embs)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["label"] = predictions[1].astype(bool)


--- round: 0 ---
FINAL TESTING 
ROC AUC =  0.7733152927199525
accuracy : 0.6895161290322581


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["label"] = predictions[1].astype(bool)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


--- round: 1 ---
FINAL TESTING 
ROC AUC =  0.7407761863903372
accuracy : 0.6411290322580645
--- round: 2 ---
FINAL TESTING 
ROC AUC =  0.7181374166721668
accuracy : 0.6532258064516129


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["label"] = predictions[1].astype(bool)
