In [None]:
import pandas as pd
import os
import json
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pickle
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import torch
from torch import nn
import transformers
from statistics import mode
import warnings
from sklearn.linear_model import LogisticRegression

In [2]:
class CFG:
    """Notebook-wide settings shared by tokenization, training and artifact naming."""

    # Column of the articles frame that holds the class label.
    target = 'fact'
    # Label string -> integer class id.
    encoder = dict(low=0, mixed=1, high=2)

    # Kind of input text; used as the NewsModel default and in artifact names.
    input_text = 'article'
    # Column whose text gets tokenized (alternative: 'wiki_text_lemmatization').
    text_col = 'article_text_lemmatization'
    # Token length every sequence is padded / truncated to.
    max_len = 512

    # Value passed to TrainingArguments(logging_steps=...).
    logging_steps = 100

In [3]:

# Load the pickled articles frame.
# NOTE(review): this path starts with '/input/...' while the split file below is
# read from '/kaggle/input/...' — confirm which root is correct in the target environment.
df = pd.read_pickle('/input/news-small-dataset/article.pkl')
# df = pd.read_pickle('/input/news-small-dataset/media_description.pkl')
# Derive a `website` key from `source_url`.
# NOTE(review): `Series.str.strip(chars)` removes any of the given *characters* from
# both ends, it does not remove a literal prefix, so names that begin or end with one
# of "htps:/w." get extra characters trimmed. The resulting keys are later matched
# against `split['test']` and external CSV files; presumably those were produced with
# the same transformation — verify before changing this line.
df['website'] = df['source_url'].str.strip('https://').str.strip('www.')

In [4]:
%%time
from multiprocessing import Pool, cpu_count
# Set TOKENIZERS_PARALLELISM=false in this process' environment before any tokenization runs.
# NOTE(review): presumably meant to keep the tokenizers library from running its own
# parallelism inside the multiprocessing workers started below — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = 'false'
# Tokenize one article row and encode its label
def process_row(row):
    """Turn one row of the articles frame into model-ready fields.

    Reads the text from ``CFG.text_col`` and the label from ``CFG.target``,
    maps the label through ``CFG.encoder`` and tokenizes the text with the
    module-level ``tokenizer`` (padded / truncated to ``CFG.max_len``).

    Returns a dict with keys ``text``, ``input_ids``, ``attention_mask``
    and ``label``; the token fields are plain Python lists.
    """
    raw_text = row[CFG.text_col]
    to_class_id = np.vectorize(lambda item: CFG.encoder[item])
    class_id = to_class_id(row[CFG.target])

    tokens = tokenizer.encode_plus(
        raw_text,
        add_special_tokens=True,
        max_length=CFG.max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    # Tensors come back with a leading batch axis; flatten to 1-D lists so the
    # values can be stored in DataFrame cells.
    return {
        'text': raw_text,
        'input_ids': tokens['input_ids'].flatten().tolist(),
        'attention_mask': tokens['attention_mask'].flatten().tolist(),
        'label': class_id.tolist(),
    }

def process_chunk(df_chunk):
    """Apply ``process_row`` to every row of one DataFrame chunk (worker entry point)."""
    per_row_results = df_chunk.apply(process_row, axis=1)
    return per_row_results

# Apply `func` to row-chunks of a DataFrame in parallel worker processes
def parallelize_dataframe(df, func, n_cores=cpu_count()):
    """Split ``df`` into row chunks, run ``func`` on each in a process pool, and concatenate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is split by row position into contiguous chunks.
    func : callable
        Picklable function taking one chunk and returning a pandas object
        that ``pd.concat`` can stack.
    n_cores : int
        Upper bound on the number of chunks / worker processes.

    Returns
    -------
    The concatenation of ``func(chunk)`` over all chunks, in row order.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to distribute; also avoids pd.concat on an empty list.
        return func(df)

    # Never create more chunks than rows, so no worker receives an empty frame.
    n_chunks = max(1, min(int(n_cores), n_rows))
    # Split row *positions* (same chunk sizes np.array_split would give the frame)
    # and slice with iloc, instead of calling np.array_split on the DataFrame itself.
    position_groups = np.array_split(np.arange(n_rows), n_chunks)
    chunks = [df.iloc[pos[0]:pos[-1] + 1] for pos in position_groups]

    # The context manager guarantees the workers are torn down even if `func` raises.
    with Pool(n_chunks) as pool:
        results = list(tqdm(pool.imap(func, chunks), total=len(chunks)))
        pool.close()
        pool.join()
    return pd.concat(results)


CPU times: user 58 µs, sys: 0 ns, total: 58 µs
Wall time: 62.5 µs


In [5]:
# Load the predefined data split; `split['test']` is used further down to select
# the held-out websites. The encoding is given explicitly so reading the JSON
# does not depend on the platform's locale default.
with open('/kaggle/input/news-small-dataset/2/Архив 2/fact_distribution.json', encoding='utf-8') as f:
    split = json.load(f)

In [6]:
class ArticlesDataset(Dataset):
    """Torch dataset over a DataFrame of pre-tokenized articles.

    Each item is a dict with ``text`` (returned as stored), ``input_ids``,
    ``attention_mask`` and ``label`` (all three as tensors).

    Columns are resolved once, at construction time:
    * if the frame has uniquely named columns ``text``, ``input_ids``,
      ``attention_mask`` and ``label``, they are looked up by name, so the
      column order of the frame does not matter;
    * otherwise the last four columns are used positionally, in that order,
      which is the layout this class originally required.
    """

    _FIELDS = ('text', 'input_ids', 'attention_mask', 'label')

    def __init__(self, df: pd.DataFrame):
        self.df = df
        positions = None
        if all(name in df.columns for name in self._FIELDS):
            located = [df.columns.get_loc(name) for name in self._FIELDS]
            # get_loc returns a slice / mask for duplicated names; only trust plain ints.
            if all(isinstance(pos, int) for pos in located):
                positions = located
        if positions is None:
            positions = [-4, -3, -2, -1]
        self._positions = dict(zip(self._FIELDS, positions))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        values = {name: self.df.iloc[idx, pos] for name, pos in self._positions.items()}
        return {
            'text': values['text'],
            'input_ids': torch.tensor(values['input_ids']),
            'attention_mask': torch.tensor(values['attention_mask']),
            # Class indices must be int64 to be usable as cross-entropy targets.
            'label': torch.tensor(values['label'], dtype=torch.long),
        }

In [7]:
# Create the output folder for the saved probability frames; unlike a bare
# `mkdir`, this does not fail when the cell is re-run and the folder already exists.
os.makedirs('probability', exist_ok=True)

In [8]:
class NewsModel:
    """Fine-tune a transformer classifier and stack a logistic regression on its outputs.

    Call order used in this notebook:
    ``create_dataset`` -> ``train_model`` -> ``log_reg`` -> ``save_models``.
    ``restore_models`` reloads what ``save_models`` wrote.

    Attributes set along the way: ``model``, ``tokenizer``, ``trainer``,
    ``model_name`` (model id without its hub organisation prefix), ``clf``
    (the fitted logistic regression), ``train_dataset`` / ``test_dataset``
    and the frames ``X_train`` / ``X_test``.
    """

    def __init__(self, model=None, tokenizer=None, target='fact',
                 input_text=CFG.input_text) -> None:
        """Store the configuration; ``target`` is the label column, ``input_text`` the text kind."""
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model = model
        self.tokenizer = tokenizer
        self.target = target
        self.input_text = input_text

    @staticmethod
    def _short_model_name(model_name):
        """Return the model id without a leading ``organisation/`` part.

        ``str.strip('microsoft/')`` was used for this before, but ``strip`` removes
        a *set of characters* from both ends (it turned 'roberta-base' into
        'berta-base'); taking the last path component is what was intended.
        """
        return model_name.rstrip('/').rsplit('/', 1)[-1]

    def _artifact_stem(self, kind):
        """Build the shared file-name stem ``<model>_<kind>_<target>_<input_text>``."""
        return f'{self.model_name}_{kind}_{self.target}_{self.input_text}'

    def create_dataset(self, X_train, X_test):
        """Wrap both frames in ``ArticlesDataset`` and keep copies for later feature building."""
        self.train_dataset = ArticlesDataset(X_train)
        self.test_dataset = ArticlesDataset(X_test)
        self.X_train = X_train.copy()
        self.X_test = X_test.copy()

    def train_model(self, tokenizer=None, model_name='bert-base-uncased',
                    batch_size=100, epochs=4, learning_rate=3e-5):
        """Load ``model_name`` with a fresh classification head and fine-tune it.

        ``tokenizer`` is only stored when the instance does not have one yet.
        Training runs on ``self.train_dataset`` (set by ``create_dataset``);
        no evaluation is performed during training.
        """
        if self.tokenizer is None:
            self.tokenizer = tokenizer
        self.model = transformers.AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=len(CFG.encoder)).to(self.device)
        self.model_name = self._short_model_name(model_name)

        training_args = transformers.TrainingArguments(
            output_dir='./results',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            learning_rate=learning_rate,
            logging_dir='./logs',
            logging_steps=CFG.logging_steps,
            save_steps=15_000,
            # Mixed precision needs a GPU; keep the CPU fallback from __init__ usable.
            fp16=self.device.type == 'cuda',
        )
        self.trainer = transformers.Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
        )
        self.trainer.train()

    def save_models(self):
        """Write model, tokenizer and the pickled logistic regression under ``./models``.

        Requires ``log_reg`` to have been run (it sets ``self.clf``).
        """
        model_dir = f'./models/{self._artifact_stem("model")}'
        self.model.save_pretrained(model_dir)
        self.tokenizer.save_pretrained(model_dir)
        with open(f'./models/{self._artifact_stem("log_reg_model")}', 'wb') as f:
            pickle.dump(self.clf, f)

    def restore_models(self, tokenizer, model_name, root):
        """Reload the artifacts written by ``save_models`` from ``root/models``.

        ``tokenizer`` must provide ``from_pretrained``; ``root`` may be a ``str``
        or a ``Path``. Artifacts saved before the model-name fix (character-stripped
        name, ``CFG.target`` in the file name) are still found as a fallback.
        """
        root = Path(root)
        self.model_name = self._short_model_name(model_name)
        model_stem = self._artifact_stem('model')
        clf_stem = self._artifact_stem('log_reg_model')
        if not (root / 'models' / model_stem).exists():
            legacy_name = model_name.strip('microsoft/')
            legacy_model_stem = f'{legacy_name}_model_{CFG.target}_{self.input_text}'
            if (root / 'models' / legacy_model_stem).exists():
                self.model_name = legacy_name
                model_stem = legacy_model_stem
                clf_stem = f'{legacy_name}_log_reg_model_{CFG.target}_{self.input_text}'

        model_dir = root / 'models' / model_stem
        self.model = transformers.AutoModelForSequenceClassification.from_pretrained(
            model_dir, num_labels=len(CFG.encoder))
        self.tokenizer = tokenizer.from_pretrained(model_dir)
        # SECURITY: pickle.load executes arbitrary code from the file; only restore
        # artifacts that this notebook (or another trusted source) produced.
        with open(root / 'models' / clf_stem, 'rb') as f:
            self.clf = pickle.load(f)

        self.trainer = transformers.Trainer(
            model=self.model)

    def softmax(self, x):
        """Softmax over the last axis of a 1-D or 2-D input, returned as a NumPy array.

        Raises ``ValueError`` for any other rank (this used to return ``None`` silently).
        """
        x = torch.tensor(x)
        if x.dim() not in (1, 2):
            raise ValueError(f'softmax expects a 1-D or 2-D input, got {x.dim()} dimensions')
        return torch.softmax(x, dim=-1).numpy()

    def soft_voting_for_article(self, df: pd.DataFrame):
        """Collapse per-article class scores into one row per ``source_url``.

        For every source the class columns are summed over its articles and passed
        through ``softmax``; ``website`` and the target are taken from the first
        article of the group. Groups keep their order of first appearance; rows
        with a missing ``source_url`` are not part of any group.

        NOTE(review): the class columns already hold softmax probabilities when
        called from ``create_dataset_for_log_reg``, so this is a softmax of summed
        probabilities rather than an average — kept as is because the downstream
        classifier and the pre-computed CSV features may depend on it; confirm.
        """
        classes = list(CFG.encoder.keys())
        dst = {'target': [], 'website': []} | {k: [] for k in classes}
        # A single groupby pass replaces one boolean mask over the frame per source.
        for _, articles in df.groupby('source_url', sort=False):
            dst['website'].append(articles['website'].iloc[0])
            dst['target'].append(articles[self.target].iloc[0])
            summed = articles[classes].sum(axis=0)
            for class_, probability in zip(classes, self.softmax(summed)):
                dst[class_].append(probability)
        return pd.DataFrame(dst)

    def create_dataset_for_log_reg(self, dataset_to_predict='train'):
        """Predict class probabilities and return the feature frame for the logistic regression.

        ``dataset_to_predict='train'`` uses the training data; any other value
        uses the test data. The result has the columns ``target``, ``website``
        and one probability column per class; for ``input_text == 'article'``
        the rows are aggregated per source by ``soft_voting_for_article``.
        """
        self.model.eval()
        if dataset_to_predict == 'train':
            dataset, df = self.train_dataset, self.X_train.copy()
        else:
            dataset, df = self.test_dataset, self.X_test.copy()
        predictions = self.trainer.predict(dataset)

        classes = list(CFG.encoder.keys())
        df[classes] = self.softmax(predictions.predictions)
        if self.input_text == 'article':
            return self.soft_voting_for_article(df)

        df['target'] = df[self.target].copy()
        return df[['target', 'website'] + classes]

    def _append_precomputed(self, features, split_name):
        """Append rows of the pre-computed ``<split_name>.csv`` for websites not in ``features``.

        NOTE(review): the CSV is assumed to carry the same columns as ``features``
        (``target``, ``website`` and the class columns) — verify against the file.
        """
        extra = pd.read_csv(
            f'/kaggle/input/news-small-dataset/{self.target}/{self.target}/{split_name}.csv')
        extra = extra[~extra['website'].isin(features['website'])]
        return pd.concat([features, extra]).reset_index(drop=True)

    def prepare_dataset_for_log_reg(self):
        """Build train / test feature frames and top them up from the pre-computed CSVs."""
        X_train = self.create_dataset_for_log_reg(dataset_to_predict='train')
        X_test = self.create_dataset_for_log_reg(dataset_to_predict='test')
        X_train = self._append_precomputed(X_train, 'train')
        X_test = self._append_precomputed(X_test, 'test')
        return X_train, X_test

    def log_reg(self):
        """Fit the logistic regression on the class probabilities and report test metrics.

        Stores the fitted classifier in ``self.clf``, prints accuracy, macro-F1 and
        macro recall, and saves both feature frames via ``save_prob``.
        """
        # Use this instance, not whatever object the global name `pipeline` points at.
        X_train, X_test = self.prepare_dataset_for_log_reg()
        classes = list(CFG.encoder.keys())
        clf = LogisticRegression()
        clf.fit(X_train[classes], X_train['target'])
        self.clf = clf
        y_pred = clf.predict(X_test[classes])
        y_test = X_test['target']
        accuracy = accuracy_score(y_test, y_pred)
        macro_f1 = f1_score(y_test, y_pred, average='macro')
        avg_recall = recall_score(y_test, y_pred, average='macro')
        print("Accuracy:", accuracy)
        print("Macro-F1 Score:", macro_f1)
        print("Average Recall:", avg_recall)
        self.save_prob(X_train, X_test)

    def save_prob(self, X_train, X_test):
        """Pickle both feature frames into ``./probability`` (created if missing)."""
        Path('./probability').mkdir(parents=True, exist_ok=True)
        train_prob_path = f'./probability/{self._artifact_stem("train_prob")}.pkl'
        test_prob_path = f'./probability/{self._artifact_stem("test_prob")}.pkl'
        X_train.to_pickle(train_prob_path)
        X_test.to_pickle(test_prob_path)

### BERT

In [9]:
# Tokenize every article with the BERT tokenizer (in worker processes).
tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')
df_copy = df.copy()
df_processed = parallelize_dataframe(df_copy, process_chunk)

# Unpack the per-row result dicts into one column per field.
df['text'] = df_processed.map(lambda record: record['text'])
df['input_ids'] = df_processed.map(lambda record: record['input_ids'])
df['attention_mask'] = df_processed.map(lambda record: record['attention_mask'])
df['label'] = df_processed.map(lambda record: record['label'])

# Articles from websites listed in split['test'] form the test set; the rest is training data.
in_test_split = df['website'].isin(split['test'])
X_train = df[~in_test_split]
X_test = df[in_test_split]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

100%|██████████| 4/4 [00:56<00:00, 14.08s/it]


In [10]:
# Fine-tune BERT, fit the logistic regression on its outputs and save everything.
pipeline = NewsModel(target='fact', input_text=CFG.input_text, tokenizer=tokenizer)
pipeline.create_dataset(X_train, X_test)
# Pass the tokenizer instance, as the other model sections do; the tokenizer
# class `transformers.BertTokenizerFast` was passed here before.
pipeline.train_model(tokenizer, model_name='bert-base-uncased', batch_size=20, epochs=5, learning_rate=3e-5)
pipeline.log_reg()
pipeline.save_models()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mignat-gotin[0m ([33mfivel[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240603_160116-f147qcqe[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdifferent-lion-110[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/fivel/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://w

Step,Training Loss
100,0.8613
200,0.7534
300,0.6448
400,0.5562
500,0.4513
600,0.3175
700,0.2622
800,0.2019
900,0.1568
1000,0.1552


Accuracy: 0.8197674418604651
Macro-F1 Score: 0.7618083783899641
Average Recall: 0.7186125691587458


### RoBERTa

In [12]:
# Re-tokenize every article with the RoBERTa tokenizer (in worker processes).
model_name = 'roberta-base'
tokenizer = transformers.RobertaTokenizerFast.from_pretrained(model_name)
df_copy = df.copy()
df_processed = parallelize_dataframe(df_copy, process_chunk)

# Overwrite the model-input columns with the new tokenization.
df['text'] = df_processed.map(lambda record: record['text'])
df['input_ids'] = df_processed.map(lambda record: record['input_ids'])
df['attention_mask'] = df_processed.map(lambda record: record['attention_mask'])
df['label'] = df_processed.map(lambda record: record['label'])

# Same website-based split as before: split['test'] lists the held-out websites.
in_test_split = df['website'].isin(split['test'])
X_train = df[~in_test_split]
X_test = df[in_test_split]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

100%|██████████| 4/4 [00:52<00:00, 13.03s/it]


In [13]:
# Fine-tune RoBERTa, fit the logistic regression on its outputs and save everything.
pipeline = NewsModel(tokenizer=tokenizer, input_text=CFG.input_text, target='fact')
pipeline.create_dataset(X_train, X_test)
pipeline.train_model(
    tokenizer,
    model_name=model_name,
    learning_rate=4e-5,
    epochs=5,
    batch_size=16,
)
pipeline.log_reg()
pipeline.save_models()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.8825
200,0.8307
300,0.8274
400,0.768
500,0.7441
600,0.7162
700,0.6545
800,0.6166
900,0.594
1000,0.519


Accuracy: 0.8023255813953488
Macro-F1 Score: 0.6988529883266725
Average Recall: 0.6685108998912375


### DistilBERT

In [14]:
# Re-tokenize every article with the DistilBERT tokenizer (in worker processes).
model_name = 'distilbert-base-uncased'
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(model_name)
df_copy = df.copy()
df_processed = parallelize_dataframe(df_copy, process_chunk)

# Overwrite the model-input columns with the new tokenization.
df['text'] = df_processed.map(lambda record: record['text'])
df['input_ids'] = df_processed.map(lambda record: record['input_ids'])
df['attention_mask'] = df_processed.map(lambda record: record['attention_mask'])
df['label'] = df_processed.map(lambda record: record['label'])

# Same website-based split as before: split['test'] lists the held-out websites.
in_test_split = df['website'].isin(split['test'])
X_train = df[~in_test_split]
X_test = df[in_test_split]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

100%|██████████| 4/4 [00:56<00:00, 14.12s/it]


In [15]:
# Fine-tune DistilBERT, fit the logistic regression on its outputs and save everything.
pipeline = NewsModel(tokenizer=tokenizer, input_text=CFG.input_text, target='fact')
pipeline.create_dataset(X_train, X_test)
pipeline.train_model(
    tokenizer,
    model_name=model_name,
    learning_rate=3e-5,
    epochs=7,
    batch_size=25,
)
pipeline.log_reg()
pipeline.save_models()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.8405
200,0.7261
300,0.5563
400,0.4512
500,0.3032
600,0.2701
700,0.1762
800,0.1451
900,0.0941
1000,0.0816


Accuracy: 0.813953488372093
Macro-F1 Score: 0.7454633544749824
Average Recall: 0.7148295266468058


### DeBERTa-v3

In [16]:
# Re-tokenize every article with the DeBERTa-v3 tokenizer (in worker processes).
model_name = 'microsoft/deberta-v3-base'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
df_copy = df.copy()
df_processed = parallelize_dataframe(df_copy, process_chunk)

# Overwrite the model-input columns with the new tokenization.
df['text'] = df_processed.map(lambda record: record['text'])
df['input_ids'] = df_processed.map(lambda record: record['input_ids'])
df['attention_mask'] = df_processed.map(lambda record: record['attention_mask'])
df['label'] = df_processed.map(lambda record: record['label'])

# Same website-based split as before: split['test'] lists the held-out websites.
in_test_split = df['website'].isin(split['test'])
X_train = df[~in_test_split]
X_test = df[in_test_split]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

100%|██████████| 4/4 [00:52<00:00, 13.05s/it]


In [17]:
# Fine-tune DeBERTa-v3, fit the logistic regression on its outputs and save everything.
pipeline = NewsModel(tokenizer=tokenizer, input_text=CFG.input_text, target='fact')
pipeline.create_dataset(X_train, X_test)
pipeline.train_model(
    tokenizer,
    model_name=model_name,
    learning_rate=4e-5,
    epochs=4,
    batch_size=8,
)
pipeline.log_reg()
pipeline.save_models()

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.8768
200,0.8276
300,0.7844
400,0.7773
500,0.7843
600,0.8011
700,0.7582
800,0.7219
900,0.7432
1000,0.7054


Accuracy: 0.7790697674418605
Macro-F1 Score: 0.648693670499838
Average Recall: 0.6296519600889016
