In [1]:
import numpy as np 
import pandas as pd 

Для начала посмотрим на данные.

In [2]:
train = pd.read_csv('/kaggle/input/spam-dataset/train_spam.csv')
test_df = pd.read_csv('/kaggle/input/spam-dataset/test_spam.csv')

In [3]:
train.head(5)

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [4]:
test_df.head(3)

Unnamed: 0,text
0,j jim whitehead ejw cse ucsc edu writes j you ...
1,original message from bitbitch magnesium net p...
2,java for managers vince durasoft who just taug...


In [66]:
len(test_df['text'])

4070

In [5]:
train['text_type'].value_counts()

text_type
ham     11469
spam     4809
Name: count, dtype: int64

In [6]:
CLASSES = list(train['text_type'].unique())
labels = dict(zip(CLASSES, range(len(CLASSES))))
labels

{'ham': 0, 'spam': 1}

In [57]:
id2labels = {'0':'ham', '1':'spam'}

In [7]:
train['label'] = train['text_type'].apply(lambda x : labels[x])
train.head(3)

Unnamed: 0,text_type,text,label
0,ham,make sure alex knows his birthday is over in f...,0
1,ham,a resume for john lavorato thanks vince i will...,0
2,spam,plzz visit my website moviesgodml to get all m...,1


Разделим датасет с метками на собственно обучающий и валидационный:

In [8]:
train_df, val_df = np.split(train.sample(frac=1, random_state=40), 
                                     [int(.90*len(train))])

  return bound(*args, **kwds)


In [9]:
print('Train data')
print(train_df['text_type'].value_counts())
print('Validation data')
print(val_df['text_type'].value_counts())

Train data
text_type
ham     10366
spam     4284
Name: count, dtype: int64
Validation data
text_type
ham     1103
spam     525
Name: count, dtype: int64


Видим, что доля спама/неспама примерно одинаковая, что хорошо.

Приступим к тестированию моделей. Для начала просто попробуем токенизировать (векторизировать) текст, а потом классифицировать его Random Forestом.

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [11]:
train_size, val_size = train_df.shape[0], val_df.shape[0]
corpus = train_df['text'].tolist() + val_df['text'].tolist() + test_df['text'].tolist()
y_train = train_df['label'].tolist()
y_val = val_df['label'].tolist()
corpus[:3]

['incremented karma of ash by 1 total points 197',
 'i have translated the article in full see end of post i think that i ve done a far better job than the google translation at least it s readable now any corrections appreciated stuff in s is my and others additions to the debate my apologies if i ve paraphrased anybody incorrectly i will be glad to retract if anyone is miffed the article makes four main points 1635465 absence of critical clauses in this case the idea is that the licence is invalid because it doesn t specify under what country s law the gpl is governed 1635465 specification in english only that for the end user as opposed to businesses the gpl doesn t apply because it s not written in french 1635465 arbitary licence change the point here is that under french law the author can change the terms of the licence arbitarily this',
 'url url date 1635465 1635465 1635465t1635465 1635465 1635465 1635465 1635465 mark tosczak a new way to read not see maps 1635465 1635465 the m

Для начала попробуем OneHotEncoding в качестве токенизатора.

In [13]:
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(corpus)

X_train, X_val = X[:train_size], X[train_size:train_size + val_size]

print("Размерность матрицы признаков:", X.shape)

Размерность матрицы признаков: (20348, 60317)


In [14]:
%%time
rf_clf = RandomForestClassifier(random_state=40)
rf_clf.fit(X_train, y_train)

CPU times: user 55.7 s, sys: 40.9 ms, total: 55.7 s
Wall time: 55.7 s


In [15]:
models_results = {'model':[], 'roc_auc':[], 'time':[]}

In [16]:
y_pred = rf_clf.predict(X_val)
rf_score = roc_auc_score(y_val, y_pred)
models_results['model'].append('RF+OHE')
models_results['roc_auc'].append(rf_score)
print("RocAuc для RandomForest+OHE: ", rf_score)

RocAuc для RandomForest+OHE:  0.8781410007339292


Теперь пробуем TF IDF

In [17]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

X_train, X_val = X[:train_size], X[train_size:train_size + val_size]

print("Размерность матрицы признаков:", X.shape)

Размерность матрицы признаков: (20348, 60317)


In [18]:
%%time
rf_clf2 = RandomForestClassifier(random_state=40)
rf_clf2.fit(X_train, y_train)

CPU times: user 52.8 s, sys: 48.9 ms, total: 52.8 s
Wall time: 52.8 s


In [19]:
y_pred = rf_clf2.predict(X_val)
rf2_score = roc_auc_score(y_val, y_pred)
models_results['model'].append('RF+TFIDF')
models_results['roc_auc'].append(rf2_score)
print("RocAuc для RandomForest+TF_IDF: ", rf2_score)

RocAuc для RandomForest+TF_IDF:  0.877687691577084


In [20]:
models_results

{'model': ['RF+OHE', 'RF+TFIDF'],
 'roc_auc': [0.8781410007339292, 0.877687691577084],
 'time': []}

Теперь попробуем бустинг на деревьях, в качетве токенизатора возьмем TF IDF.

In [21]:
from catboost import CatBoostClassifier


boost = CatBoostClassifier(iterations=1000,
                           task_type="GPU",
                           devices='0')
boost.fit(X_train,
          y_train,
          verbose=False)

<catboost.core.CatBoostClassifier at 0x7f77c78a7b80>

In [22]:
y_pred = boost.predict(X_val)
boost_score = roc_auc_score(y_val, y_pred)
models_results['model'].append('CATBOOST+TFIDF')
models_results['roc_auc'].append(boost_score)
print("RocAuc для CatBoost+TF_IDF: ", boost_score)

RocAuc для CatBoost+TF_IDF:  0.8784155765660753


In [23]:
boost.save_model('catboost+tfidf')

Попробуем затюнить предобученный Bert

In [45]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_cosine_schedule_with_warmup, AdamW
from tqdm import tqdm
import os
import sys
from collections import Counter


In [29]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, phase='train'):
        self.phase = phase
        
        if self.phase == 'train':
            self.labels = [labels[label] for label in df['text_type']]
        elif self.phase == 'test':
            self.oid = [oid for oid in range(df['text'].tolist())]
            
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        if self.phase == 'train':
            return len(self.labels)
        elif self.phase == 'test':
            return len(self.oid)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
    
    def get_batch_oid(self, idx):
        return np.array(self.oid[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        if self.phase == 'train':
            batch_texts = self.get_batch_texts(idx)
            batch_y = self.get_batch_labels(idx)
            return batch_texts, batch_y
        elif self.phase == 'test':
            batch_texts = self.get_batch_texts(idx)
            batch_oid = self.get_batch_oid(idx)
            return batch_texts, batch_oid
   

In [38]:
class BertClassifier:
    def __init__(self, model_path, tokenizer_path, data, n_classes=13, epochs=5):
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.data = data
        self.device = torch.device('cuda')
        self.max_len = 512
        self.epochs = epochs
        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes).to(self.device)
        self.model = self.model.to(self.device)

    
    def preparation(self):
        self.df_train, self.df_val = np.split(self.data.sample(frac=1, random_state=40), 
                                     [int(.90*len(self.data))])
        
        self.train, self.val = CustomDataset(self.df_train, self.tokenizer, phase='train'), CustomDataset(self.df_val, self.tokenizer, phase='train')
        self.train_dataloader = torch.utils.data.DataLoader(self.train, batch_size=4, shuffle=True)
        self.val_dataloader = torch.utils.data.DataLoader(self.val, batch_size=4)
    
       
        self.optimizer = AdamW(self.model.parameters(), lr=2e-4, correct_bias=False)
        self.scheduler = get_cosine_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_dataloader) * self.epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)
            
    def fit(self):
        self.model = self.model.train()
        
        for epoch_num in range(self.epochs):
            total_acc_train = 0
            total_loss_train = 0
            for train_input, train_label in tqdm(self.train_dataloader):
                train_label = train_label.to(self.device)
                mask = train_input['attention_mask'].to(self.device)
                input_id = train_input['input_ids'].squeeze(1).to(self.device)
                output = self.model(input_id.to(self.device), mask.to(self.device))

                batch_loss = self.loss_fn(output[0], train_label.long())
                total_loss_train += batch_loss.item()

                acc = (output[0].argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                self.model.zero_grad()
                batch_loss.backward()
                self.optimizer.step()
                self.scheduler.step()
            total_acc_val, total_loss_val, roc_auc = self.eval()
           
            print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(self.df_train): .3f} \
            | Train Accuracy: {total_acc_train / len(self.df_train): .3f} \
            | Val Loss: {total_loss_val / len(self.df_val): .3f} \
            | Val Accuracy: {total_acc_val / len(self.df_val): .3f} \
            | Val roc_auc: {roc_auc: .3f}')

            if epoch_num % 2 == 0:
                os.makedirs('checkpoint', exist_ok=True)
                torch.save(self.model, f'checkpoint/BertAnswerClassifierLarge{epoch_num}.pt')
                print("SAVED")

        return total_acc_train, total_loss_train, roc_auc
    
    def eval(self):
        self.model = self.model.eval()
        total_acc_val = 0
        total_loss_val = 0
        y_true = []
        y_pred = []

        with torch.no_grad():
            for val_input, val_label in tqdm(self.val_dataloader):
                val_label = val_label.to(self.device)
                mask = val_input['attention_mask'].to(self.device)
                input_id = val_input['input_ids'].squeeze(1).to(self.device)

                output = self.model(input_id.to(self.device), mask.to(self.device))

                batch_loss = self.loss_fn(output[0], val_label.long())
                total_loss_val += batch_loss.item()

                acc = (output[0].argmax(dim=1) == val_label).sum().item()
                pred_label = output[0].argmax(dim=1)
                y_pred.extend(pred_label)
                y_true.extend(val_label)
                total_acc_val += acc
        y_true_tensor = torch.tensor(y_true)
        y_pred_tensor = torch.tensor(y_pred)
        roc_auc = roc_auc_score(y_true_tensor, y_pred_tensor)
        
        return total_acc_val, total_loss_val, roc_auc
    

In [42]:
model_path = 'prajjwal1/bert-tiny'
tokenizer_path = 'prajjwal1/bert-tiny'
bert = BertClassifier(model_path, tokenizer_path, train,n_classes = 2, epochs=10)

config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [43]:
%%time
bert.preparation()

CPU times: user 45.5 s, sys: 34.9 ms, total: 45.6 s
Wall time: 45.5 s


In [47]:
_, x, bert_score = bert.fit()

100%|██████████| 3663/3663 [00:42<00:00, 86.93it/s]
100%|██████████| 407/407 [00:01<00:00, 322.45it/s]


Epochs: 1 | Train Loss:  0.007             | Train Accuracy:  0.992             | Val Loss:  0.039             | Val Accuracy:  0.956             | Val roc_auc:  0.954
SAVED


100%|██████████| 3663/3663 [00:40<00:00, 90.09it/s]
100%|██████████| 407/407 [00:01<00:00, 324.46it/s]


Epochs: 2 | Train Loss:  0.003             | Train Accuracy:  0.997             | Val Loss:  0.048             | Val Accuracy:  0.951             | Val roc_auc:  0.942


100%|██████████| 3663/3663 [00:40<00:00, 91.07it/s]
100%|██████████| 407/407 [00:01<00:00, 323.11it/s]


Epochs: 3 | Train Loss:  0.002             | Train Accuracy:  0.998             | Val Loss:  0.047             | Val Accuracy:  0.957             | Val roc_auc:  0.950
SAVED


100%|██████████| 3663/3663 [00:40<00:00, 90.96it/s]
100%|██████████| 407/407 [00:01<00:00, 325.83it/s]


Epochs: 4 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.054             | Val Accuracy:  0.956             | Val roc_auc:  0.952


100%|██████████| 3663/3663 [00:40<00:00, 90.11it/s]
100%|██████████| 407/407 [00:01<00:00, 318.28it/s]


Epochs: 5 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.060             | Val Accuracy:  0.957             | Val roc_auc:  0.953
SAVED


100%|██████████| 3663/3663 [00:40<00:00, 89.91it/s]
100%|██████████| 407/407 [00:01<00:00, 312.74it/s]


Epochs: 6 | Train Loss:  0.001             | Train Accuracy:  1.000             | Val Loss:  0.059             | Val Accuracy:  0.955             | Val roc_auc:  0.950


100%|██████████| 3663/3663 [00:40<00:00, 90.70it/s]
100%|██████████| 407/407 [00:01<00:00, 330.40it/s]


Epochs: 7 | Train Loss:  0.001             | Train Accuracy:  1.000             | Val Loss:  0.059             | Val Accuracy:  0.956             | Val roc_auc:  0.951
SAVED


100%|██████████| 3663/3663 [00:40<00:00, 90.86it/s]
100%|██████████| 407/407 [00:01<00:00, 329.74it/s]


Epochs: 8 | Train Loss:  0.001             | Train Accuracy:  1.000             | Val Loss:  0.062             | Val Accuracy:  0.955             | Val roc_auc:  0.945


100%|██████████| 3663/3663 [00:40<00:00, 91.13it/s]
100%|██████████| 407/407 [00:01<00:00, 324.49it/s]


Epochs: 9 | Train Loss:  0.001             | Train Accuracy:  1.000             | Val Loss:  0.073             | Val Accuracy:  0.950             | Val roc_auc:  0.948
SAVED


100%|██████████| 3663/3663 [00:40<00:00, 91.01it/s]
100%|██████████| 407/407 [00:01<00:00, 324.35it/s]

Epochs: 10 | Train Loss:  0.001             | Train Accuracy:  0.999             | Val Loss:  0.068             | Val Accuracy:  0.955             | Val roc_auc:  0.943





In [48]:
models_results['model'].append('Bert_Tiny')
models_results['roc_auc'].append(bert_score)

In [49]:
models_results

{'model': ['RF+OHE', 'RF+TFIDF', 'CATBOOST+TFIDF', 'Bert_Tiny'],
 'roc_auc': [0.8781410007339292,
  0.877687691577084,
  0.8784155765660753,
  0.94295298536459],
 'time': []}

In [59]:
del models_results['time']

In [60]:
models_df = pd.DataFrame.from_dict(models_results)
models_df

Unnamed: 0,model,roc_auc
0,RF+OHE,0.878141
1,RF+TFIDF,0.877688
2,CATBOOST+TFIDF,0.878416
3,Bert_Tiny,0.942953


   Как видно из таблички сверху, скор лучший у Bertа. С учетом того, что Bert предобучен на бОльшем количестве данных и что при его использовании не нужно каждый раз переделывать словарь слов, можно предположить, что для данной задачи это лучшая модель из всех использованных в данном ноутбуке.
   Поэтому будем использовать Bert для предсказания результатов на тестовой выборке.

In [51]:
model = torch.load(f'/kaggle/working/checkpoint/BertAnswerClassifierLarge8.pt')
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

In [63]:
from torch.nn.functional import softmax

control_dict = {"text": [], "score": []}
for text in test_df["text"]:
    encoding = tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt").to('cuda')
    model.eval()
    with torch.no_grad():
        logits = model(**encoding).logits

    distr = softmax(logits)
    #control_dict["text"].append(text)
    control_dict["text"].append(text)
    control_dict["score"].append(id2labels[str(distr.argmax(dim=1).item())])

  distr = softmax(logits)


In [64]:
final_df = pd.DataFrame.from_dict(control_dict)
final_df.to_csv('submit.csv', index=False)