In [85]:
import warnings
warnings.filterwarnings('ignore')


In [86]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import statsmodels.api as sm 
from tqdm import tqdm
import math
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel
import torch

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using {device} device")

Using cuda device


In [3]:
train_data = pd.read_csv('data/train_bert_data.csv', index_col=0)
test_data = pd.read_csv('data/test_bert_data.csv', index_col=0)

In [4]:
train_data['all_text'] = train_data['title'] + train_data['text']
test_data['all_text'] = test_data['title'] + test_data['text']

In [5]:
model_name = "bert-base-uncased"  
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [6]:
model=model.to(device)

In [7]:
def get_embeddings(X, batch_size=128, max_length=512):
    all_pred_list = []
    n_iters = math.ceil(len(X) / batch_size)
    for i in tqdm(range(n_iters)):
        text_batch = X.iloc[i * batch_size : (i + 1) * batch_size].astype(str).to_list()
        tok = tokenizer(
            text_batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length
        )
        with torch.no_grad():
            outputs = model(
                input_ids=tok['input_ids'].to(device),
                token_type_ids=tok.get('token_type_ids', None).to(device) if 'token_type_ids' in tok else None,
                attention_mask=tok['attention_mask'].to(device)
            )
        all_pred_list.extend(outputs.last_hidden_state[:, 0, :].cpu().numpy().tolist())
    return all_pred_list


In [8]:
X_test_bert = np.array(get_embeddings(test_data['all_text']))

100%|███████████████████████████████████████████| 97/97 [02:37<00:00,  1.62s/it]


In [9]:
X_train_bert = np.array(get_embeddings(train_data['all_text']))


100%|█████████████████████████████████████████| 386/386 [10:28<00:00,  1.63s/it]


In [11]:
X_train_emb = pd.DataFrame(X_train_bert)

In [12]:
X_test_emb = pd.DataFrame(X_test_bert)

In [15]:
np.array(test_data['label'])

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
X_test_emb['label'] = np.array(test_data['label'])
X_train_emb['label'] = np.array(train_data['label'])

In [32]:
X_train = X_train_emb.drop(columns='label')
y_train = X_train_emb['label']


In [17]:
# X_train_emb.to_csv('data/train_bert_emb.csv')
# X_test_emb.to_csv('data/test_bert_emb.csv')

In [57]:
model = CatBoostClassifier(
    iterations= 1000,
    learning_rate=0.01,
    depth=6,
    loss_function='Logloss', 
    verbose=100,
    random_seed=4
)

In [58]:
model.fit(X_train, y_train)


0:	learn: 0.6868790	total: 32.8ms	remaining: 32.8s
100:	learn: 0.4100198	total: 3s	remaining: 26.7s
200:	learn: 0.3485089	total: 5.95s	remaining: 23.6s
300:	learn: 0.3206682	total: 8.89s	remaining: 20.7s
400:	learn: 0.3026592	total: 11.8s	remaining: 17.7s
500:	learn: 0.2892203	total: 14.8s	remaining: 14.7s
600:	learn: 0.2783044	total: 17.7s	remaining: 11.8s
700:	learn: 0.2688910	total: 20.7s	remaining: 8.81s
800:	learn: 0.2605508	total: 23.6s	remaining: 5.87s
900:	learn: 0.2531030	total: 26.6s	remaining: 2.92s
999:	learn: 0.2462852	total: 29.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x74872249a840>

In [71]:
model.save_model('models/catboost_model.bin')


In [83]:
X_test_emb['pred_bert'] =  model.predict_proba(X_test_emb.iloc[:, :-1])[:,1]

In [84]:
X_test_emb[['label', 'pred_bert']].to_csv('test_with_bert.csv')