In [1]:
import os
import gc
import cv2
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For Transformer Models
from transformers import AutoTokenizer, AutoModel

#Text Cleaning
from bs4 import BeautifulSoup
import re 

# Utils
from tqdm import tqdm

# For descriptive error messages: with this set, CUDA kernel launches block
# until they finish, so a device-side failure surfaces at the offending call.
# NOTE(review): this is a debugging aid and likely slows GPU inference —
# confirm whether it should stay enabled for a full prediction run.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
# Central settings for this inference run; later cells read them by key.
CONFIG = {
    "seed": 2021,
    "model_name": "DeepPavlov/rubert-base-cased",
    "test_batch_size": 32,
    "max_length": 256,
    "num_classes": 1,
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}

# The tokenizer is stored alongside the other settings so the dataset cell
# can fetch it from CONFIG.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["model_name"])

In [3]:
# Read the test set keyed by "id" and discard the 'Unnamed: 0' column
# (NOTE(review): presumably a leftover saved index — confirm).
df = pd.read_csv("input/test_data.csv", index_col="id").drop(columns='Unnamed: 0')
df.head()

Unnamed: 0_level_0,title,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Графики обслуживания внутриквартирного и внутр...,В соответствии с п.42 Постановления Правительс...
1,На МЦД улучшили поезда,"В Москве, на Белорусско-Савеловском центрально..."
2,Катастрофа SSJ 100: вина за гибель 41 человека...,Завершено расследование дела о крушении самоле...
3,С Днем юриста!,От души поздравляем сотрудников и преподавател...
4,"Охота на ""черного"" лесоруба","В Костроме прошел межрегиональный форум, посвя..."


In [4]:
# Model input is "<title>. <text>". str() is applied to every value first, so
# a non-string cell (e.g. a missing value) is concatenated as its string form.
df["text_new"] = df["title"].map(str) + ". " + df["text"].map(str)

In [5]:
# Preview the first rows to check the newly built text_new column.
df.head()

Unnamed: 0_level_0,title,text,text_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Графики обслуживания внутриквартирного и внутр...,В соответствии с п.42 Постановления Правительс...,Графики обслуживания внутриквартирного и внутр...
1,На МЦД улучшили поезда,"В Москве, на Белорусско-Савеловском центрально...","На МЦД улучшили поезда. В Москве, на Белорусск..."
2,Катастрофа SSJ 100: вина за гибель 41 человека...,Завершено расследование дела о крушении самоле...,Катастрофа SSJ 100: вина за гибель 41 человека...
3,С Днем юриста!,От души поздравляем сотрудников и преподавател...,С Днем юриста!. От души поздравляем сотруднико...
4,"Охота на ""черного"" лесоруба","В Костроме прошел межрегиональный форум, посвя...","Охота на ""черного"" лесоруба. В Костроме прошел..."


In [6]:
# One checkpoint file per fold (0 through 4), listed in fold order.
MODEL_PATHS = [f'Loss-Fold-{fold}.bin' for fold in range(5)]

In [7]:
def set_seed(seed = 42):
    '''Seed every random number generator this notebook touches.

    Seeds NumPy, Python's `random` and PyTorch (CPU and CUDA), puts cuDNN into
    deterministic mode and exports PYTHONHASHSEED.

    seed - Integer seed shared by all generators.
    '''
    # Environment values must be strings, hence the str() conversion.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each library's generator.
    for seeder in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # cuDNN needs both switches: disable auto-tuning and force deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Seed all generators with the value configured in CONFIG.
set_seed(CONFIG['seed'])

In [8]:
def text_cleaning(text):
    '''
    Normalises a raw text string before tokenisation. Steps, in order:-
    1. Strips URLs (http://, https:// and www. links)
    2. Strips HTML markup, keeping only the visible text
    3. Strips emojis and the other symbols covered by the pattern below
    4. Collapses runs of spaces and trims both ends

    text - String to be cleaned.

    Returns the cleaned string.
    '''
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    without_links = url_pattern.sub(r'', text)

    # Let the HTML parser pull out the plain text content.
    plain = BeautifulSoup(without_links, 'lxml').get_text()

    symbol_pattern = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)
    without_symbols = symbol_pattern.sub(r'', plain)

    # Squash repeated spaces, then drop leading/trailing whitespace.
    return re.sub(' +', ' ', without_symbols).strip()

In [9]:
# Replace each combined title+body string with its cleaned version.
df["text_new"] = df["text_new"].map(text_cleaning)

In [10]:
class SberNewsDataset(Dataset):
    '''Tokenises the `text_new` column of a DataFrame one row at a time.

    df         - DataFrame with a `text_new` column holding the input strings.
    tokenizer  - Object exposing `encode_plus` (a Hugging Face tokenizer here).
    max_length - Length every encoded sequence is truncated / padded to.
    '''
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.text = df.text_new

    def __len__(self):
        '''Number of rows in the wrapped DataFrame.'''
        return len(self.df)

    def __getitem__(self, index):
        '''Encode the row at *position* `index`.

        Returns a dict of three torch.long tensors: 'ids', 'mask' and
        'token_type_ids', taken from the tokenizer output.
        '''
        # Samplers hand out positions 0..len-1, so look the row up by position.
        # Plain `self.text[index]` would treat `index` as a label of the frame's
        # index (the "id" column here) and fail, or return a different row,
        # whenever those labels are not exactly 0..N-1 in order.
        text = self.text.iloc[index]
        inputs = self.tokenizer.encode_plus(
                                text,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length',
                                return_token_type_ids=True
                            )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [11]:
# Wrap the cleaned frame in the tokenising dataset.
test_dataset = SberNewsDataset(
    df=df,
    tokenizer=CONFIG['tokenizer'],
    max_length=CONFIG['max_length'],
)

# shuffle=False keeps batches in row order, so predictions line up with df.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=CONFIG['test_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [12]:
class SberNewsModel(nn.Module):
    '''Pretrained transformer encoder followed by dropout, a linear layer and a sigmoid.

    model_name - Identifier handed to `AutoModel.from_pretrained`.

    The submodule names (`model`, `drop`, `fc`, `sigmoid`) are part of the
    checkpoint format loaded via `load_state_dict` and must not be renamed.
    '''
    def __init__(self, model_name):
        super(SberNewsModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # Size the head from the encoder's own config rather than assuming 768;
        # the old constant remains as the fallback when no hidden_size is exposed.
        hidden_size = getattr(self.model.config, 'hidden_size', 768)
        self.fc = nn.Linear(hidden_size, CONFIG['num_classes'])
        self.sigmoid = nn.Sigmoid()

    def forward(self, ids, mask, token_type_ids):
        '''Return sigmoid scores for a batch.

        ids, mask, token_type_ids - Tensors forwarded to the encoder as
        input_ids, attention_mask and token_type_ids.

        Returns a tensor whose trailing dimension is removed when it has size 1
        (i.e. one score per sample when num_classes is 1).
        '''
        # return_dict=False makes the encoder return a tuple; its second
        # element is used as the per-sample feature vector.
        _, features = self.model(input_ids=ids, attention_mask=mask, token_type_ids = token_type_ids, return_dict=False)
        out = self.drop(features)
        outputs = self.fc(out)
        # Squeeze only the trailing class dimension. A bare squeeze() would also
        # drop the batch dimension for a single-sample batch, yielding a 0-d tensor.
        outputs = self.sigmoid(outputs.squeeze(-1))

        return outputs


In [13]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    '''Run `model` over `dataloader` without gradients and collect its outputs.

    model      - Module called as model(ids, mask, token_type_ids).
    dataloader - Sized iterable of batches; each batch is a dict with the
                 tensors 'ids', 'mask' and 'token_type_ids'.
    device     - Device the batch tensors are moved to before the forward pass.

    Returns a 1-D numpy array with the flattened outputs of every batch in
    iteration order; an empty array if the dataloader yields no batches.
    '''
    model.eval()  # inference mode for layers such as dropout

    batch_preds = []
    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        # reshape(-1) flattens 0-d and non-contiguous outputs alike.
        batch_preds.append(outputs.reshape(-1).detach().cpu().numpy())

    gc.collect()

    # np.concatenate rejects an empty list, so handle "no batches" explicitly.
    if not batch_preds:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(batch_preds)

In [14]:
def inference(model_paths, dataloader, device):
    '''Average the predictions of several checkpoints over one dataloader.

    model_paths - Sequence of checkpoint files, each a state dict for SberNewsModel.
    dataloader  - Batches passed to valid_fn for every checkpoint.
    device      - Device on which each model is run.

    Returns a numpy array with the element-wise mean of the per-checkpoint
    predictions.

    Raises ValueError if `model_paths` is empty.
    '''
    if len(model_paths) == 0:
        raise ValueError("model_paths must contain at least one checkpoint path")

    final_preds = []
    for i, path in enumerate(model_paths):
        model = SberNewsModel(CONFIG['model_name'])
        # Load onto the CPU first so checkpoints saved on a GPU also open on a
        # CPU-only host. NOTE: torch.load unpickles the file — only load
        # checkpoints from a trusted source.
        state_dict = torch.load(path, map_location='cpu')
        model.load_state_dict(state_dict)
        # Use the caller's device so model and batches always agree.
        model.to(device)

        print(f"Getting predictions for model {i+1}")
        preds = valid_fn(model, dataloader, device)
        final_preds.append(preds)

        # Release this fold's weights before the next checkpoint is built.
        del model, state_dict
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    final_preds = np.array(final_preds)
    final_preds = np.mean(final_preds, axis=0)
    return final_preds


In [15]:
# Run every checkpoint in MODEL_PATHS over the test loader and keep the
# averaged predictions returned by inference().
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'])

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 1


100%|█████████████████████████████████████████| 331/331 [01:28<00:00,  3.74it/s]
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 2


100%|█████████████████████████████████████████| 331/331 [01:29<00:00,  3.71it/s]
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 3


100%|█████████████████████████████████████████| 331/331 [01:29<00:00,  3.70it/s]
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 4


100%|█████████████████████████████████████████| 331/331 [01:29<00:00,  3.71it/s]
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 5


100%|█████████████████████████████████████████| 331/331 [01:28<00:00,  3.74it/s]


In [16]:
# Sanity check: how many predictions were produced and how many are distinct.
print(f"Total Predictions: {preds.shape[0]}")
print(f"Total Unique Predictions: {np.unique(preds).shape[0]}")

Total Predictiions: 10571
Total Unique Predictions: 10318


In [17]:
# Attach the averaged predictions as a new column. This relies on preds being
# in the same row order as df, which holds because the loader was built with
# shuffle=False.
df['prediction'] = preds
df.head()

Unnamed: 0_level_0,title,text,text_new,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Графики обслуживания внутриквартирного и внутр...,В соответствии с п.42 Постановления Правительс...,Графики обслуживания внутриквартирного и внутр...,0.031191
1,На МЦД улучшили поезда,"В Москве, на Белорусско-Савеловском центрально...","На МЦД улучшили поезда. В Москве, на Белорусск...",0.032953
2,Катастрофа SSJ 100: вина за гибель 41 человека...,Завершено расследование дела о крушении самоле...,Катастрофа SSJ 100: вина за гибель 41 человека...,0.037305
3,С Днем юриста!,От души поздравляем сотрудников и преподавател...,С Днем юриста!. От души поздравляем сотруднико...,0.016447
4,"Охота на ""черного"" лесоруба","В Костроме прошел межрегиональный форум, посвя...","Охота на ""черного"" лесоруба. В Костроме прошел...",0.025331


In [18]:
# Show only the rows whose averaged prediction is at least 0.5.
df.loc[df['prediction'].ge(0.5)]

Unnamed: 0_level_0,title,text,text_new,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
243,"Голубев выяснял, почему затянули реконструкцию...",Местные чиновники пожаловались на подрядчика Г...,"Голубев выяснял, почему затянули реконструкцию...",0.723632
274,Губернатор призвал не снижать заявленных темпо...,На еженедельном заседании рабочей группы по ре...,Губернатор призвал не снижать заявленных темпо...,0.565532
410,Начало осуществления сталинского проекта Росси...,ОАО «РЖД» утвердило инвестиционную программу в...,Начало осуществления сталинского проекта Росси...,0.842356
479,«Московский Диснейленд» не уложился в сроки //...,Московский аналог Диснейленда не успели запуст...,«Московский Диснейленд» не уложился в сроки //...,0.745989
506,Названы сроки закладки российских «Мистралей»,"Строительство российских вертолетоносцев, явля...",Названы сроки закладки российских «Мистралей»....,0.695104
...,...,...,...,...
10364,Медкабинеты в приморских детсадах оснастят за ...,В Приморье медицинские кабинеты в детских сада...,Медкабинеты в приморских детсадах оснастят за ...,0.725344
10385,Реконструкция набережной Георгия Седова в Арха...,Из-за изменения уровня грунта потребовался пер...,Реконструкция набережной Георгия Седова в Арха...,0.530310
10398,"11 школ, 8 детских садов и пристроек ввели в э...",Ещё ряд объектов на завершающей стадии. Их дол...,"11 школ, 8 детских садов и пристроек ввели в э...",0.728613
10411,«АртСтрой» могут оштрафовать за опоздание с ре...,фото показано с : vechor.ru 2019-12-4 17:...,«АртСтрой» могут оштрафовать за опоздание с ре...,0.961172


In [19]:
# Remove the helper column before saving. errors='ignore' keeps this cell
# re-runnable: on a second run text_new is already gone and a plain drop
# would raise KeyError.
df.drop(columns='text_new', errors='ignore', inplace=True)
df.to_csv("submission.csv")