# Pip

In [1]:
! pip install transformers --quiet

[K     |████████████████████████████████| 2.9 MB 4.1 MB/s 
[K     |████████████████████████████████| 596 kB 48.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 39.2 MB/s 
[K     |████████████████████████████████| 895 kB 30.2 MB/s 
[K     |████████████████████████████████| 56 kB 4.6 MB/s 
[?25h

# Import

In [2]:
import re
from warnings import filterwarnings

import numpy as np
import pandas as pd

from pymystem3 import Mystem

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm_notebook as tqdm

from torchtext.legacy import data

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup


filterwarnings('ignore')

# Data Preprocessing

In [3]:
# Load the three tab-separated splits from the working directory.
df_train = pd.read_csv('train.tsv', sep='\t')
df_test = pd.read_csv('test.tsv', sep='\t')
df_valid = pd.read_csv('valid.tsv', sep='\t')

# Explicit seed for the down-sampling and the shuffle below. The global NumPy
# seed is only set in a later cell, so without a random_state the balanced
# training set would differ on every run.
SAMPLING_SEED = 0

df_train_positive_class = df_train[df_train['label'] == 1]
df_train_negative_class = df_train[df_train['label'] == 0]

num_positive_examples = len(df_train_positive_class)

# For training set, we take the same amount of positive and negative examples
# (capped by the number of available negatives so that sample() cannot fail).
num_negative_examples = min(num_positive_examples, len(df_train_negative_class))
df_train_negative_class = df_train_negative_class.sample(
    num_negative_examples, random_state=SAMPLING_SEED
)
# Concatenating positive and negative examples and shuffling the training set
df_train = pd.concat((df_train_positive_class, df_train_negative_class)).sample(
    frac=1, random_state=SAMPLING_SEED
)

# Rebuild a clean 0..n-1 index for every split (the training rows were shuffled).
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

In [5]:
def my_clean(text):
    """Drop @-prefixed tokens and URLs from a tweet.

    The text is split on whitespace, every token whose first character is
    '@' is discarded, and the remaining tokens are re-joined with single
    spaces. After that, each run starting with 'http' and extending to the
    next whitespace is deleted (the surrounding spaces are kept).
    """
    kept_tokens = [token for token in text.split() if not token.startswith('@')]
    without_mentions = ' '.join(kept_tokens)
    return re.sub(r'http\S+', '', without_mentions)

def list_replace(search, replacement, text):
    '''
    Replace every symbol of `search` that occurs in `text`
    with the `replacement` string and return the result.

    Which symbols get replaced is decided once, against the incoming text,
    before any substitution happens; a symbol that only appears because an
    earlier replacement introduced it is therefore left alone.
    '''
    # Materialise the selection first: `text` is rebound inside the loop.
    present_symbols = tuple(filter(text.__contains__, search))
    for symbol in present_symbols:
        text = text.replace(symbol, replacement)
    return text

# Currency signs that are kept by clean_text.
_CURRENCY_SYMBOLS = (
    '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
)

# Tab, carriage return, space and the Cyrillic / Latin letters kept by clean_text.
_LETTERS_AND_SPACES = (
    '\t\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
)

# Built once at definition time instead of on every call.
_ALLOWED_SYMBOLS = frozenset(_CURRENCY_SYMBOLS + _LETTERS_AND_SPACES)

# (accented letter, plain replacement) pairs, applied in this order.
_DIACRITIC_REPLACEMENTS = (
    ('\u00C4', 'A'), ('\u00E4', 'a'),
    ('\u00CB', 'E'), ('\u00EB', 'e'),
    ('\u1E26', 'H'), ('\u1E27', 'h'),
    ('\u00CF', 'I'), ('\u00EF', 'i'),
    ('\u00D6', 'O'), ('\u00F6', 'o'),
    ('\u00DC', 'U'), ('\u00FC', 'u'),
    ('\u0178', 'Y'), ('\u00FF', 'y'),
    ('\u00DF', 's'), ('\u1E9E', 'S'),
)


def clean_text(text):
    """Normalise typography in `text` and keep only whitelisted symbols.

    Steps, in order: unify quotation marks, dashes, hyphens and Unicode
    spaces; turn bullet-like symbols into '.', the asterisk operator into
    '*' and the ellipsis into '...'; strip diaeresis marks from a fixed set
    of letters; replace punctuation with spaces; mask every digit with 'x';
    finally drop every character that is not a whitelisted letter,
    whitespace character or currency sign.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed here.
    """
    text = list_replace(
        '\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019',
        '\u0022',
        text
    )

    text = list_replace(
        '\u2012\u2013\u2014\u2015\u203E\u0305\u00AF',
        '\u2003\u002D\u002D\u2003',
        text
    )

    text = list_replace(
        '\u2010\u2011',
        '\u002D',
        text
    )

    # NOTE(review): U+2002 is not whitelisted, so every character normalised
    # here is removed by the final filter without leaving a separator. That is
    # harmless for zero-width characters but glues together words separated by
    # e.g. a thin space -- confirm this is intended.
    text = list_replace(
        '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
        '\u2002',
        text
    )

    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)

    text = list_replace(
        '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
        '.',
        text
    )

    text = list_replace('\u2217', '\u002A', text)

    text = list_replace('…', '...', text)

    for accented, plain in _DIACRITIC_REPLACEMENTS:
        text = list_replace(accented, plain, text)

    # Removing punctuation. The backslash is written as '\\'; the previous
    # '\|' and '\?' spellings were invalid escape sequences that produced the
    # same characters but trigger a SyntaxWarning on recent Python versions.
    # NOTE(review): '$' is removed here although it is in the currency
    # whitelist, so it never survives -- confirm which one is intended.
    text = list_replace(',.[]{}()=+-−*&^%$#@!~;:§/\\|?\'\n', ' ', text)
    # Replacing all numbers with masks
    text = list_replace('0123456789', 'x', text)

    return ''.join(sym for sym in text if sym in _ALLOWED_SYMBOLS)

In [6]:
def _preprocess_tweets(tweet_texts):
    """Clean, lowercase and whitespace-normalise a sequence of raw tweets.

    Every tweet goes through my_clean and clean_text, is lowercased, and has
    its whitespace collapsed to single spaces. The same pipeline is used for
    all splits: previously only the training texts were lowercased, so the
    validation and test texts were prepared differently from the training
    data.
    """
    return [
        ' '.join(clean_text(my_clean(tweet_text)).lower().split())
        for tweet_text in tweet_texts
    ]


# Extracting tweet texts
train_tweet_texts = df_train.tweet.values
test_tweet_texts = df_test.tweet.values
valid_tweet_texts = df_valid.tweet.values

# Extracting tweet labels
train_labels = df_train['label'].values
valid_labels = df_valid['label'].values

# Preprocessing training, test and validation tweets identically
cleaned_train_texts = _preprocess_tweets(train_tweet_texts)
cleaned_test_texts = _preprocess_tweets(test_tweet_texts)
cleaned_valid_texts = _preprocess_tweets(valid_tweet_texts)

In [7]:
# Attach the cleaned tweets to their frames as a 'clean_text' column.
for frame, cleaned_texts in (
    (df_train, cleaned_train_texts),
    (df_test, cleaned_test_texts),
    (df_valid, cleaned_valid_texts),
):
    frame['clean_text'] = cleaned_texts

## Lemmatization

In [8]:
def lemmatize_all(texts):
    """Lemmatize each text and return the results as a list of strings.

    A single Mystem analyzer (created with entire_input=False) is shared by
    all texts; the lemmas it returns for one text are joined with single
    spaces.
    """
    analyzer = Mystem(entire_input=False)
    return [' '.join(analyzer.lemmatize(text)) for text in texts]

# Overwrite 'clean_text' in every split with the lemmatized cleaned tweets.
for frame, cleaned_texts in (
    (df_train, cleaned_train_texts),
    (df_test, cleaned_test_texts),
    (df_valid, cleaned_valid_texts),
):
    frame['clean_text'] = lemmatize_all(cleaned_texts)

# Dataset

In [9]:
class TwitterDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        texts: indexable collection of tweet texts.
        labels: indexable collection of integer labels, aligned with `texts`.
        tokenizer: object exposing a Hugging Face style `encode_plus` method.
        maxlen: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, maxlen):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the text, its label and its encoding for one index.

        The result is a dict with the keys 'text', 'label', 'targets'
        (the same tensor as 'label'), 'input_ids' and 'attention_mask';
        the last two are flattened to 1-D tensors.
        """
        text = str(self.texts[index])
        tensor_label = torch.tensor(self.labels[index], dtype=torch.long)

        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        encode = self.tokenizer.encode_plus(text, add_special_tokens=True,
                                            max_length=self.maxlen,
                                            return_token_type_ids=False,
                                            padding='max_length',
                                            return_attention_mask=True,
                                            return_tensors='pt',
                                            truncation=True)

        return {'text': text,
                'label': tensor_label,
                'input_ids' : encode['input_ids'].flatten(),
                'targets' : tensor_label,
                'attention_mask': encode['attention_mask'].flatten()}

In [10]:
# One seed shared by NumPy and PyTorch.
RANDOM_SEED = 0
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [12]:
def create_data_loader(df, tokenizer, batch_size, max_len):
    """Wrap the 'clean_text' column of `df` in a DataLoader.

    Args:
        df: DataFrame with a 'clean_text' column and, optionally, a 'label'
            column. Without 'label', every example gets the dummy label 0.
        tokenizer: tokenizer handed to TwitterDataset.
        batch_size: number of examples per batch.
        max_len: sequence length every example is padded / truncated to.

    Returns:
        A DataLoader over a TwitterDataset, iterating in frame order.
    """
    if 'label' not in df.columns:
        labels = [0] * len(df)
    else:
        labels = df['label'].values

    # TwitterDataset names this parameter `maxlen`; passing `max_len=` raised
    # a TypeError.
    ds = TwitterDataset(texts=df['clean_text'].values,
                        labels=labels,
                        tokenizer=tokenizer,
                        maxlen=max_len)

    return DataLoader(ds, batch_size=batch_size)

In [13]:
# Checkpoint identifier passed to AutoTokenizer.from_pretrained and
# AutoModel.from_pretrained, so tokenizer and encoder come from the same model.
PRE_TRAINED_MODEL_NAME = 'cimm-kzn/enrudr-bert'

In [14]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Encode every cleaned tweet (special tokens included) to measure its length.
train_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in df_train['clean_text'].values]
valid_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in df_valid['clean_text'].values]
test_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in df_test['clean_text'].values]

# Longest encoded sequence per split; used as the padding length of its loader.
train_max_len = max(map(len, train_tokenized))
valid_max_len = max(map(len, valid_tokenized))
# Measured on the test split itself (it was taken from valid_tokenized before).
test_max_len = max(map(len, test_tokenized))

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/521 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [15]:
# Number of passes over the training loader; also scales the scheduler's total step count.
EPOCHS = 2
# Learning rate handed to the AdamW optimizer.
L_RATE = 2e-5
# Batch size used for the train, validation and test loaders.
BATCH_SIZE = 32

In [16]:
def create_data_loader(df, tokenizer, batch_size, max_len):
    """Build a DataLoader over the 'clean_text' column of `df`.

    When `df` has no 'label' column, every example receives the placeholder
    label 0. Examples are wrapped in a TwitterDataset padded / truncated to
    `max_len` and served in frame order in batches of `batch_size`.
    """
    has_labels = 'label' in df.columns
    labels = df['label'].values if has_labels else [0] * len(df)

    dataset_kwargs = {
        'texts': df['clean_text'].values,
        'labels': labels,
        'tokenizer': tokenizer,
        'maxlen': max_len,
    }
    dataset = TwitterDataset(**dataset_kwargs)

    return DataLoader(dataset, batch_size=batch_size)



# One loader per split, each padded to the maximum length chosen for that split.
train_data_loader = create_data_loader(
    df=df_train, tokenizer=tokenizer, batch_size=BATCH_SIZE, max_len=train_max_len
)
valid_data_loader = create_data_loader(
    df=df_valid, tokenizer=tokenizer, batch_size=BATCH_SIZE, max_len=valid_max_len
)
test_data_loader = create_data_loader(
    df=df_test, tokenizer=tokenizer, batch_size=BATCH_SIZE, max_len=test_max_len
)

# Model

In [17]:
class TwitterClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification layer.

    The encoder is loaded from PRE_TRAINED_MODEL_NAME; the linear layer maps
    the encoder's hidden size to `n_classes` logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's "pooler_output"."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded["pooler_output"]
        return self.out(self.drop(pooled))

In [18]:
# Two output classes, matching the 0 / 1 labels of the training data.
n_classes = 2

# Use CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the classifier and move it to the selected device.
model = TwitterClassifier(n_classes).to(device)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at cimm-kzn/enrudr-bert were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Cross-entropy over the class logits, placed on the training device.
loss_fn = nn.CrossEntropyLoss().to(device)

optimizer = AdamW(model.parameters(), lr=L_RATE, correct_bias=False)

# The scheduler is stepped once per batch, so its horizon is batches x epochs.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [20]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train `model` for one pass over `data_loader`.

    Each batch must provide "input_ids", "attention_mask" and "targets".
    For every batch the loss is back-propagated, the gradient norm is clipped
    to 1.0, and optimizer and scheduler are stepped before the gradients are
    reset.

    Returns:
        (accuracy, mean_loss): the number of correct predictions divided by
        `n_examples` as a double tensor, and the mean of the batch losses.
    """
    model = model.train()

    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        preds = torch.max(logits, dim=1).indices

        loss = loss_fn(logits, targets)
        n_correct += (preds == targets).sum()
        batch_losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [21]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate `model` on `data_loader` without tracking gradients.

    Each batch must provide "input_ids", "attention_mask" and "targets".
    The model is switched to evaluation mode first.

    Returns:
        (accuracy, mean_loss): the number of correct predictions divided by
        `n_examples` as a double tensor, and the mean of the batch losses.
    """
    model = model.eval()

    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["targets"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            preds = torch.max(logits, dim=1).indices

            batch_losses.append(loss_fn(logits, targets).item())
            n_correct += (preds == targets).sum()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

# Train

In [22]:
# Train for EPOCHS passes, reporting train and validation loss / accuracy
# after each one.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1:2d}/{EPOCHS:2d}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(df_train),
    )
    valid_acc, valid_loss = eval_model(
        model=model,
        data_loader=valid_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(df_valid),
    )

    print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
    print(f'Valid loss {valid_loss:.4f} accuracy {valid_acc:.4f}')

Epoch  1/ 2
----------
Train loss 0.5308 accuracy 0.7380
Valid loss 0.5277 accuracy 0.7746
Epoch  2/ 2
----------
Train loss 0.2373 accuracy 0.9049
Valid loss 0.6325 accuracy 0.7924


In [38]:
# Accuracy on the test loader; the mean loss is discarded.
# NOTE(review): create_data_loader fills in all-zero labels when a frame has no
# 'label' column, so if test.tsv is unlabeled this number is measured against
# dummy labels -- confirm before reporting it.
test_acc, _ = eval_model(
    model=model,
    data_loader=test_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(df_test),
)

test_acc.item()

# Prediction

In [23]:
def get_predictions(model, data_loader, device=None):
    """Run `model` over `data_loader` and collect predictions.

    Args:
        model: module called as model(input_ids=..., attention_mask=...) and
            returning class logits of shape (batch, n_classes).
        data_loader: iterable of batches providing "input_ids" and
            "attention_mask".
        device: device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function no longer depends on a global `device` variable.

    Returns:
        (predictions, prediction_probs): CPU tensors holding the arg-max class
        of every example and the softmax probabilities of every class. Both
        are empty when `data_loader` yields no batches.
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    batch_predictions = []
    batch_probs = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim=1)

            # Keep whole batches; they are concatenated once at the end.
            batch_predictions.append(preds.cpu())
            batch_probs.append(probs.cpu())

    # torch.cat / torch.stack reject an empty list, so handle that explicitly.
    if not batch_predictions:
        return torch.empty(0, dtype=torch.long), torch.empty(0, 0)

    return torch.cat(batch_predictions), torch.cat(batch_probs)

In [24]:
# Predicted labels and class probabilities for the validation and test loaders.
(predicted_valid_labels,
 prediction_probs_valid) = get_predictions(model=model, data_loader=valid_data_loader)
(predicted_test_labels,
 prediction_probs_test) = get_predictions(model=model, data_loader=test_data_loader)

In [25]:
# Convert the CPU tensors to NumPy once: a column slice replaces the Python
# loop that built a list of 0-dim tensors for the positive-class scores.
valid_predicted_array = predicted_valid_labels.numpy()
valid_positive_scores = prediction_probs_valid[:, 1].numpy()

valid_precision = precision_score(valid_labels, valid_predicted_array)
valid_recall = recall_score(valid_labels, valid_predicted_array)
valid_f_measure = f1_score(valid_labels, valid_predicted_array)
# ROC AUC is computed from the probability of class 1, not from hard labels.
valid_roc_auc = roc_auc_score(valid_labels, valid_positive_scores)

print('Validation dataset')

print(f'Precision: {valid_precision:.4f}')
print(f'Recall: {valid_recall:.4f}')
print(f'F-measure: {valid_f_measure:.4f}')
print(f'ROC_AUC: {valid_roc_auc:.4f}')

Validation dataset
Precision: 0.2799
Recall: 0.8763
F-measure: 0.4243
ROC_AUC: 0.8969


In [90]:
# Validation metrics; ROC AUC uses the probability assigned to class 1.
valid_class1_probs = [x[1] for x in prediction_probs_valid]

valid_precision = precision_score(y_true=valid_labels, y_pred=predicted_valid_labels)
valid_recall = recall_score(y_true=valid_labels, y_pred=predicted_valid_labels)
valid_f_measure = f1_score(y_true=valid_labels, y_pred=predicted_valid_labels)
valid_roc_auc = roc_auc_score(y_true=valid_labels, y_score=valid_class1_probs)

print('Validation dataset')

print(f'Precision: {valid_precision:.4f}')
print(f'Recall: {valid_recall:.4f}')
print(f'F-measure: {valid_f_measure:.4f}')
print(f'ROC_AUC: {valid_roc_auc:.4f}')

Validation dataset
Precision: 0.2830
Recall: 0.8395
F-measure: 0.4233
ROC_AUC: 0.8951


In [26]:
# The submitted 'label' is the predicted probability of class 1 for each test tweet.
test_class1_probs = [float(x[1]) for x in prediction_probs_test]

df_submit = pd.DataFrame(
    {
        'tweet_id': df_test['tweet_id'].values,
        'label': test_class1_probs,
    },
    columns=['tweet_id', 'label'],
)

df_submit.to_csv('solution.csv', sep=',', index=False)