In [38]:
import pandas as pd
import re
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

df = pd.read_csv('train.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Exploring the dataset

In [39]:
df.head()['Context']

0    I'm going through some things with my feelings...
1    I'm going through some things with my feelings...
2    I'm going through some things with my feelings...
3    I'm going through some things with my feelings...
4    I'm going through some things with my feelings...
Name: Context, dtype: object

In [40]:
df.head()['Response']

0    If everyone thinks you're worthless, then mayb...
1    Hello, and thank you for your question and see...
2    First thing I'd suggest is getting the sleep y...
3    Therapy is essential for those that are feelin...
4    I first want to let you know that you are not ...
Name: Response, dtype: object

In [41]:
data = df['Response'].astype(str)

In [42]:
data[:5]

0    If everyone thinks you're worthless, then mayb...
1    Hello, and thank you for your question and see...
2    First thing I'd suggest is getting the sleep y...
3    Therapy is essential for those that are feelin...
4    I first want to let you know that you are not ...
Name: Response, dtype: object

Splitting the sentences into space separated words and punctuations. Putting a space between punctuation and word basically.

In [43]:
def create_data(text):
    text = text.lower()
    text = re.findall(r'\w+|[^\s\w]+', text)
    text = ' '.join(word for word in text)
    return text

In [44]:
data = [create_data(x) for x in data]

Finding out the symbols present in the dataset

In [45]:
puncs = dict()

for sentence in data:
    result = re.findall(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]*', sentence)
    for r in result:
        for ch in r:
            if ch in puncs:
                puncs[ch] += 1
            else:
                puncs[ch] = 1

puncs

{"'": 9182,
 ',': 24750,
 '.': 33642,
 '-': 3145,
 '!': 1394,
 '"': 3663,
 '#': 10,
 ';': 572,
 ':': 1075,
 ')': 1682,
 '(': 1459,
 '?': 3072,
 '/': 2966,
 '%': 54,
 '&': 68,
 '[': 15,
 ']': 15,
 '$': 13,
 '*': 19,
 '~': 38,
 '_': 73,
 '=': 23,
 '+': 1445,
 '{': 4,
 '}': 4,
 '@': 1}

Removing symbols that are not punctuations or are statistically insignificant due to their low frequency.

In [46]:
REMOVE_SYM = re.compile('[*#+~_%&={}@]$\[\]]')

def clean_text(text):
    text = REMOVE_SYM.sub('', text)
    return text

In [47]:
data = [clean_text(x) for x in data]

The space-separated and cleaned data.

In [48]:
data[0]

"if everyone thinks you ' re worthless , then maybe you need to find new people to hang out with . seriously , the social context in which a person lives is a big influence in self - esteem . otherwise , you can go round and round trying to understand why you ' re not worthless , then go back to the same crowd and be knocked down again . there are many inspirational messages you can find in social media . maybe read some of the ones which state that no person is worthless , and that everyone has a good purpose to their life . also , since our culture is so saturated with the belief that if someone doesn ' t feel good about themselves that this is somehow terrible . bad feelings are part of living . they are the motivation to remove ourselves from situations and relationships which do us more harm than good . bad feelings do feel terrible . your feeling of worthlessness may be good in the sense of motivating you to find out that you are much better than your feelings today ."

Splitting the lines into tokens.

In [49]:
data = [x.split() for x in data]

In [50]:
data[0][:5]

['if', 'everyone', 'thinks', 'you', "'"]

Flattening the list

In [51]:
data = [x for xs in data for x in xs]

In [52]:
data[0]

'if'

Label data according to the following labeling style:
For any word if the following word is a punctuation label it with the tag of the punctuation, otherwise provide the default tag 'O'.

In [53]:
tagged = []

pdict = {'.': 'PERIOD', ',': 'COMMA', '!': 'EXCL', '-': 'HYPHEN', '"': 'DQUOTE', "'": 'SQUOTE', ';': 'SCOLON', ':': 'COLON', '(' : 'PSTART', ')' : 'PEND', '?': 'QUESTION', '/': 'SLASH'}

for i in range(0, len(data) - 1):
    if (data[i] not in pdict):
        if (data[i + 1] in pdict):
            tagged.append([data[i], pdict[data[i + 1]]])
        else:
            tagged.append([data[i], 'O'])

Sample

In [55]:
tagged[3]

['you', 'SQUOTE']

Size of dataset

In [56]:
len (tagged)

650463

Initializing token style for BERT model

In [None]:
from transformers import *

TOKEN_IDX = {
    'bert': {
        'START_SEQ': 101,
        'PAD': 0,
        'END_SEQ': 102,
        'UNK': 100
    }
}

punctuation_dict = {'O': 0, 'PERIOD': 1, 'COMMA': 2, 'EXCL': 3, 'HYPHEN' : 4, 'DQUOTE' : 5, 'SQUOTE': 6, 'SCOLON' : 7, 'COLON' : 8, 'PSTART' : 9, 'PEND' : 10, 'QUESTION' : 11, 'SLASH': 12}

MODELS = { 'bert-base-uncased': (BertModel, BertTokenizer, 768, 'bert') }

Dataset augmentations supported

In [None]:
alpha_sub = 0.40
alpha_del = 0.40
tokenizer = None
sub_style = 'unk'


def augment_none(x, y, y_mask, x_aug, y_aug, y_mask_aug, i, token_style):
    x_aug.append(x[i])
    y_aug.append(y[i])
    y_mask_aug.append(y_mask[i])


def augment_substitute(x, y, y_mask, x_aug, y_aug, y_mask_aug, i, token_style):
    if sub_style == 'rand':
        x_aug.append(np.random.randint(tokenizer.vocab_size))
    else:
        x_aug.append(TOKEN_IDX[token_style]['UNK'])
    y_aug.append(y[i])
    y_mask_aug.append(y_mask[i])


def augment_insert(x, y, y_mask, x_aug, y_aug, y_mask_aug, i, token_style):
    x_aug.append(TOKEN_IDX[token_style]['UNK'])
    y_aug.append(0)
    y_mask_aug.append(1)
    x_aug.append(x[i])
    y_aug.append(y[i])
    y_mask_aug.append(y_mask[i])


def augment_delete(x, y, y_mask, x_aug, y_aug, y_mask_aug, i, token_style):
    return


def augment_all(x, y, y_mask, x_aug, y_aug, y_mask_aug, i, token_style):
    r = np.random.rand()
    if r < alpha_sub:
        augment_substitute(x, y, y_mask, x_aug, y_aug, y_mask_aug, i, token_style)
    elif r < alpha_sub + alpha_del:
        augment_delete(x, y, y_mask, x_aug, y_aug, y_mask_aug, i, token_style)
    else:
        augment_insert(x, y, y_mask, x_aug, y_aug, y_mask_aug, i, token_style)


# supported augmentation techniques
AUGMENTATIONS = {
    'none': augment_none,
    'substitute': augment_substitute,
    'insert': augment_insert,
    'delete': augment_delete,
    'all': augment_all
}

In [57]:
import torch
import numpy as np

Create Dataloader

In [None]:
def parse_data(dset, tokenizer, sequence_len, token_style):
    data_items = []
    while idx < len (dset):
        x = [TOKEN_IDX[token_style]['START_SEQ']]
        y = [0]
        y_mask = [1]

        while len (x) < sequence_len - 1 and idx < len (dset):
            word, punc = dset[idx]
            tokens = tokenizer.tokenize(word)
            if len(tokens) + len(x) >= sequence_len:
                break
            else:
                for i in range(len(tokens) - 1):
                    x.append(tokenizer.convert_tokens_to_ids(tokens[i]))
                    y.append(0)
                    y_mask.append(0)
                if len(tokens) > 0:
                    x.append(tokenizer.convert_tokens_to_ids(tokens[-1]))
                else:
                    x.append(TOKEN_IDX[token_style]['UNK'])
                y.append(punctuation_dict[punc])
                y_mask.append(1)
                idx += 1
            x.append(TOKEN_IDX[token_style]['END_SEQ'])
            y.append(0)
            y_mask.append(1)
            if len(x) < sequence_len:
                x = x + [TOKEN_IDX[token_style]['PAD'] for _ in range(sequence_len - len(x))]
                y = y + [0 for _ in range(sequence_len - len(y))]
                y_mask = y_mask + [0 for _ in range(sequence_len - len(y_mask))]
            attn_mask = [1 if token != TOKEN_IDX[token_style]['PAD'] else 0 for token in x]
            data_items.append([x, y, attn_mask, y_mask])
    return data_items

class Dataset(torch.utils.data.Dataset):
    def __init__(self, files, tokenizer, sequence_len, token_style, is_train=False, augment_rate=0.1,
                 augment_type='substitute'):
        self.data = parse_data(files, tokenizer, sequence_len, token_style)
        self.sequence_len = sequence_len
        self.augment_rate = augment_rate
        self.token_style = token_style
        self.is_train = is_train
        self.augment_type = augment_type

    def __len__(self):
        return len(self.data)

    def _augment(self, x, y, y_mask):
        x_aug = []
        y_aug = []
        y_mask_aug = []
        for i in range(len(x)):
            r = np.random.rand()
            if r < self.augment_rate:
                AUGMENTATIONS[self.augment_type](x, y, y_mask, x_aug, y_aug, y_mask_aug, i, self.token_style)
            else:
                x_aug.append(x[i])
                y_aug.append(y[i])
                y_mask_aug.append(y_mask[i])

        if len(x_aug) > self.sequence_len:
            # len increased due to insert
            x_aug = x_aug[0:self.sequence_len]
            y_aug = y_aug[0:self.sequence_len]
            y_mask_aug = y_mask_aug[0:self.sequence_len]
        elif len(x_aug) < self.sequence_len:
            # len decreased due to delete
            x_aug = x_aug + [TOKEN_IDX[self.token_style]['PAD'] for _ in range(self.sequence_len - len(x_aug))]
            y_aug = y_aug + [0 for _ in range(self.sequence_len - len(y_aug))]
            y_mask_aug = y_mask_aug + [0 for _ in range(self.sequence_len - len(y_mask_aug))]

        attn_mask = [1 if token != TOKEN_IDX[self.token_style]['PAD'] else 0 for token in x]
        return x_aug, y_aug, attn_mask, y_mask_aug

    def __getitem__(self, index):
        x = self.data[index][0]
        y = self.data[index][1]
        attn_mask = self.data[index][2]
        y_mask = self.data[index][3]

        if self.is_train and self.augment_rate > 0:
            x, y, attn_mask, y_mask = self._augment(x, y, y_mask)

        x = torch.tensor(x)
        y = torch.tensor(y)
        attn_mask = torch.tensor(attn_mask)
        y_mask = torch.tensor(y_mask)

        return x, y, attn_mask, y_mask


Create model using BERT layers

In [None]:
import torch.nn as nn

class DeepPunctuation(nn.Module):
    def __init__(self, pretrained_model, freeze_bert=False, lstm_dim=-1):
        super(DeepPunctuation, self).__init__()
        self.output_dim = len(punctuation_dict)
        self.bert_layer = MODELS[pretrained_model][0].from_pretrained(pretrained_model)
        # Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False
        bert_dim = MODELS[pretrained_model][2]
        if lstm_dim == -1:
            hidden_size = bert_dim
        else:
            hidden_size = lstm_dim
        self.lstm = nn.LSTM(input_size=bert_dim, hidden_size=hidden_size, num_layers=1, bidirectional=True)
        self.linear = nn.Linear(in_features=hidden_size*2, out_features=len(punctuation_dict))

    def forward(self, x, attn_masks):
        if len(x.shape) == 1:
            x = x.view(1, x.shape[0])  # add dummy batch for single sample
        # (B, N, E) -> (B, N, E)
        x = self.bert_layer(x, attention_mask=attn_masks)[0]
        # (B, N, E) -> (N, B, E)
        x = torch.transpose(x, 0, 1)
        x, (_, _) = self.lstm(x)
        # (N, B, E) -> (B, N, E)
        x = torch.transpose(x, 0, 1)
        x = self.linear(x)
        return x

Create splits of dataset

In [58]:
train_data = tagged[:550000]
test_data = tagged[550000:]
valid_data = train_data[:100000]
train_data = train_data[100000:]

Rest of the assignment could not be completed in the given time