In [1]:
import torch
import random
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from more_itertools import locate


from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import pipeline
from torch.optim import AdamW

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3070 Laptop GPU'

In [3]:
df = pd.read_csv("train.txt", delimiter='\t', header=None, encoding='cp850',
                  names=['real', 'ruined'])
df

Unnamed: 0,real,ruined
0,"En route , they pick up a seemingly-harmless h...","En route , they pick up a seemingly-harmless h..."
1,"Jobs, however, are not created by supporting a...","Jobs, however, are not created by supporting a..."
2,"Due to circumstances , he is forced to abandon...","Due to circumstances , he is forced to abandon..."
3,I should just like to highlight two issues.,I should just like to hihglight two issues.
4,Jerry is finally released from hospital and re...,Jerry is finally released from hospital and re...
...,...,...
996613,"Mr President, as we have decided to vote, mayb...","to President, as we have decided to vote, mayb..."
996614,"He sends Stokes on a six month trip to Earth ,...","He swims Stokes on a six month trip to Earth ,..."
996615,Wellington however refuses to assist their foo...,Wellington . refuses to assist their foolhardy...
996616,There is one particular issue which I should l...,There is one particular which I should like to...


In [4]:
#num_data_used = 10

num_data_used = len(df) // 5
data_real = df[["real"]].copy().iloc[:num_data_used]
data_real.rename(columns={"real": "text"}, inplace = True)
data_real.insert(1, "label", 1)
data_real["tags"] = data_real["text"]

pairs = []

for index, row in data_real.iterrows():
    tags = []
    for (word, tag) in pos_tag(word_tokenize(row["tags"]), tagset='universal'):
        tags.append(tag)
        pairs.extend([(word, tag)])
    data_real.at[index,'tags'] = tags
data_real

Unnamed: 0,text,label,tags
0,"En route , they pick up a seemingly-harmless h...",1,"[NOUN, NOUN, ., PRON, VERB, PRT, DET, ADJ, NOU..."
1,"Jobs, however, are not created by supporting a...",1,"[NOUN, ., ADV, ., VERB, ADV, VERB, ADP, VERB, ..."
2,"Due to circumstances , he is forced to abandon...",1,"[ADJ, PRT, NOUN, ., PRON, VERB, VERB, PRT, VER..."
3,I should just like to highlight two issues.,1,"[PRON, VERB, ADV, VERB, PRT, VERB, NUM, NOUN, .]"
4,Jerry is finally released from hospital and re...,1,"[NOUN, VERB, ADV, VERB, ADP, NOUN, CONJ, NOUN,..."
...,...,...,...
199318,I guess I'd like to look him up when I get back.',1,"[PRON, VERB, PRON, VERB, VERB, PRT, VERB, PRON..."
199319,"The final mugger , Nirvana , gets away .",1,"[DET, ADJ, NOUN, ., NOUN, ., VERB, ADV, .]"
199320,"During the game , however , Laura 's new frien...",1,"[ADP, DET, NOUN, ., ADV, ., NOUN, PRT, ADJ, NO..."
199321,"Realizing the ship is close , Aubrey hastily p...",1,"[VERB, DET, NOUN, VERB, ADV, ., NOUN, ADV, VER..."


In [5]:
data_ruin = data_real.copy()

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
vocab = tokenizer.get_vocab()

In [2]:
pos_txt = []
neg_txt = []
with open("train.txt", encoding="cp863") as txt:
    lines = txt.read().splitlines()
    for line in lines:
        parts = line.split("\t")
        pos_txt.append(parts[0])
        neg_txt.append(parts[1])

In [3]:
pairs = []
pos_tags = []
for sent in pos_txt:
    tags = []
    for (word, tag) in pos_tag(word_tokenize(sent), tagset='universal'):
        tags.append(tag)
        pairs.extend([(word, tag)])
    pos_tags.append(tags)

In [4]:
dictionary = {}
for pair in pairs:
    if pair[1] not in dictionary:
         dictionary[pair[1]] = set()
    dictionary[pair[1]].add(pair[0])

In [7]:
count = 0
k = 10000
verb_dict = []
ver_dict = list(dictionary["VERB"])
random.seed(322)
random.shuffle(ver_dict)
for i in range(len(ver_dict)):
    word = "Ġ" + ver_dict[i]
    if word in vocab:
        verb_dict.append(word)
        count += 1
    if count >= k:
        break


In [8]:
count = 0
k = 10000
noun_dict = []
n_dict = list(dictionary["NOUN"])
random.seed(322)
random.shuffle(n_dict)
for i in range(len(n_dict)):
    word = "Ġ" + n_dict[i]
    if word in vocab:
        noun_dict.append(word)
        count += 1
    if count == k:
        break  

In [9]:
count = 0
k = 10000
adv_dict = []
a_dict = list(dictionary["ADV"])
random.seed(322)
random.shuffle(a_dict)
for i in range(len(a_dict)):
    word = "Ġ" + a_dict[i]
    if word in vocab:
        adv_dict.append(word)
        count += 1
    if count == k:
        break  

In [10]:
count = 0
k = 10000
adj_dict = []
ad_dict = list(dictionary["ADJ"])
random.seed(322)
random.shuffle(ad_dict)
for i in range(len(ad_dict)):
    word = "Ġ" + ad_dict[i]
    if word in vocab:
        adj_dict.append(word)
        count += 1
    if count == k:
        break 

In [11]:
count = 0
k = 10000
adp_dict = []
ap_dict = list(dictionary["ADP"])
random.seed(322)
random.shuffle(ap_dict)
for i in range(len(ap_dict)):
    word = "Ġ" + ap_dict[i]
    if word in vocab:
        adp_dict.append(word)
        count += 1
    if count == k:
        break 

In [12]:
count = 0
k = 10000
num_dict = []
ap_dict = list(dictionary["NUM"])
random.seed(322)
random.shuffle(ap_dict)
for i in range(len(ap_dict)):
    word = "Ġ" + ap_dict[i]
    if word in vocab:
        num_dict.append(word)
        count += 1
    if count == k:
        break 

In [13]:
print(len(verb_dict))
print(len(noun_dict))
print(len(adv_dict))
print(len(adj_dict))
print(len(adp_dict))

10000
10000
2359
9986
973


In [14]:
nlp = pipeline("fill-mask", model="roberta-base", device=0)
mask = nlp.tokenizer.mask_token

In [16]:
data_ruin = []
MAX_LEN = 128
sents = []
for i in range(len(pos_txt)):
    list_words = word_tokenize(pos_txt[i])[:MAX_LEN]
    tags = pos_tags[i]
    tags = tags[:len(list_words)]
    index = list(locate(tags, lambda a: a == "NOUN"))
    if index:
        random.seed(322)
        chosen_index = random.choice(index)
        list_words[chosen_index] = mask
        sent = " ".join(list_words)
    else:
        index = list(locate(tags, lambda a: a == "VERB"))
        if index:
            random.seed(322)
            chosen_index = random.choice(index)
            list_words[chosen_index] = mask
            sent = " ".join(list_words[:MAX_LEN])
        else:
            index = list(locate(tags, lambda a: a == "ADJ"))
            if index:
                random.seed(322)
                chosen_index = random.choice(index)
                list_words[chosen_index] = mask
                sent = " ".join(list_words[:MAX_LEN])
            else:
                index = list(locate(tags, lambda a: a == "ADV"))
                if index:
                    random.seed(322)
                    chosen_index = random.choice(index)
                    list_words[chosen_index] = mask
                    sent = " ".join(list_words[:MAX_LEN])
                else:
                    index = list(locate(tags, lambda a: a == "PRON"))
                    if not index:
                        print(pos_txt[i], tags)
                        chosen_index = 0
                        list_words[-1] = mask
                        sent = " ".join(list_words[:MAX_LEN])
                    else:
                        random.seed(322)
                        chosen_index = random.choice(index)
                        list_words[chosen_index] = mask
                        sent = " ".join(list_words[:MAX_LEN])
    sents.append(sent)

One, two, three. ['NUM', '.', 'NUM', '.', 'NUM', '.']
15,000,000 . ['NUM', '.']
: 237 248 . ['.', 'NUM', 'NUM', '.']
NO !!!!!!!!! '' ['DET', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']
'Ouse-parlourmaid. ['NUM', '.']
Paragraph 45: ['.', 'NUM', '.']
6 trillion. ['NUM', 'NUM', '.']
Yes, or no?" ['X', '.', 'CONJ', 'DET', '.', '.']
263 , 268 . ['NUM', '.', 'NUM', '.']
2000 \/ - . ['NUM', 'X', '.', '.']
No, no, no! ['DET', '.', 'DET', '.', 'DET', '.']
17, 30, 31, 33, 36, 37, 38, 40, 62, 64, 86, 90, 100, 101, 105 ['NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM']
But which one ? ['CONJ', 'DET', 'NUM', '.']
- - - 1953 ['.', '.', '.', 'NUM']
One billion. ['NUM', 'NUM', '.']
16, 18, 27, 28, 61, 87, 104 ['NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM', '.', 'NUM']
2, 4, 6, 7, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 27, 28, 30, 33, 35

In [17]:
len(sents)

1000000

In [26]:
sents[1]

'Jobs , however , are not created by supporting a few multinational giants , which will often strike sail at the first signs of an economic headwind , leaving their massive redundant crews to take to the lifeboats ; on the contrary , jobs are created by promoting small and medium-sized businesses , which , after all , are the largest overall employers of labour , not only in Austria .'

In [30]:
pos_txt[1]

'Jobs, however, are not created by supporting a few multinational giants, which will often strike sail at the first signs of an economic headwind, leaving their massive redundant crews to take to the lifeboats; on the contrary, jobs are created by promoting small and medium-sized businesses, which, after all, are the largest overall employers of manpower, not only in Austria.'

In [31]:
out = pos_txt[0]+ "\t" + sents[0]
out

'En route , they pick up a seemingly-harmless hitchhiker , and continue their journey , only for their car to break down in a deserted motel on a lonely highway .\tEn route , they pick up a seemingly-harmless hitchhiker , and continue their journey , only for their car to break down in a deserted motel on a lonely one .'

In [37]:
with open("part2.txt", "w", encoding="cp850", errors='replace') as f:
    for i in range(len(sents)):
        out = pos_txt[i]+ "\t" + sents[i]
        print(out, file=f)

In [24]:
results = nlp(sents, targets=num_dict, top_k=1)
results

[[{'score': 6.4101195675903e-06,
   'token': 65,
   'token_str': ' one',
   'sequence': 'En route, they pick up a seemingly-harmless hitchhiker, and continue their journey, only for their car to break down in a deserted motel on a lonely one.'}],
 [{'score': 0.03172511234879494,
   'token': 6610,
   'token_str': ' labour',
   'sequence': 'Jobs, however, are not created by supporting a few multinational giants, which will often strike sail at the first signs of an economic headwind, leaving their massive redundant crews to take to the lifeboats ; on the contrary, jobs are created by promoting small and medium-sized businesses, which, after all, are the largest overall employers of labour, not only in Austria.'}],
 [{'score': 0.001212306204251945,
   'token': 193,
   'token_str': ' 2017',
   'sequence': 'Due to circumstances, he is forced to abandon his studies, though he comes first in his school in the 2017 exam.'}],
 [{'score': 9.86386658041738e-05,
   'token': 80,
   'token_str': ' t

In [25]:
for j in range(len(sents)):
    sents[j] = sents[j].replace(mask, results[j][0]["token_str"][1:])



In [19]:
sentences = data.text.values
sentences = [str(sentence) + " [SEP] [CLS]" for sentence in sentences]
labels = data.label.values

In [20]:
sentences[0]

'En route , they pick up a seemingly-harmless hitchhiker , and continue their journey , only for their car to break down in a deserted motel on a lonely highway . [SEP] [CLS]'

In [21]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['En', 'Ġroute', 'Ġ,', 'Ġthey', 'Ġpick', 'Ġup', 'Ġa', 'Ġseemingly', '-', 'harm', 'less', 'Ġhitch', 'h', 'iker', 'Ġ,', 'Ġand', 'Ġcontinue', 'Ġtheir', 'Ġjourney', 'Ġ,', 'Ġonly', 'Ġfor', 'Ġtheir', 'Ġcar', 'Ġto', 'Ġbreak', 'Ġdown', 'Ġin', 'Ġa', 'Ġdeserted', 'Ġmotel', 'Ġon', 'Ġa', 'Ġlonely', 'Ġhighway', 'Ġ.', 'Ġ[', 'SE', 'P', ']', 'Ġ[', 'CL', 'S', ']']


In [22]:
MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


In [23]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [24]:
PATH = "./checkpoints_new/tokenizer/"
tokenizer.save_pretrained(PATH)

('./checkpoints_new/tokenizer/tokenizer_config.json',
 './checkpoints_new/tokenizer/special_tokens_map.json',
 './checkpoints_new/tokenizer/vocab.json',
 './checkpoints_new/tokenizer/merges.txt',
 './checkpoints_new/tokenizer/added_tokens.json')

In [25]:
#Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=322, test_size=0.2)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=322, test_size=0.2)

In [26]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [27]:
batch_size = 32

# Create an iterator of our data with torch DataLoader. 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [28]:
TOKEN_PATH = "./checkpoints_new/tokenizer/"
BEST_MODEL_PATH = ".\checkpoints_new\epoch0_valacc_92.09080647184247_roberta"

In [29]:
tokenizer = RobertaTokenizer.from_pretrained(TOKEN_PATH, local_files_only=True)
model = RobertaForSequenceClassification.from_pretrained(BEST_MODEL_PATH, local_files_only=True)
model.cuda()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [30]:
model.eval()

with torch.no_grad():
    correct = 0
    total = 0
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        prediction = torch.argmax(outputs[0],dim=1)
        total += b_labels.size(0)
        correct += (prediction==b_labels).sum().item()

print("Validation Accuracy: {}".format(100 * correct / total))

Validation Accuracy: 56.1369622475856
