In [789]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import string
import regex as re
import time
import torch.optim as optim
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [790]:
# Load the headline corpus and preview its first rows.
HEADLINES_CSV = "Headlines.csv"
df = pd.read_csv(HEADLINES_CSV)
df.head()

Unnamed: 0,label,headline,valid
0,sarcastic,dateline nbc report inspired by actual events,False
1,sarcastic,goldfish dying to be petted just once,False
2,nonsarcastic,scalia's utter moral failure exposed,False
3,nonsarcastic,video captures courthouse beating of inmate ac...,False
4,nonsarcastic,bernie sanders has a very lonely but very comm...,False


In [791]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   label     26709 non-null  object
 1   headline  26709 non-null  object
 2   valid     26709 non-null  bool  
dtypes: bool(1), object(2)
memory usage: 443.5+ KB


In [792]:
# DataFrame.dropna returns a new frame and leaves `df` untouched, so the result
# has to be assigned for the drop to reach the later cells that read `df`.
df = df.dropna(axis=0)
df

Unnamed: 0,label,headline,valid
0,sarcastic,dateline nbc report inspired by actual events,False
1,sarcastic,goldfish dying to be petted just once,False
2,nonsarcastic,scalia's utter moral failure exposed,False
3,nonsarcastic,video captures courthouse beating of inmate ac...,False
4,nonsarcastic,bernie sanders has a very lonely but very comm...,False
...,...,...,...
26704,nonsarcastic,"kim kardashian channels cruella de vil, plus m...",True
26705,sarcastic,students thankful standardized curriculum spar...,True
26706,sarcastic,mortgage market collapse threatens nation's ba...,True
26707,nonsarcastic,the outrageous dessert you can make in a slow ...,True


In [793]:
# Pull the two columns used downstream out of the frame as plain Python lists.
labels = [*df["label"]]
headlines = [*df["headline"]]

In [794]:
# Peek at the first ten raw headlines.
headlines[:10]

['dateline nbc report inspired by actual events',
 'goldfish dying to be petted just once',
 "scalia's utter moral failure exposed",
 'video captures courthouse beating of inmate accused of killing chicago child',
 'bernie sanders has a very lonely but very committed following on wall street',
 'yoga, mindfulness and weight management',
 'gwen stefani teases possible no doubt album',
 'an invitation to do something about the environment',
 "'mr. falafel' owner does not actually like being addressed as mr. falafel",
 'new york bomber sought an isis-inspired attack with failed device, investigators say']

In [795]:
# Sequential 80/10/10 split (no shuffling): the first 80% of the rows are used
# for training, the next 10% for validation, and the remainder for testing.
train_len = int(0.8*len(headlines))
val_len = int(0.1*len(headlines))
val_end = train_len + val_len

train_data, val_data, test_data = (
    headlines[:train_len], headlines[train_len:val_end], headlines[val_end:]
)
train_labels, val_labels, test_labels = (
    labels[:train_len], labels[train_len:val_end], labels[val_end:]
)

In [796]:
# Split sizes before the long headlines are filtered out.
print(*(len(split) for split in (train_data, val_data, test_data)))
# remove sentences longer than `max_words` words (15 by default)
def remove_long_sentences(data, labels, max_words=15):
    """Drop every sentence that has more than ``max_words`` words.

    Words are counted by splitting each sentence on whitespace.

    Args:
        data: Sequence of sentence strings.
        labels: Sequence aligned with ``data``; ``labels[i]`` is the label of
            ``data[i]``.
        max_words: Largest word count a sentence may have and still be kept.
            Defaults to 15, the limit that used to be hard-coded.

    Returns:
        A ``(sentences, labels)`` tuple of two new lists holding the kept
        sentences and their labels, in the original order.

    Raises:
        IndexError: If a kept sentence has no entry at the same position in
            ``labels``.
    """
    new_data = []
    new_labels = []
    for index, sentence in enumerate(data):
        if len(sentence.split()) <= max_words:
            new_data.append(sentence)
            new_labels.append(labels[index])

    return new_data, new_labels

# Drop over-long headlines from every split, keeping the labels aligned.
train_data, train_labels = remove_long_sentences(train_data, train_labels)
val_data, val_labels = remove_long_sentences(val_data, val_labels)
test_data, test_labels = remove_long_sentences(test_data, test_labels)

# Encode the string labels as class ids: 0 for sarcastic, 1 for non-sarcastic.
# NOTE: this rebinds the *_labels names to integer lists, so running the cell
# a second time without re-running the split raises KeyError on the ints.
label_mapping = {'sarcastic': 0, 'nonsarcastic': 1}
train_labels, val_labels, test_labels = (
    [label_mapping[name] for name in split_labels]
    for split_labels in (train_labels, val_labels, test_labels)
)

# Data and label counts must match line for line.
for splits in (
    (train_data, val_data, test_data),
    (train_labels, val_labels, test_labels),
):
    print(*map(len, splits))

21367 2670 2672
20513 2558 2565
20513 2558 2565


In [797]:
def clean_data(sent):
    """Normalise one sentence by applying a fixed chain of substitutions.

    Non-ASCII characters are dropped first; the regex substitutions below are
    then applied in order, each one working on the previous one's output.

    Args:
        sent: The sentence to clean.

    Returns:
        The cleaned sentence.
    """
    text = sent.encode('ascii', 'ignore').decode('ascii')

    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        # http:// and https:// URLs
        (r'https?:\/\/\S+', ''),
        # bare "<letters>.com" style addresses
        # NOTE(review): the "www." alternative only allows a single letter
        # before "com"; confirm that this is intended.
        (r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", ''),
        # hashtags
        ("#[A-Za-z0-9_]+", ''),
        # runs of digits and underscores
        ("[0-9_]+", ""),
        # characters outside the listed class, plus every '.' that still has
        # another '.' somewhere after it
        (r"[^\w\s\0-9.]+|\.(?=.*\.)", ""),
        # any remaining character that is neither a word character nor
        # whitespace becomes a space
        (r'[^\w\s]', " "),
        # collapse runs of spaces
        (' +', ' '),
        # trim leading and trailing whitespace
        (r"^\s+|\s+$", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [798]:
def pre_process(headlines, g_dict, labels):
    """Clean and tokenise headlines, and wrap each in sentence markers.

    Every headline is lower-cased, passed through ``clean_data`` and split
    with ``word_tokenize``. Tokens that are not keys of ``g_dict`` are replaced
    by ``"<unk>"``, and the token list is wrapped in ``"<sos>"`` / ``"<eos>"``.

    Args:
        headlines: Sequence of headline strings.
        g_dict: Mapping whose keys are the known words.
        labels: Sequence aligned with ``headlines``.

    Returns:
        A ``(tokenised_headlines, labels)`` tuple of two new lists; the second
        holds ``labels[i]`` for every headline ``i``.
    """
    tokenized_headlines = []
    kept_labels = []
    for idx, headline in enumerate(headlines):
        words = word_tokenize(clean_data(headline.lower()))
        # Unknown words all collapse onto the shared unknown-word token.
        words = [w if w in g_dict else "<unk>" for w in words]
        tokenized_headlines.append(['<sos>'] + words + ['<eos>'])
        kept_labels.append(labels[idx])
    return tokenized_headlines, kept_labels

In [799]:
def pre_process_for_cls(headlines, g_dict):
    """Clean and tokenise headlines without adding sentence markers.

    Every headline is lower-cased, passed through ``clean_data`` and split
    with ``word_tokenize``; tokens that are not keys of ``g_dict`` are replaced
    by ``"<unk>"``.

    Args:
        headlines: Iterable of headline strings.
        g_dict: Mapping whose keys are the known words.

    Returns:
        A list with one token list per headline.
    """
    tokenized_headlines = []
    for headline in headlines:
        words = word_tokenize(clean_data(headline.lower()))
        tokenized_headlines.append(
            [w if w in g_dict else "<unk>" for w in words]
        )
    return tokenized_headlines

In [800]:
# Word -> vector, word -> id and id -> word tables built from the GloVe file.
glove_dict = {}
wordtoint_glove = {}
inttoword_glove = {}
count = 0
# Without an explicit encoding, open() falls back to the platform default,
# which makes the cell fail or mis-decode words on systems where that default
# is not UTF-8. (Assumes the GloVe file is UTF-8 text - confirm for this file.)
with open("archive/glove.6B.50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Each line is a word followed by its vector components, all
        # separated by single spaces.
        glove_sent = line.split(' ')
        key = glove_sent[0]
        value = list(map(float, glove_sent[1:]))
        glove_dict[key] = value
        wordtoint_glove[key] = count
        inttoword_glove[count] = key
        count += 1

# Special tokens take the ids right after the GloVe words. Their vectors are
# constant: all 0s for <unk>, 1s for <sos>, 2s for <eos> and 3s for <pad>.
# `count` ends up holding the id of the last special token.
first_special_id = count
for offset, token in enumerate(("<unk>", "<sos>", "<eos>", "<pad>")):
    count = first_special_id + offset
    glove_dict[token] = np.ones(50) * offset
    wordtoint_glove[token] = count
    inttoword_glove[count] = token

In [801]:
print(*map(len, (train_data, val_data, test_data)))

# Tokenise every split the same way; the words known to GloVe decide which
# tokens are kept and which become "<unk>".
(
    (final_train_data, final_train_labels),
    (final_val_data, final_val_labels),
    (final_test_data, final_test_labels),
) = (
    pre_process(split_data, glove_dict, split_labels)
    for split_data, split_labels in (
        (train_data, train_labels),
        (val_data, val_labels),
        (test_data, test_labels),
    )
)

for splits in (
    (final_train_data, final_val_data, final_test_data),
    (final_train_labels, final_val_labels, final_test_labels),
):
    print(*map(len, splits))

20513 2558 2565
20513 2558 2565
20513 2558 2565


In [802]:
# Inspect the first five tokenised training sentences.
final_train_data[:5]

[['<sos>',
  'dateline',
  'nbc',
  'report',
  'inspired',
  'by',
  'actual',
  'events',
  '<eos>'],
 ['<sos>', 'goldfish', 'dying', 'to', 'be', 'petted', 'just', 'once', '<eos>'],
 ['<sos>', 'scalia', 's', 'utter', 'moral', 'failure', 'exposed', '<eos>'],
 ['<sos>',
  'video',
  'captures',
  'courthouse',
  'beating',
  'of',
  'inmate',
  'accused',
  'of',
  'killing',
  'chicago',
  'child',
  '<eos>'],
 ['<sos>',
  'bernie',
  'sanders',
  'has',
  'a',
  'very',
  'lonely',
  'but',
  'very',
  'committed',
  'following',
  'on',
  'wall',
  'street',
  '<eos>']]

In [803]:
# Flatten the training sentences into one token list and count each token.
list_vocab = []
for line in final_train_data:
    list_vocab.extend(line)

array_vocab = np.array(list_vocab)
vocab, frequency_words = np.unique(array_vocab, return_counts=True)

# Keep only tokens seen more than three times in the training data; ids are
# handed out in the order np.unique returns the tokens.
vocab_dict = {}
count = 0
for word, frequency in zip(vocab, frequency_words):
    if frequency > 3:
        vocab_dict[word] = count
        count += 1

# The Dataset class replaces every out-of-vocabulary token with "<unk>" and
# the sentences are later padded with "<pad>", so both tokens need an id.
# They are appended only when missing: unconditionally overwriting an entry
# that was already counted (e.g. "<pad>" when this cell is re-run after the
# padding cell) would assign an id equal to len(vocab_dict), which lies outside
# an embedding table sized with len(vocab_dict).
for special_token in ("<unk>", "<pad>"):
    if special_token not in vocab_dict:
        vocab_dict[special_token] = len(vocab_dict)

In [804]:
# padding
def padding(sentences, pad_token='<pad>'):
    """Right-pad every token list to the length of the longest one.

    ``sentences`` is updated in place: each entry that is shorter than the
    longest one is replaced by a new list with ``pad_token`` appended as often
    as needed. The same list object is also returned.

    Args:
        sentences: List of token lists.
        pad_token: Token used as filler. Defaults to ``'<pad>'``.

    Returns:
        ``sentences`` itself, with every entry of equal length. An empty list
        is returned unchanged.
    """
    max_len = max((len(sent) for sent in sentences), default=0)
    for i, sent in enumerate(sentences):
        missing = max_len - len(sent)
        if missing > 0:
            sentences[i] = sent + [pad_token] * missing
    return sentences

In [805]:
# Pad each split so that all sentences inside a split share one length
# (the length of that split's longest sentence).
final_train_data, final_val_data, final_test_data = (
    padding(split)
    for split in (final_train_data, final_val_data, final_test_data)
)

print(*map(len, (final_train_data, final_val_data, final_test_data)))
print(final_train_data[:5])




20513 2558 2565
[['<sos>', 'dateline', 'nbc', 'report', 'inspired', 'by', 'actual', 'events', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'goldfish', 'dying', 'to', 'be', 'petted', 'just', 'once', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'scalia', 's', 'utter', 'moral', 'failure', 'exposed', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'video', 'captures', 'courthouse', 'beating', 'of', 'inmate', 'accused', 'of', 'killing', 'chicago', 'child', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<sos>', 'bernie', 'sanders', 'has', 'a', 'very', 'lonely', 'but', 'very', 'committed', 'following', 'on', 'wall', 'street', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']]


In [806]:
class Dataset(torch.utils.data.Dataset):
    """Serves (token-id tensor, label tensor) pairs for tokenised sentences.

    Args:
        data: List of token lists.
        labels: Sequence aligned with ``data``.
        wordtoint: Mapping from token to integer id. It must contain
            ``"<unk>"`` whenever ``data`` holds a token it does not know.
        inttoword: Reverse mapping; it is stored but not used by this class.
    """

    def __init__(self, data, labels, wordtoint, inttoword):
        self.data = data
        self.labels = labels
        self.wordtoint = wordtoint
        self.inttoword = inttoword
        # Replace unknown tokens once, up front, so that every token looked up
        # in __getitem__ is one the constructor has already vetted.
        self.words_sentences = [
            [token if token in wordtoint else "<unk>" for token in sentence]
            for sentence in data
        ]

    def __len__(self):
        """Number of sentences."""
        return len(self.words_sentences)

    def __getitem__(self, index):
        """Return the id tensor and the label tensor of sentence ``index``."""
        token_ids = [self.wordtoint[token] for token in self.words_sentences[index]]
        return torch.tensor(token_ids), torch.tensor(self.labels[index])


In [807]:
batch_size = 128

# Reverse lookup for the vocabulary that actually encodes the sentences.
# (The GloVe id table uses different ids than vocab_dict, so it is not a valid
# inverse of the word -> id mapping handed to the datasets.)
inttoword_vocab = {index: word for word, index in vocab_dict.items()}

# Only the training data is shuffled. The evaluation loaders keep a fixed
# order so that their batch composition - and with it the batch-averaged
# loss - is the same on every run.
train_loader = torch.utils.data.DataLoader(
    Dataset(final_train_data, final_train_labels, vocab_dict, inttoword_vocab),
    batch_size=batch_size,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    Dataset(final_val_data, final_val_labels, vocab_dict, inttoword_vocab),
    batch_size=batch_size,
    shuffle=False,
)
test_loader = torch.utils.data.DataLoader(
    Dataset(final_test_data, final_test_labels, vocab_dict, inttoword_vocab),
    batch_size=batch_size,
    shuffle=False,
)

In [808]:
# Model shape.
# NOTE(review): embedding_size is not passed to the LSTM constructor call in
# this notebook (the model is built from hidden_size) - confirm whether a
# separate embedding width was intended.
hidden_size, embedding_size = 600, 600
num_layers, num_classes = 1, 2

# Optimisation.
learning_rate, num_epochs = 0.001, 10


In [809]:
class LSTM(nn.Module):
    """Embedding -> LSTM -> linear classifier over token-id sequences.

    Args:
        input_size: Number of rows of the embedding table (vocabulary size).
        hidden_size: Width of both the embeddings and the LSTM hidden state.
        output_size: Number of logits produced per sequence.
        num_layers: Number of stacked LSTM layers.
        padding_idx: Optional id of the padding token. When given, the
            classifier reads the LSTM output at the last non-padding position
            of each sequence instead of at the final timestep, and the
            embedding row of that id is kept fixed. Padding is assumed to
            appear only at the end of a sequence. The default ``None`` keeps
            the original behaviour (always read the final timestep).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, padding_idx=None):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.padding_idx = padding_idx

        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=padding_idx)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one row of ``output_size`` logits per input sequence.

        Args:
            x: Integer token ids, batch first (the LSTM is built with
                ``batch_first=True``), i.e. shape ``(batch, seq_len)``.
        """
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)
        if self.padding_idx is None:
            last = out[:, -1, :]
        else:
            # With right-padded input the final timestep of most rows is a pad
            # position; read the step of the last real token instead.
            lengths = (x != self.padding_idx).sum(dim=1)
            # Rows made only of padding fall back to the first timestep.
            last_index = (lengths - 1).clamp(min=0)
            rows = torch.arange(out.size(0), device=out.device)
            last = out[rows, last_index]
        return self.fc(last)


In [810]:
# Build the classifier on the selected device, together with its loss and
# optimiser.
model = LSTM(
    input_size=len(vocab_dict),
    hidden_size=hidden_size,
    output_size=num_classes,
    num_layers=num_layers,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [811]:
def train(model, train_loader, val_loader, num_epochs, optimizer, criterion, log_every=1):
    """Train ``model`` and evaluate it on ``val_loader`` after every epoch.

    Batches are moved to the device that holds the model's parameters (CPU for
    a model without parameters), so the function does not depend on a global
    ``device`` variable.

    Args:
        model: Module mapping a batch of inputs to ``(batch, classes)`` logits.
        train_loader: Sized iterable of ``(data, target)`` training batches.
        val_loader: Sized iterable of ``(data, target)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        optimizer: Optimiser that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, target)``.
        log_every: Print the running training metrics every ``log_every``
            batches. The default 1 prints after each batch; 0 or ``None``
            turns the per-batch lines off.

    Returns:
        ``(train_losses, val_losses, train_acc, val_acc)``: four lists with
        one entry per epoch. Losses are averaged over the batches of the
        epoch; accuracies are percentages.

    Raises:
        ZeroDivisionError: If either loader yields no batches.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses = []
    val_losses = []
    train_acc = []
    val_acc = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        total = 0
        correct = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(run_device)
            target = target.to(run_device)
            optimizer.zero_grad()
            output = model(data)
            # The logits are passed as they are: squeezing them would drop the
            # batch dimension of a single-sample batch, and the loss would then
            # reject the 1-D target.
            loss = criterion(output, target)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            total += target.size(0)
            _, predicted = torch.max(output.detach(), 1)
            correct += (predicted == target).sum().item()
            if log_every and batch_idx % log_every == 0:
                print("Epoch: {}, Batch: {}, Loss: {}, Accuracy: {}".format(epoch, batch_idx, loss.item(), (correct/total)*100))
        train_losses.append(total_loss/len(train_loader))
        train_acc.append((correct/total)*100)
        print("Epoch: {}, Training Loss: {}".format(epoch, total_loss/len(train_loader)))

        val_loss = 0
        val_total = 0
        val_correct = 0
        model.eval()
        with torch.no_grad():
            for data, target in val_loader:
                data = data.to(run_device)
                target = target.to(run_device)
                output = model(data)
                loss = criterion(output, target)
                val_loss += loss.item()
                val_total += target.size(0)
                _, predicted = torch.max(output, 1)
                val_correct += (predicted == target).sum().item()
        val_losses.append(val_loss/len(val_loader))
        val_acc.append((val_correct/val_total)*100)
        print("Epoch: {}, Validation Loss: {}, Validation Accuracy: {}".format(epoch, val_loss/len(val_loader), (val_correct/val_total)*100))
    return train_losses, val_losses, train_acc, val_acc

In [812]:
# Run the full training schedule and keep the per-epoch histories.
train_losses, val_losses, train_acc, val_acc = train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
    optimizer=optimizer,
    criterion=criterion,
)

Epoch: 0, Batch: 0, Loss: 0.6936159729957581, Accuracy: 42.96875
Epoch: 0, Batch: 1, Loss: 2.1822524070739746, Accuracy: 47.65625
Epoch: 0, Batch: 2, Loss: 1.0162992477416992, Accuracy: 51.041666666666664
Epoch: 0, Batch: 3, Loss: 0.6782487034797668, Accuracy: 53.515625
Epoch: 0, Batch: 4, Loss: 0.8161976933479309, Accuracy: 50.15625
Epoch: 0, Batch: 5, Loss: 0.753948450088501, Accuracy: 49.21875
Epoch: 0, Batch: 6, Loss: 0.7301640510559082, Accuracy: 48.10267857142857
Epoch: 0, Batch: 7, Loss: 0.7004614472389221, Accuracy: 47.265625
Epoch: 0, Batch: 8, Loss: 0.6883935928344727, Accuracy: 48.09027777777778
Epoch: 0, Batch: 9, Loss: 0.6889710426330566, Accuracy: 48.75
Epoch: 0, Batch: 10, Loss: 0.6490573883056641, Accuracy: 50.42613636363637
Epoch: 0, Batch: 11, Loss: 0.6752417087554932, Accuracy: 51.171875
Epoch: 0, Batch: 12, Loss: 0.7025417685508728, Accuracy: 51.38221153846154
Epoch: 0, Batch: 13, Loss: 0.7064882516860962, Accuracy: 51.5625
Epoch: 0, Batch: 14, Loss: 0.6894631385803

In [813]:
# Per-epoch histories: training loss, validation loss, training accuracy,
# validation accuracy.
for history in (train_losses, val_losses, train_acc, val_acc):
    print(history)

[0.6438525174715504, 0.3750426643383429, 0.23797084918673733, 0.13217204519958228, 0.0703992367901417, 0.0447580324202452, 0.02903494787668543, 0.019578035338781774, 0.014937046991215775, 0.014170160106750587]
[0.4588153839111328, 0.357867069542408, 0.3535968132317066, 0.399478055536747, 0.5004635065793991, 0.5388987690210343, 0.6251110181212425, 0.6567952513694764, 0.6640172496438026, 0.6945355981588364]
[62.58470238385414, 83.69326768390776, 90.75708087554234, 95.51991420075075, 97.8793935553064, 98.70813630380734, 99.12250767805781, 99.43937990542582, 99.52225418027592, 99.605128455126]
[78.81157154026583, 83.85457388584831, 85.69194683346365, 85.02736512900704, 84.71462079749804, 85.10555121188429, 85.3010164190774, 84.75371383893668, 85.26192337763878, 85.10555121188429]


In [814]:
# testing
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            _, predicted = torch.max(output.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    print("Accuracy: {}".format(correct/total))
    

In [815]:
# Report accuracy on the held-out test split.
test(model, test_loader)

Accuracy: 0.8444444444444444
