<a href="https://colab.research.google.com/github/omier/NLP-final-project/blob/master/NLP_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Init Notebook

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!git clone https://github.com/omier/NLP-final-project.git

fatal: destination path 'NLP-final-project' already exists and is not an empty directory.


In [3]:
!pip install -qq transformers

In [4]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import plotly.express as px
import pprint
from torchtext import data
from bs4 import BeautifulSoup
import re

import warnings
warnings.filterwarnings('ignore')

In [5]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f05223cc1b0>

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [7]:
!mkdir -p .vector_cache/ && cp drive/MyDrive/glove.6B.zip .vector_cache/glove.6B.zip

# Helpers

In [8]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler):
  losses = []
  correct_predictions = 0
  
  for features, labels in data_loader:
    features = features.to(device)
    labels = labels.to(device)

    outputs = torch.squeeze(model(features))

    # for accuracy calculation
    predicted_labels = (outputs > 0.5).float() * 1
    correct_predictions += torch.sum(predicted_labels == labels)

    # calculate and save loss
    loss = loss_fn(outputs, labels)
    losses.append(loss.item())

    # optimizer step
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    if scheduler:
      scheduler.step()
    optimizer.zero_grad()
  
  return {'accuracy': correct_predictions.item() / len(data_loader.dataset),
          'loss': np.mean(losses)}

In [9]:
def eval_model(model, loss_fn, sets=['validation']):
  with torch.no_grad():
    sets_metrics = dict()

    for set_name, dataloader in data_loaders.items():
      if set_name in sets:
        losses = []
        correct_predictions = 0
  
        for features, labels in dataloader:
          features = features.to(device)
          labels = labels.to(device)

          outputs = torch.squeeze(model(features))

          # for accuracy calculation
          predicted_labels = (outputs > 0.5).float() * 1
          correct_predictions += torch.sum(predicted_labels == labels)

          # calculate and save loss
          losses.append(loss_fn(outputs, labels).item())

        sets_metrics[set_name] = {'accuracy': correct_predictions.item() / len(dataloader.dataset),
                                  'loss': np.mean(losses)}

    return sets_metrics


In [10]:
def train(model, n_epochs, train_data_loader, loss_fn, optimizer, scheduler):
  history = []

  for epoch in range(n_epochs):
    print(f'Epoch {epoch + 1}/{n_epochs} Metrics')
    
    current_metrics = { 'train': 
      train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        scheduler
      ) 
    }
    current_metrics.update(eval_model(model, loss_fn))

    pprint.pprint(current_metrics, indent=4)
    print('-' * 10)

    history.append(current_metrics)

  return history

In [11]:
def plot(history):
  metrics_map = dict()

  for e_sets in history:
    for set_name, set_metrics in e_sets.items():
      for metric_name, metric_value in set_metrics.items():

        if metric_name not in metrics_map:
          metrics_map[metric_name] = dict()
        if set_name not in metrics_map[metric_name]:
          metrics_map[metric_name][set_name] = []

        metrics_map[metric_name][set_name].append(metric_value)
      
  for metric_name, sets in metrics_map.items():
    df = None
    for set_name, set_metrics in sets.items():
      size = len(set_metrics)
      if df is None:
        df = pd.DataFrame({"epoch": np.linspace(1, size, size),
                      metric_name: set_metrics,
                      "set": [set_name] * size})
      else:
        df = df.append(pd.DataFrame({"epoch": np.linspace(1, size, size),
                      metric_name: set_metrics,
                      "set": [set_name] * size}), ignore_index=True)

    fig = px.line(df, x="epoch", y=metric_name, line_group="set", title=f"epoch {metric_name} per dataset", color="set", hover_name="set")
    fig.show()

In [12]:
def multi_acc(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    

    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    return acc

def binary_accuracy(preds, y):
    rounded_preds = torch.round(preds)

    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc

# Import BERT

In [13]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

In [14]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [15]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME).to(device)

In [16]:
bert_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Load Data + features extraction using BERT

In [17]:
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import re

In [18]:
nltk.download('stopwords')

print(stopwords.words('english')[:15])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours']


In [19]:
df = pd.read_csv("NLP-final-project/Data/Combined_News_DJIA.csv")

In [20]:
df = df.drop(['Date'], axis=1)
tops = df.columns[1:]

df['headline'] = df[tops[:1]].apply(lambda row:'.'.join(row.values.astype(str)), axis=1)
df = df.drop(tops, axis=1)

df['label'] = df['Label']
df = df.drop(['Label'], axis=1)

# remove special characters
df = df.replace('b\"|b\'|\\\\|\\\"', '', regex=True)

df.head()

Unnamed: 0,headline,label
0,Georgia 'downs two Russian warplanes' as count...,0
1,Why wont America and Nato help us? If they won...,1
2,Remember that adorable 9-year-old who sang at ...,0
3,U.S. refuses Israel weapons to attack Iran: r...,0
4,All the experts admit that we should legalise ...,1


In [21]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"u s", "american", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = nltk.stem.SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text

In [22]:
def punctuation_stopwords_removal(headlines):
    # filters charecter-by-charecter
    remove_punctuation = [ch for ch in headlines if ch not in string.punctuation]
    # convert them back to sentences and split into words
    remove_punctuation = "".join(remove_punctuation).split()
    return remove_punctuation

    # # remove stop words
    # filtered_headlines = [word for word in remove_punctuation if word.lower() not in stopwords.words('english')]
    # return filtered_headlines

In [23]:
df.loc[:, 'headline'] = df['headline'].apply(clean_text)

In [24]:
df.head()

Unnamed: 0,headline,label
0,georgia would own two russian warplan countri ...,0
1,wont america nato help us wont help us now hel...,1
2,rememb ador 9 - year - old sang open ceremoni ...,0
3,american refus israel weapon attack iran : report,0
4,expert admit legalis drug,1


In [25]:
tokenized = df['headline'].apply((lambda x: tokenizer.encode(x)))
max_len = 0
for i in tokenized.values:
    if len(i)>max_len:
        max_len = len(i)
print(max_len)

61


In [26]:
len(df)

1989

In [27]:
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)

input_ids = torch.tensor(padded).to(device)
attention_mask = torch.tensor(attention_mask).to(device)
labels = torch.tensor(df['label'].to_numpy(), dtype=torch.float).to(device)

In [28]:
with torch.no_grad():
    features = bert_model(input_ids, attention_mask=attention_mask).pooler_output

In [29]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=RANDOM_SEED)
test_features, val_features, test_labels, val_labels = train_test_split(test_features, test_labels, test_size=0.5, random_state=RANDOM_SEED)

In [30]:
print(train_features.shape, train_labels.shape)

torch.Size([1591, 768]) torch.Size([1591])


In [31]:
class HeadlinesStocksDataset(Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.features)

  def __getitem__(self, item):
    return self.features[item], self.labels[item]

In [32]:
BATCH_SIZE = 8

def create_data_loader(features, labels, batch_size, shuffle=False):
  return DataLoader(
    HeadlinesStocksDataset(features, labels),
    batch_size=batch_size,
    shuffle=shuffle
  )

In [33]:
train_data_loader = create_data_loader(train_features, train_labels, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(val_features, val_labels, BATCH_SIZE)
test_data_loader = create_data_loader(test_features, test_labels, BATCH_SIZE)

In [34]:
data_loaders = {'train': train_data_loader, 'validation': val_data_loader, 'test': test_data_loader}

In [35]:
for set_name, loader in data_loaders.items():
  print(f'{set_name}: {len(loader.dataset)}')

train: 1591
validation: 199
test: 199


# Logistic Regression Classifier using BERT extracted features

In [36]:
class LogisticClassifier(nn.Module):
  def __init__(self):
    super(LogisticClassifier, self).__init__()
    self.classifier = nn.Sequential(nn.Linear(bert_model.config.hidden_size, 1),
                                    nn.Sigmoid())

  def forward(self, x):
    return self.classifier(x)  


In [37]:
model = LogisticClassifier().to(device)

In [38]:
EPOCHS = 10
optimizer = AdamW(model.parameters())
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.BCELoss().to(device)

In [39]:
%%time
history = train(model, EPOCHS, train_data_loader, loss_fn, optimizer, scheduler)

Epoch 1/10 Metrics
{   'train': {'accuracy': 0.5021998742928976, 'loss': 0.7093503759733996},
    'validation': {'accuracy': 0.4020100502512563, 'loss': 0.7200483560562134}}
----------
Epoch 2/10 Metrics
{   'train': {'accuracy': 0.4984286612193589, 'loss': 0.7139096119296011},
    'validation': {'accuracy': 0.5879396984924623, 'loss': 0.6837418627738953}}
----------
Epoch 3/10 Metrics
{   'train': {'accuracy': 0.5059710873664363, 'loss': 0.703237214579654},
    'validation': {'accuracy': 0.5829145728643216, 'loss': 0.6812760376930237}}
----------
Epoch 4/10 Metrics
{   'train': {'accuracy': 0.5135135135135135, 'loss': 0.700339201407217},
    'validation': {'accuracy': 0.5728643216080402, 'loss': 0.6893471479415894}}
----------
Epoch 5/10 Metrics
{   'train': {'accuracy': 0.4965430546825896, 'loss': 0.7001251662196826},
    'validation': {'accuracy': 0.5728643216080402, 'loss': 0.6846896266937256}}
----------
Epoch 6/10 Metrics
{   'train': {'accuracy': 0.5304839723444374, 'loss': 0.69

In [40]:
plot(history)

In [41]:
eval_model(model, loss_fn, sets=['test'])

{'test': {'accuracy': 0.4824120603015075, 'loss': 0.6910644412040711}}

# LSTM

In [42]:
df = pd.read_csv("NLP-final-project/Data/Combined_News_DJIA.csv")

In [43]:
PATH = "concat_headlines.csv"

df = df.drop(['Date'], axis=1)
tops = df.columns[1:]

df['headline'] = df[tops[:1]].apply(lambda row:'.'.join(row.values.astype(str)), axis=1)
df = df.drop(tops, axis=1)

df['label'] = df['Label']
df = df.drop(['Label'], axis=1)

# remove special characters
# df = df.replace('b\"|b\'|\\\\|\\\"', '', regex=True)
df = df.dropna()
df.to_csv(PATH)
df.head()

Unnamed: 0,headline,label
0,"b""Georgia 'downs two Russian warplanes' as cou...",0
1,b'Why wont America and Nato help us? If they w...,1
2,b'Remember that adorable 9-year-old who sang a...,0
3,b' U.S. refuses Israel weapons to attack Iran:...,0
4,b'All the experts admit that we should legalis...,1


In [44]:
TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)

In [45]:
fields = [(None, None), ('headline', TEXT), ('label', LABEL), ]

In [46]:
training_data = data.TabularDataset(path=PATH, format='csv', fields=fields, skip_header=True)

In [47]:
train_data, valid_data = training_data.split(split_ratio=0.7)

In [48]:
TEXT.build_vocab(train_data, min_freq=3, vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

In [49]:
# No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

# No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

# Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

# Word dictionary
print(TEXT.vocab.itos)
print(LABEL.vocab.itos)

Size of TEXT vocabulary: 1848
Size of LABEL vocabulary: 2
[('the', 916), ('to', 871), ('.', 802), (',', 766), ('of', 719), ('in', 604), ('a', 499), ("'", 475), ('"', 450), ('and', 443)]
['1', '0']


In [50]:
BATCH_SIZE = 16
# Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.headline),
    sort_within_batch=True,
    device = device)

In [51]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
    
        super().__init__()          
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
              
        self.classifier = nn.Sequential(nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim), nn.Sigmoid())


    def forward(self, text, text_lengths):
        
        # text = [batch size,sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [batch size, num layers * num directions,hid dim]
        # cell = [batch size, num layers * num directions,hid dim]
        
        # concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        # hidden = [batch size, hid dim * num directions]
        outputs = self.classifier(hidden)
        
        return outputs

In [52]:
# define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 256
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

In [53]:
# instantiate the model
model = LSTMModel(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout).to(device)

In [54]:
model

LSTMModel(
  (embedding): Embedding(1848, 100)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (classifier): Sequential(
    (0): Linear(in_features=512, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [55]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss().to(device)

In [56]:
def train_lstm(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        text, text_lengths = batch.headline
        text_lengths = text_lengths.to("cpu")
        
        predictions = model(text, text_lengths).squeeze()

        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [57]:
def evaluate_lstm(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
        for batch in iterator:
            # retrieve text and no. of words
            text, text_lengths = batch.headline
            text_lengths = text_lengths.to("cpu")
        
            predictions = model(text, text_lengths).squeeze()
                      
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [58]:
%%time
N_EPOCHS = 3
history = []

for epoch in range(N_EPOCHS):
    print(f'Epoch {epoch + 1}:')

    # train the model
    train_loss, train_acc = train_lstm(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate_lstm(model, valid_iterator, criterion)

    current_metrics = {'train':
                {  
                    'accuracy': train_acc,
                    'loss': train_loss
                },
                'validation':
                {
                    'accuracy': valid_acc,
                    'loss': valid_loss
                }
              }
    history.append(current_metrics)
    pprint.pprint(current_metrics, indent=4)
    print('-' * 10)

Epoch 1:
{   'train': {'accuracy': 0.5272988505747126, 'loss': 0.6946897630033821},
    'validation': {'accuracy': 0.506578947368421, 'loss': 0.6976632083717146}}
----------
Epoch 2:
{   'train': {'accuracy': 0.569683908045977, 'loss': 0.6802755580551323},
    'validation': {'accuracy': 0.511842105341585, 'loss': 0.7082373233217942}}
----------
Epoch 3:
{   'train': {'accuracy': 0.6767241379310345, 'loss': 0.6109539037463309},
    'validation': {'accuracy': 0.4937500000784272, 'loss': 0.7930271609833366}}
----------
CPU times: user 2.48 s, sys: 74.7 ms, total: 2.56 s
Wall time: 2.57 s


In [59]:
plot(history)

# LSTM transfer learning

In [60]:
df = pd.read_csv('drive/MyDrive/all-data.csv',delimiter=',',encoding='latin-1', header=None)
df = df.rename(columns={0:'sentiment',1:'message'})

sentiment  = {'positive': 0,'neutral': 1,'negative': 2} 
df.sentiment = [sentiment[item] for item in df.sentiment]

df.head()

Unnamed: 0,sentiment,message
0,1,"According to Gran , the company has no plans t..."
1,1,Technopolis plans to develop in stages an area...
2,2,The international electronic industry company ...
3,0,With the new production plant the company woul...
4,0,According to the company 's updated strategy f...


In [61]:
FIN_PATH = "fin_dataset.csv"
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text

df['message'] = df['message'].apply(cleanText)
df.to_csv(FIN_PATH)
df.head()

Unnamed: 0,sentiment,message
0,1,"according to gran , the company has no plans t..."
1,1,technopolis plans to develop in stages an area...
2,2,the international electronic industry company ...
3,0,with the new production plant the company woul...
4,0,according to the company 's updated strategy f...


In [62]:
FIN_TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
FIN_LABEL = data.LabelField(dtype = torch.float,batch_first=True)

In [63]:
fin_training_data = data.TabularDataset(path=FIN_PATH, format='csv', fields=[(None, None), ('sentiment', FIN_LABEL), ('message', FIN_TEXT)], skip_header=True)

In [64]:
vars(fin_training_data.examples[0])

{'message': ['according',
  'to',
  'gran',
  ',',
  'the',
  'company',
  'has',
  'no',
  'plans',
  'to',
  'move',
  'all',
  'production',
  'to',
  'russia',
  ',',
  'although',
  'that',
  'is',
  'where',
  'the',
  'company',
  'is',
  'growing',
  '.'],
 'sentiment': '1'}

In [65]:
fin_train_data, fin_valid_data = fin_training_data.split(split_ratio=0.7)

In [66]:
FIN_TEXT.build_vocab(fin_train_data,min_freq=3,vectors = "glove.6B.100d")  
FIN_LABEL.build_vocab(fin_train_data)

In [67]:
#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(FIN_TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(FIN_LABEL.vocab))

Size of TEXT vocabulary: 2857
Size of LABEL vocabulary: 3


In [68]:
BATCH_SIZE = 16

# Load an iterator
fin_train_iterator, fin_valid_iterator = data.BucketIterator.splits(
    (fin_train_data, fin_valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.message),
    sort_within_batch=True,
    device = device)

In [69]:
class MyLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        super().__init__()          
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        directions = (2 if bidirectional else 1)
        lin_size = hidden_dim * directions
        
        self.classifier = nn.Sequential(nn.Linear(lin_size, int(lin_size / 2)),
                                        nn.ReLU(),
                                        nn.Linear(int(lin_size / 2), int(lin_size / 4)),
                                        nn.ReLU(),
                                        nn.Linear(int(lin_size / 4), output_dim)
                                        )


    def forward(self, text, text_lengths):
        # text = [batch size,sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [batch size, num layers * num directions,hid dim]
        # cell = [batch size, num layers * num directions,hid dim]
        
        # concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        # hidden = [batch size, hid dim * num directions]
        outputs = self.classifier(hidden)
        
        return outputs


    def extract_sentiment(self, packed_embedded):
      packed_output, (hidden, cell) = self.lstm(packed_embedded)
      hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)

      return self.classifier(hidden)

In [70]:
# define hyperparameters
size_of_vocab = len(FIN_TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 256
num_output_nodes = 3
num_layers = 2
bidirection = True
dropout = 0.3

In [71]:
# instantiate the model
fin_model = MyLSTM(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout).to(device)

In [72]:
fin_model

MyLSTM(
  (embedding): Embedding(2857, 100)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (classifier): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=3, bias=True)
  )
)

In [73]:
N_EPOCHS = 15
optimizer = optim.Adam(fin_model.parameters(), weight_decay=0.001)
total_steps = len(fin_train_iterator) * N_EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
criterion = nn.CrossEntropyLoss().to(device)

In [74]:
def train_lstm_transfer(model, iterator, optimizer, scheduler, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        text, text_lengths = batch.message
        text_lengths = text_lengths.to("cpu")
        
        predictions = model(text, text_lengths)

        loss = criterion(predictions, batch.sentiment.long())        
        acc = multi_acc(predictions, batch.sentiment)   
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
   
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [75]:
def evaluate_lstm_transfer(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
        for batch in iterator:
            #retrieve text and no. of words
            text, text_lengths = batch.message
            text_lengths = text_lengths.to("cpu")
        
            predictions = model(text, text_lengths)
                      
            # compute loss and accuracy
            loss = criterion(predictions, batch.sentiment.long())
            acc = multi_acc(predictions, batch.sentiment)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [76]:
%%time
history = []
for epoch in range(N_EPOCHS):
    print(f'Epoch {epoch + 1}:')

    # train the model
    train_loss, train_acc = train_lstm_transfer(fin_model, fin_train_iterator, optimizer, scheduler, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate_lstm_transfer(fin_model, fin_valid_iterator, criterion)
    
    current_metrics = {'train':
                {  
                    'accuracy': train_acc,
                    'loss': train_loss
                },
                'validation':
                {
                    'accuracy': valid_acc,
                    'loss': valid_loss
                }
              }
    history.append(current_metrics)
    pprint.pprint(current_metrics, indent=4)
    print('-' * 10)

Epoch 1:
{   'train': {'accuracy': 0.6229363207547169, 'loss': 0.8705539072178444},
    'validation': {'accuracy': 0.6244113032634442, 'loss': 0.8643524646759033}}
----------
Epoch 2:
{   'train': {'accuracy': 0.6518278301886793, 'loss': 0.8135104776834542},
    'validation': {'accuracy': 0.6319662483183892, 'loss': 0.8239373613844861}}
----------
Epoch 3:
{   'train': {'accuracy': 0.6668632075471698, 'loss': 0.7692473994673423},
    'validation': {'accuracy': 0.6500196234210507, 'loss': 0.8037130420024579}}
----------
Epoch 4:
{   'train': {'accuracy': 0.6818985849056604, 'loss': 0.7423848211765289},
    'validation': {'accuracy': 0.6534536893551166, 'loss': 0.8078521712795719}}
----------
Epoch 5:
{   'train': {'accuracy': 0.6889740566037735, 'loss': 0.7157601764039049},
    'validation': {'accuracy': 0.6534536893551166, 'loss': 0.8003024770008339}}
----------
Epoch 6:
{   'train': {'accuracy': 0.7113797169811321, 'loss': 0.6779468797920447},
    'validation': {'accuracy': 0.64167974

In [77]:
plot(history)

Learn our original data using this pre-trained model

In [78]:
def train_transfered(model, iterator, optimizer, scheduler, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        text, text_lengths = batch.headline
        text_lengths = text_lengths.to("cpu")
        
        predictions = model(text, text_lengths).squeeze()

        loss = criterion(predictions, batch.label)        
        acc = binary_accuracy(predictions, batch.label)   
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [79]:
def evaluate_transfered(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
        for batch in iterator:
            #retrieve text and no. of words
            text, text_lengths = batch.headline
            text_lengths = text_lengths.to("cpu")
        
            predictions = model(text, text_lengths).squeeze()
                      
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [80]:
class TransferLearningFromSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, trained_model, fine_tunning=False):
        super().__init__()          

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # trained model
        self.trained_model = trained_model

        for param in self.trained_model.parameters():
            param.requires_grad = fine_tunning
        
        self.classifier = nn.Sequential(nn.Linear(3, 1), nn.Sigmoid())


    def forward(self, text, text_lengths):
        # text = [batch size,sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)

        # use trained model
        hidden = self.trained_model.extract_sentiment(packed_embedded)
        
        out = self.classifier(hidden)

        return out


In [81]:
tl_net = TransferLearningFromSentiment(len(TEXT.vocab), embedding_dim, fin_model, fine_tunning=True).to(device)

In [82]:
tl_net

TransferLearningFromSentiment(
  (embedding): Embedding(1848, 100)
  (trained_model): MyLSTM(
    (embedding): Embedding(2857, 100)
    (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
    (classifier): Sequential(
      (0): Linear(in_features=512, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=128, bias=True)
      (3): ReLU()
      (4): Linear(in_features=128, out_features=3, bias=True)
    )
  )
  (classifier): Sequential(
    (0): Linear(in_features=3, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [83]:
N_EPOCHS = 3
optimizer = optim.Adam(tl_net.parameters())
total_steps = len(train_iterator) * N_EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
criterion = nn.BCELoss().to(device)

In [84]:
%%time
history = []
for epoch in range(N_EPOCHS):
    print(f'Epoch {epoch + 1}:')
    
    # train the model
    train_loss, train_acc = train_transfered(tl_net, train_iterator, optimizer, scheduler, criterion)

    # evaluate the model
    valid_loss, valid_acc = evaluate_transfered(tl_net, valid_iterator, criterion)

    current_metrics = {'train':
                {  
                    'accuracy': train_acc,
                    'loss': train_loss
                },
                'validation':
                {
                    'accuracy': valid_acc,
                    'loss': valid_loss
                }
              }
    history.append(current_metrics)
    pprint.pprint(current_metrics, indent=4)
    print('-' * 10)

Epoch 1:
{   'train': {'accuracy': 0.5043103448275862, 'loss': 0.7223928447427421},
    'validation': {'accuracy': 0.511842105341585, 'loss': 0.6929069616292652}}
----------
Epoch 2:
{   'train': {'accuracy': 0.5517241379310345, 'loss': 0.6849125822385153},
    'validation': {'accuracy': 0.5032894736842105, 'loss': 0.7097487073195609}}
----------
Epoch 3:
{   'train': {'accuracy': 0.5790229885057471, 'loss': 0.6763920215354569},
    'validation': {'accuracy': 0.5049342105263158, 'loss': 0.7001572690512005}}
----------
CPU times: user 3.09 s, sys: 37.1 ms, total: 3.13 s
Wall time: 3.14 s


In [85]:
plot(history)

# N-gram document terms matrix

In [86]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [87]:
df = pd.read_csv("NLP-final-project/Data/Combined_News_DJIA.csv")
df.head(1)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",b'Georgian troops retreat from S. Osettain cap...,b'Did the U.S. Prep Georgia for War with Russia?',b'Rice Gives Green Light for Israel to Attack ...,b'Announcing:Class Action Lawsuit on Behalf of...,"b""So---Russia and Georgia are at war and the N...","b""China tells Bush to stay out of other countr...",b'Did World War III start today?',b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""


In [88]:
df_train = df[df['Date'] < '20150101']
df_test = df[df['Date'] > '20141231']
df_test = df_test.reset_index()

In [89]:
# Removing punctuations
tops = df_train.iloc[:,2:27]
tops.replace(to_replace="[^a-zA-Z]", value=" ", regex=True, inplace=True)

tops.columns = [str(i) for i in range(25)]
tops.head(5)

# Converting headlines to lower case
for column in tops.columns:
    tops[column] = tops[column].str.lower()
tops.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,b georgia downs two russian warplanes as cou...,b breaking musharraf to be impeached,b russia today columns of troops roll into so...,b russian tanks are moving towards the capital...,b afghan children raped with impunity u n ...,b russian tanks have entered south ossetia...,b breaking georgia invades south ossetia rus...,b the enemy combatent trials are nothing but...,b georgian troops retreat from s osettain cap...,b did the u s prep georgia for war with russia,b rice gives green light for israel to attack ...,b announcing class action lawsuit on behalf of...,b so russia and georgia are at war and the n...,b china tells bush to stay out of other countr...,b did world war iii start today,b georgia invades south ossetia if russia ge...,b al qaeda faces islamist backlash,b condoleezza rice the us would not act to p...,b this is a busy day the european union has ...,b georgia will withdraw soldiers from ir...,b why the pentagon thinks attacking iran is a ...,b caucasus in crisis georgia invades south os...,b indian shoe manufactory and again in a se...,b visitors suffering from mental illnesses ban...,b no help for mexico s kidnapping surge


In [90]:
headlines = []
for row in range(0, len(tops.index)):
    headlines.append(' '.join(str(x) for x in tops.iloc[row, 0:25]))

testheadlines = []
for row in range(0, len(df_test.index)):
    testheadlines.append(' '.join(str(x) for x in df_test.iloc[row, 2:27]))

In [91]:
basicvectorizer3 = CountVectorizer(ngram_range=(2,3))
train_trigrams_document_terms_matrix = basicvectorizer3.fit_transform(headlines)
test_trigrams_document_terms_matrix = basicvectorizer3.transform(testheadlines)

In [92]:
def get_sparse_matrix(document_terms_matrix):
    coo = document_terms_matrix.tocoo()
    row = torch.from_numpy(coo.row.astype(np.int64)).to(torch.long)
    col = torch.from_numpy(coo.col.astype(np.int64)).to(torch.long)
    edge_index = torch.stack([row, col], dim=0)
    val = torch.from_numpy(coo.data.astype(np.int64)).to(torch.int64)
    sprase_matrix = torch.sparse.FloatTensor(edge_index, val, torch.Size(coo.shape))

    return sprase_matrix

In [93]:
train_sprase_matrix = get_sparse_matrix(train_trigrams_document_terms_matrix)
test_sprase_matrix = get_sparse_matrix(test_trigrams_document_terms_matrix)

In [94]:
class SparseDataset(torch.utils.data.Dataset):
    def __init__(self, sprase_matrix, labels):
        self.sprase_matrix = sprase_matrix
        self.labels = labels

   
    def __len__(self):
        return len(self.sprase_matrix)

    def __getitem__(self, item):
        return self.sprase_matrix[item].to_dense().float(), torch.Tensor([self.labels[item]])

In [95]:
train_ds = SparseDataset(train_sprase_matrix, df_train["Label"])
test_ds = SparseDataset(test_sprase_matrix, df_test["Label"])

In [96]:
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=16, shuffle=False)

In [97]:
terms_size = train_sprase_matrix.size(1)

In [98]:
class MyLogisticRegression(nn.Module):
    def __init__(self, terms_size):
      super().__init__()

      self.classifier = nn.Sequential(nn.Linear(terms_size, 1), nn.Sigmoid())

    def forward(self, document_terms_dense):
      return self.classifier(document_terms_dense)

In [99]:
logistic_reg_model = MyLogisticRegression(terms_size).to(device)

In [100]:
logistic_reg_model

MyLogisticRegression(
  (classifier): Sequential(
    (0): Linear(in_features=1095042, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [101]:
loss_function = nn.BCELoss().to(device)
optimizer = optim.Adam(logistic_reg_model.parameters(), weight_decay=0.001)

In [102]:
def train_ngram(model, trainloader, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    for sprase_vectors, labels in trainloader:
        sprase_vectors = sprase_vectors.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()   
        predictions = model(sprase_vectors)

        loss = criterion(predictions, labels)   
        acc = binary_accuracy(predictions, labels)   
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(trainloader), epoch_acc / len(trainloader)

In [103]:
def evaluate_ngram(model, loader, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for sprase_vectors, labels in loader:
            sprase_vectors = sprase_vectors.to(device)
            labels = labels.to(device)
        
            predictions = model(sprase_vectors)
                      
            # compute loss and accuracy
            loss = criterion(predictions, labels)   
            acc = binary_accuracy(predictions, labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader)

In [104]:
%%time
N_EPOCHS = 3
history = []
for epoch in range(N_EPOCHS):
    print(f'Epoch {epoch + 1}:')

    # train the model
    train_loss, train_acc = train_ngram(logistic_reg_model, train_loader, optimizer, loss_function)

    # evaluate the model
    test_loss, test_acc = evaluate_ngram(logistic_reg_model, test_loader, loss_function)
 
    current_metrics = {'train':
                {  
                    'accuracy': train_acc,
                    'loss': train_loss
                },
                'test':
                {
                    'accuracy': test_acc,
                    'loss': test_loss
                }
              }
    history.append(current_metrics)
    pprint.pprint(current_metrics, indent=4)
    print('-' * 10)

Epoch 1:
{   'test': {'accuracy': 0.8390624995032946, 'loss': 0.24868011356253797},
    'train': {'accuracy': 0.5211385837477497, 'loss': 0.6950695102031414}}
----------
Epoch 2:
{   'test': {'accuracy': 0.8421875002483526, 'loss': 0.2437740716462334},
    'train': {'accuracy': 1.0, 'loss': 0.01105673591662039}}
----------
Epoch 3:
{   'test': {'accuracy': 0.8421875002483526, 'loss': 0.24347264125632742},
    'train': {'accuracy': 1.0, 'loss': 0.009305100466132674}}
----------
CPU times: user 56.9 s, sys: 1.31 s, total: 58.2 s
Wall time: 58.2 s


In [105]:
plot(history)

# N-Gram language model per label classification

In [106]:
# Create a placeholder for model
lang_models = { 
    0: defaultdict(lambda: defaultdict(lambda: 0)),
    1: defaultdict(lambda: defaultdict(lambda: 0))
}

# Count frequency of co-occurance 
for headlines_concat, label in zip(headlines, df_train["Label"]):
    for headline in nltk.sent_tokenize(headlines_concat.strip()):
        for w1, w2, w3 in trigrams(nltk.word_tokenize(headline), pad_right=True, pad_left=True):
            lang_models[label][(w1, w2)][w3] += 1
      
# Transform the counts to probabilities
for label, model in lang_models.items():
    for w1_w2 in model:
        total_count = float(sum(model[w1_w2].values()))
        for w3 in model[w1_w2]:
            model[w1_w2][w3] /= total_count

In [107]:
success = 0
count = 0
for headlines_concat, label in zip(testheadlines, df_test["Label"]):
    for headline in nltk.sent_tokenize(headlines_concat.strip()):
        labels_count = Counter()
        for w1, w2, w3 in trigrams(nltk.word_tokenize(headline), pad_right=True, pad_left=True):
            pred_label = -1
            max_prob = -1
            for lang_label, model in lang_models.items():
                prob = lang_models[lang_label][(w1, w2)][w3]
                if prob > max_prob:
                    max_prob = prob
                    pred_label = lang_label
            
            labels_count[pred_label] += 1
        
        pred_label, _ = labels_count.most_common(1)[0]
        count += 1
        if label == pred_label:
            success += 1

print(f"test accuracy: {success/count*100:.2f}%")

test accuracy: 52.57%
