<a href="https://colab.research.google.com/github/omier/NLP-final-project/blob/master/NLP_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Init Notebook

In [1]:
!git clone https://github.com/omier/NLP-final-project.git

fatal: destination path 'NLP-final-project' already exists and is not an empty directory.


In [2]:
!pip install -qq transformers

In [3]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import plotly.express as px
import pprint

import warnings
warnings.filterwarnings('ignore')

In [4]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f87aeb13828>

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# Import BERT

In [6]:
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'

In [7]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [8]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'd

In [9]:
bert_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Load Data + features extraction using BERT

In [69]:
df = pd.read_csv("NLP-final-project/Data/Combined_News_DJIA.csv")

In [70]:
df = df.drop(['Date'], axis=1)
tops = df.columns[1:]

df['headline'] = df[tops[:1]].apply(lambda row:'.'.join(row.values.astype(str)), axis=1)
df = df.drop(tops, axis=1)

df['label'] = df['Label']
df = df.drop(['Label'], axis=1)

# remove special characters
df = df.replace('b\"|b\'|\\\\|\\\"', '', regex=True)

df.head()

Unnamed: 0,headline,label
0,Georgia 'downs two Russian warplanes' as count...,0
1,Why wont America and Nato help us? If they won...,1
2,Remember that adorable 9-year-old who sang at ...,0
3,U.S. refuses Israel weapons to attack Iran: r...,0
4,All the experts admit that we should legalise ...,1


In [71]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [72]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

print(stopwords.words('english')[:15])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours']


In [14]:
def punctuation_stopwords_removal(headlines):
    # filters charecter-by-charecter
    remove_punctuation = [ch for ch in headlines if ch not in string.punctuation]
    # convert them back to sentences and split into words
    remove_punctuation = "".join(remove_punctuation).split()
    # remove stop words
    filtered_headlines = [word for word in remove_punctuation if word.lower() not in stopwords.words('english')]
    return filtered_headlines

In [73]:
df.loc[:, 'headline'] = df['headline'].apply(punctuation_stopwords_removal)

In [74]:
df.head()

Unnamed: 0,headline,label
0,"[Georgia, downs, two, Russian, warplanes, coun...",0
1,"[wont, America, Nato, help, us, wont, help, us...",1
2,"[Remember, adorable, 9yearold, sang, opening, ...",0
3,"[US, refuses, Israel, weapons, attack, Iran, r...",0
4,"[experts, admit, legalise, drugs]",1


In [17]:
tokenized = df['headline'].apply((lambda x: tokenizer.encode(x, add_special_token=True)))
max_len = 0
for i in tokenized.values:
    if len(i)>max_len:
        max_len = len(i)
print(max_len)

38


In [18]:
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)

input_ids = torch.tensor(padded).to(device)
attention_mask = torch.tensor(attention_mask).to(device)
labels = torch.tensor(df['label'].to_numpy(), dtype=torch.float).to(device)

In [19]:
with torch.no_grad():
    features = bert_model(input_ids, attention_mask=attention_mask).pooler_output

In [20]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=RANDOM_SEED)
test_features, val_features, test_labels, val_labels = train_test_split(test_features, test_labels, test_size=0.5, random_state=RANDOM_SEED)

In [21]:
class HeadlinesStocksDataset(Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.features)

  def __getitem__(self, item):
    return self.features[item], self.labels[item]

In [22]:
BATCH_SIZE = 8

def create_data_loader(features, labels, batch_size, shuffle=False):
  return DataLoader(
    HeadlinesStocksDataset(features, labels),
    batch_size=batch_size,
    shuffle=shuffle
  )

In [23]:
train_data_loader = create_data_loader(train_features, train_labels, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(val_features, val_labels, BATCH_SIZE)
test_data_loader = create_data_loader(test_features, test_labels, BATCH_SIZE)

In [24]:
data_loaders = {'train': train_data_loader, 'validation': val_data_loader, 'test': test_data_loader}

In [25]:
for set_name, loader in data_loaders.items():
  print(f'{set_name}: {len(loader.dataset)}')

train: 1591
validation: 199
test: 199


# Helpers

In [34]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler):
  losses = []
  correct_predictions = 0

  is_lstm = 'lstm' in type(model).__name__.lower()
  
  for features, labels in data_loader:
    features = features.to(device)
    labels = labels.to(device)

    if is_lstm:
      model.init_hidden(features.size(0))
    
    outputs = torch.squeeze(model(features))

    # for accuracy calculation
    predicted_labels = (outputs > 0.5).float() * 1
    correct_predictions += torch.sum(predicted_labels == labels)

    # calculate and save loss
    loss = loss_fn(outputs, labels)
    losses.append(loss.item())

    # optimizer step
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  
  return {'accuracy': correct_predictions.item() / len(data_loader.dataset),
          'loss': np.mean(losses)}

In [27]:
def eval_model(model, loss_fn, sets=['validation']):
  with torch.no_grad():
    sets_metrics = dict()

    for set_name, dataloader in data_loaders.items():
      if set_name in sets:
        losses = []
        correct_predictions = 0
  
        for data in dataloader:
          features, labels = data

          features = features.to(device)
          labels = labels.to(device)

          # call model
          outputs = torch.squeeze(model(features))

          # for accuracy calculation
          predicted_labels = (outputs > 0.5).float() * 1
          correct_predictions += torch.sum(predicted_labels == labels)

          # calculate and save loss
          losses.append(loss_fn(outputs, labels).item())

        sets_metrics[set_name] = {'accuracy': correct_predictions.item() / len(dataloader.dataset),
                                  'loss': np.mean(losses)}

    return sets_metrics


In [28]:
def train(model, n_epochs, train_data_loader, loss_fn, optimizer, scheduler):
  history = []

  for epoch in range(n_epochs):
    print(f'Epoch {epoch + 1}/{n_epochs} Metrics')
    
    current_metrics = { 'train': 
      train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        scheduler
      ) 
    }
    current_metrics.update(eval_model(model, loss_fn))

    pprint.pprint(current_metrics, indent=4)
    print('-' * 10)

    history.append(current_metrics)

  return history

In [29]:
def plot(history):
  metrics_map = dict()

  for e_sets in history:
    for set_name, set_metrics in e_sets.items():
      for metric_name, metric_value in set_metrics.items():

        if metric_name not in metrics_map:
          metrics_map[metric_name] = dict()
        if set_name not in metrics_map[metric_name]:
          metrics_map[metric_name][set_name] = []

        metrics_map[metric_name][set_name].append(metric_value)
      
  for metric_name, sets in metrics_map.items():
    df = None
    for set_name, set_metrics in sets.items():
      size = len(set_metrics)
      if df is None:
        df = pd.DataFrame({"epoch": np.linspace(1, size, size),
                      metric_name: set_metrics,
                      "set": [set_name] * size})
      else:
        df = df.append(pd.DataFrame({"epoch": np.linspace(1, size, size),
                      metric_name: set_metrics,
                      "set": [set_name] * size}), ignore_index=True)

    fig = px.line(df, x="epoch", y=metric_name, line_group="set", title=f"epoch {metric_name} per dataset", color="set", hover_name="set")
    fig.show()

# Logistic Regression Classifier

In [35]:
class LogisticClassifier(nn.Module):
  def __init__(self):
    super(LogisticClassifier, self).__init__()
    self.classifier = nn.Sequential(nn.Linear(bert_model.config.hidden_size, 1),
                                    nn.Sigmoid())

  def forward(self, x):
    return self.classifier(x)  


In [36]:
model = LogisticClassifier().to(device)

In [37]:
EPOCHS = 20
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.BCELoss().to(device)

In [38]:
%%time
history = train(model, EPOCHS, train_data_loader, loss_fn, optimizer, scheduler)

Epoch 1/20 Metrics
{   'train': {'accuracy': 0.5216844751728472, 'loss': 0.6927140028632466},
    'validation': {'accuracy': 0.5879396984924623, 'loss': 0.6828751826286316}}
----------
Epoch 2/20 Metrics
{   'train': {'accuracy': 0.5248271527341295, 'loss': 0.69177551754755},
    'validation': {'accuracy': 0.5829145728643216, 'loss': 0.6899937677383423}}
----------
Epoch 3/20 Metrics
{   'train': {'accuracy': 0.5298554368321811, 'loss': 0.691940714366472},
    'validation': {'accuracy': 0.5879396984924623, 'loss': 0.6827329325675965}}
----------
Epoch 4/20 Metrics
{   'train': {'accuracy': 0.5260842237586424, 'loss': 0.691779176493985},
    'validation': {'accuracy': 0.5879396984924623, 'loss': 0.6810651659965515}}
----------
Epoch 5/20 Metrics
{   'train': {'accuracy': 0.5260842237586424, 'loss': 0.6915786032700658},
    'validation': {'accuracy': 0.5879396984924623, 'loss': 0.6822347259521484}}
----------
Epoch 6/20 Metrics
{   'train': {'accuracy': 0.5260842237586424, 'loss': 0.6916

In [39]:
plot(history)

In [40]:
eval_model(model, loss_fn, sets=['test'])

{'test': {'accuracy': 0.5527638190954773, 'loss': 0.6882190203666687}}

# Naive Bayes

In [41]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

In [42]:
X_train = train_features.cpu().detach().numpy()
X_test = test_features.cpu().detach().numpy()
y_train = train_labels.cpu().detach().numpy()
y_test = test_labels.cpu().detach().numpy()

In [43]:
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 199 points : 96


# Build Corpus and convert words to ids dataframe

In [75]:
df.head()

Unnamed: 0,headline,label
0,"[Georgia, downs, two, Russian, warplanes, coun...",0
1,"[wont, America, Nato, help, us, wont, help, us...",1
2,"[Remember, adorable, 9yearold, sang, opening, ...",0
3,"[US, refuses, Israel, weapons, attack, Iran, r...",0
4,"[experts, admit, legalise, drugs]",1


In [76]:
corpus = []

for index, word_list in df["headline"].iteritems():
  corpus.extend(word_list)

vocab_to_int = {w:i+1 for i, w in enumerate(set(corpus))}

In [77]:
df.loc[:, 'headline'] = df['headline'].apply(lambda headline: [vocab_to_int[word] for word in headline] + [0]*(max_len - len(headline)))

In [79]:
df.head()

Unnamed: 0,headline,label
0,"[775, 1775, 4970, 3664, 6906, 7271, 1856, 8999...",0
1,"[8137, 1087, 8730, 1769, 140, 8137, 1769, 140,...",1
2,"[1469, 343, 525, 2707, 1837, 3481, 9160, 0, 0,...",0
3,"[2882, 4636, 763, 2614, 1878, 5376, 2537, 0, 0...",0
4,"[1823, 450, 5211, 5424, 0, 0, 0, 0, 0, 0, 0, 0...",1


In [80]:
headlines = [item for sublist in df['headline'].tolist() for item in sublist]
features = torch.LongTensor(headlines).reshape(len(df), -1).to(device)
labels = torch.FloatTensor(df['label'].to_numpy()).to(device)

In [81]:
print(features.shape, labels.shape)

torch.Size([1989, 38]) torch.Size([1989])


In [82]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=RANDOM_SEED)
test_features, val_features, test_labels, val_labels = train_test_split(test_features, test_labels, test_size=0.5, random_state=RANDOM_SEED)

In [83]:
train_data_loader = create_data_loader(train_features, train_labels, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(val_features, val_labels, BATCH_SIZE)
test_data_loader = create_data_loader(test_features, test_labels, BATCH_SIZE)

# LSTM

In [84]:
class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, directions=1, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.directions = directions
        self.bidirectional = (directions == 2)
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True, bidirectional=self.bidirectional)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim * directions, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embedded = self.embedding(x)
        lstm_out, self.hidden = self.lstm(embedded, self.hidden)
      
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output
        return sig_out

    def init_hidden(self, batch_size):
      self.hidden = (torch.zeros(self.n_layers * self.directions, batch_size, self.hidden_dim).to(device=device),
                     torch.zeros(self.n_layers * self.directions, batch_size, self.hidden_dim).to(device=device))

In [85]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

In [86]:
LSTM_model = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers).to(device)

In [87]:
LSTM_model

SentimentLSTM(
  (embedding): Embedding(9298, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [88]:
optimizer = AdamW(LSTM_model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.BCELoss().to(device)

In [None]:
%%time
LSTM_history = train(LSTM_model, EPOCHS, train_data_loader, loss_fn, optimizer, scheduler)

In [None]:
plot(LSTM_history)