#Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import os

!pip install comet-ml &> /dev/null
import comet_ml

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report


!pip install sentence-transformers &> /dev/null
from sentence_transformers import SentenceTransformer, util
!pip install transformers &> /dev/null
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

!pip install pytorch-lightning &> /dev/null
import pytorch_lightning as pl
from pytorch_lightning.loggers import CometLogger
import tensorboard
!pip install torchviz &> /dev/null
from torchviz import make_dot
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

Mounted at /content/drive


comet_ml is installed but `COMET_API_KEY` is not set.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Dataset Loading

In [2]:
HEADLINE_COLUMN = "Headline"
BODY_COLUMN = "articleBody"
STANCE_COLUMN = "Stance"

categories = {"agree":0,"disagree":1,"discuss":2,"unrelated":3}

train_bodies_csv = pd.read_csv("/content/drive/My Drive/Uni Work/train_bodies.csv")
train_stances_csv = pd.read_csv("/content/drive/My Drive/Uni Work/train_stances.csv")
train_dataset = train_stances_csv.merge(train_bodies_csv, on='Body ID', how="inner")

train_headlines = train_dataset[HEADLINE_COLUMN].values
train_bodies = train_dataset[BODY_COLUMN].values
train_truth = np.array([categories[stance] for stance in train_dataset[STANCE_COLUMN].values])
train_related_truth = np.array([int(stance<3) for stance in train_truth]) # unrelated = 0, related = 1

test_bodies_csv = pd.read_csv("/content/drive/My Drive/Uni Work/competition_test_bodies.csv")
test_stances_csv = pd.read_csv("/content/drive/My Drive/Uni Work/competition_test_stances.csv")
test_dataset = test_stances_csv.merge(test_bodies_csv, on='Body ID', how="inner")

test_headlines = test_dataset[HEADLINE_COLUMN].values
test_bodies = test_dataset[BODY_COLUMN].values
test_truth = np.array([categories[stance] for stance in test_dataset[STANCE_COLUMN].values])
test_related_truth = np.array([int(stance<3) for stance in test_truth]) # unrelated = 0, related = 1

train_bodies_csv = train_stances_csv = test_bodies_csv = test_stances_csv = None # free memory

# Data Cleaning

In [3]:
stop = set(stopwords.words('english'))

# from practicals
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    
    return url.sub('', text)

# from practicals
def remove_html(text):
    html = re.compile(r'<.*?>')
    
    return html.sub('', text)

# from practicals
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)

def data_cleanup(data):
    x = list(map(remove_URL, data))
    x = list(map(remove_html, data))
    x = list(map(remove_emoji, data))
    x = np.array(x)
    return x

def remove_stop_words(string):
    split_words = string.split()
    return " ".join([word for word in split_words if word.lower() not in stop])

for index, (headline, body) in enumerate(zip(train_headlines, train_bodies)):
    train_headlines[index] = remove_stop_words(headline)
    train_bodies[index] = remove_stop_words(body)

for index, (test_headline, test_body) in enumerate(zip(test_headlines, test_bodies)):
    test_headlines[index] = remove_stop_words(test_headline)
    test_bodies[index] = remove_stop_words(test_body)

#train_headlines = data_cleanup(train_headlines)
#train_bodies = data_cleanup(train_bodies)
#test_headlines = data_cleanup(test_headlines)
#test_bodies = data_cleanup(test_bodies)

# Feature Extraction

## TF-IDF

In [4]:
# tf-idf feature extraction 
train_tfidf_data = np.concatenate((np.unique(train_headlines), np.unique(train_bodies)))
tfidf = TfidfVectorizer(decode_error='ignore', lowercase=True, min_df=2, stop_words=stop, max_features=10000) 
train_tfidf = tfidf.fit(train_tfidf_data.astype('U')) 
tfidf_bodies = tfidf.transform(train_bodies.astype('U'))
tfidf_headlines = tfidf.transform(train_headlines.astype('U'))

tfidf_test_bodies = tfidf.transform(test_bodies.astype('U'))
tfidf_test_headlines = tfidf.transform(test_headlines.astype('U'))

In [None]:
print(np.shape(tfidf_bodies))
print(np.shape(tfidf_headlines))
print(np.shape(tfidf.transform([train_bodies[0]])))

## S-BERT

In [4]:
sbert_encoder = SentenceTransformer("paraphrase-MiniLM-L6-v2")

sbert_bodies = sbert_encoder.encode(train_bodies, show_progress_bar=True)
sbert_headlines = sbert_encoder.encode(train_headlines, show_progress_bar=True)

#sbert_test_bodies = sbert_encoder.encode(test_bodies, show_progress_bar=True)
#sbert_test_headlines = sbert_encoder.encode(test_headlines, show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, max=83426730.0), HTML(value='')))




You try to use a model that was created with version 1.2.0, however, your version is 1.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





HBox(children=(FloatProgress(value=0.0, description='Batches', max=1562.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1562.0, style=ProgressStyle(description_wid…




In [None]:
print(np.shape(sbert_bodies))
print(np.shape(sbert_headlines))
print(np.shape(sbert_test_bodies))
print(np.shape(sbert_test_headlines))

#2ai) Machine Learning Method: Logistic Regression

##TF-IDF Embedding

In [6]:
# unrelated = 0, related = 1

tfidf_similarity_vectors = []
for headline, body in zip(tfidf_headlines, tfidf_bodies):
    tfidf_similarity_vectors.append(cosine_similarity(headline, body)[0])
tfidf_similarity_vectors = np.array(tfidf_similarity_vectors)
from time import time
start = time()
model = LogisticRegression().fit(tfidf_similarity_vectors, train_related_truth)
print(time() - start)
train_predictions = model.predict(tfidf_similarity_vectors)
print("********************** Train Metrics **********************")
print(classification_report(train_related_truth, train_predictions))
print("***********************************************************")
# test set
tfidf_similarity_vectors = []
for headline, body in zip(tfidf_test_headlines, tfidf_test_bodies):
    tfidf_similarity_vectors.append(cosine_similarity(headline, body)[0])
tfidf_similarity_vectors = np.array(tfidf_similarity_vectors)

test_predictions = model.predict(tfidf_similarity_vectors)
print("********************** Test  Metrics **********************")
print(classification_report(test_related_truth, test_predictions))
print("***********************************************************")

0.12362003326416016
********************** Train Metrics **********************
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     36545
           1       0.97      0.92      0.95     13427

    accuracy                           0.97     49972
   macro avg       0.97      0.96      0.96     49972
weighted avg       0.97      0.97      0.97     49972

***********************************************************
********************** Test  Metrics **********************
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     18349
           1       0.96      0.84      0.89      7064

    accuracy                           0.95     25413
   macro avg       0.95      0.91      0.93     25413
weighted avg       0.95      0.95      0.94     25413

***********************************************************


##SBERT Embedding

In [5]:
# unrelated = 0, related = 1
import pickle
sbert_similarity_vectors = []
for headline, body in zip(sbert_headlines, sbert_bodies):
    sbert_similarity_vectors.append(util.pytorch_cos_sim(headline, body))
sbert_similarity_vectors = np.array(sbert_similarity_vectors)

sbert_model = LogisticRegression().fit(sbert_similarity_vectors.reshape(-1,1), train_related_truth)


train_predictions = sbert_model.predict(sbert_similarity_vectors.reshape(-1,1))
print("********************** Train Metrics **********************")
print(classification_report(train_related_truth, train_predictions))
print("***********************************************************")
# test set
sbert_similarity_vectors = []
for headline, body in zip(sbert_test_headlines, sbert_test_bodies):
    sbert_similarity_vectors.append(util.pytorch_cos_sim(headline, body))
sbert_similarity_vectors = np.array(sbert_similarity_vectors)

test_predictions = sbert_model.predict(sbert_similarity_vectors.reshape(-1,1))
print("********************** Test  Metrics **********************")
print(classification_report(test_related_truth, test_predictions))
print("***********************************************************")


Mounted at /content/gdrive


'\ntrain_predictions = sbert_model.predict(sbert_similarity_vectors.reshape(-1,1))\nprint("********************** Train Metrics **********************")\nprint(classification_report(train_related_truth, train_predictions))\nprint("***********************************************************")\n# test set\nsbert_similarity_vectors = []\nfor headline, body in zip(sbert_test_headlines, sbert_test_bodies):\n    sbert_similarity_vectors.append(util.pytorch_cos_sim(headline, body))\nsbert_similarity_vectors = np.array(sbert_similarity_vectors)\n\ntest_predictions = sbert_model.predict(sbert_similarity_vectors.reshape(-1,1))\nprint("********************** Test  Metrics **********************")\nprint(classification_report(test_related_truth, test_predictions))\nprint("***********************************************************")\n'

#2aii) Deep Learning Method

##Dataset

In [9]:
from random import sample
indicies = list(range(len(train_headlines)))
train_indicies = sample(indicies,int(len(indicies)*0.8))
val_indicies = set(indicies) - set(train_indicies)
train_data_headlines = []
train_data_bodies = []
train_data_stances = []
val_data_headlines = []
val_data_bodies = []
val_data_stances = []

for i in train_indicies:
    train_data_headlines.append(train_headlines[i])
    train_data_bodies.append(train_bodies[i])
    train_data_stances.append(train_related_truth[i])

train_data_headlines = np.array(train_data_headlines)
train_data_bodies = np.array(train_data_bodies)
train_data_stances = np.array(train_data_stances)

for i in val_indicies:
    val_data_headlines.append(train_headlines[i])
    val_data_bodies.append(train_bodies[i])
    val_data_stances.append(train_related_truth[i])

val_data_headlines = np.array(val_data_headlines)
val_data_bodies = np.array(val_data_bodies)
val_data_stances = np.array(val_data_stances)
#train_headlines = train_bodies = None

class NewsDataset(Dataset):
    def __init__(self, headlines, bodies, stances):
        self.headlines = headlines
        self.bodies = bodies
        self.stances = stances

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        x = (self.headlines[idx], self.bodies[idx])
        y = self.stances[idx]
        return x, y

train_dataset = NewsDataset(train_data_headlines, train_data_bodies, train_data_stances)
val_dataset = NewsDataset(val_data_headlines, val_data_bodies, val_data_stances)
test_dataset = NewsDataset(test_headlines, test_bodies, test_related_truth)

##Unrelated/Related Classification Model

In [3]:
# used pytorch lightning to make the model neater, code layout from here: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#minimal-example
class UnrelatedClassifer(pl.LightningModule):
    def __init__(self, transformer=None, tfidf=None):
        super().__init__()
        self.gru = nn.GRU(input_size=768 if transformer else 20000,
                            hidden_size=512,
                            num_layers =2,
                            dropout=0.2,
                            batch_first=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(512, 1)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()
        self.transformer = transformer
        self.tfidf = tfidf       

    def forward(self, x):
        headline, body = x
        with torch.no_grad():
            if self.transformer:
                headline = self.transformer.encode(headline, convert_to_tensor=True)
                body = self.transformer.encode(body, convert_to_tensor=True)
                x = torch.cat((headline,body), dim=1)
                x = x.unsqueeze(dim=1)
            else:
                headline = self.tfidf.transform(headline).todense()
                body = self.tfidf.transform(body).todense()
                x = np.concatenate((headline,body), axis=1)
                x = torch.Tensor(x).to("cuda")
                x = x.unsqueeze(dim=1)
    
        x, _ = self.gru(x)
        x = self.relu(x)
        x = self.linear(x)
        x = self.sigmoid(x)
        return x.squeeze()

    def configure_optimizers(self):
        lr = 1e-3
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.log("lr",lr)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        y = y.float()
        x_hat = self.forward(x)
        loss = self.loss(x_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        y = y.float()
        x_hat = self.forward(x)
        print(x_hat)
        print(y)
        loss = self.loss(x_hat, y)
        self.log('val_loss', loss)

    def test_step(self, batch, idx):
        x, y = batch
        y = y.float()
        x_hat = self.forward(x)
        loss = self.loss(x_hat, y)
        self.log('test_loss', loss)
        accuracy = torch.sum(torch.round(x_hat) == y) / len(y)
        self.log('test_acc', accuracy)
        return loss

###Training w/ SBERT

In [None]:
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)

# model
transformer = SentenceTransformer("paraphrase-MiniLM-L6-v2")
model = UnrelatedClassifer(transformer=transformer)

#logging
comet_logger = CometLogger(
    api_key='5BHASx2Q9HBWhSzOPryQC92zu',
    workspace=os.environ.get('nikesh'),
    project_name='fake-news-detection',
    experiment_name='SBERT Unrelated DL'
)

# training
trainer = pl.Trainer(gpus=1, max_epochs=5, logger=comet_logger)
trainer.fit(model, train_loader, val_loader) 
trainer.test(model, test_loader)

You try to use a model that was created with version 1.2.0, however, your version is 1.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



CometLogger will be initialized in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/nikesh/fake-news-detection/dfa2fb1fcbef49bc8d6da82fb0c93455


  | Name        | Type                | Params
----------------------------------------------------
0 | gru         | GRU                 | 3.5 M 
1 | relu        | ReLU                | 0     
2 | linear      | Linear              | 513   
3 | sigmoid     | Sigmoid             | 0     
4 | loss        | BCELoss             | 0     
5 | transformer | SentenceTransformer | 22.7 M
----------------------------------------------------
26.3 M    Trainable params
0         Non-trainable params
26.3 M    Total params
10

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/nikesh/fake-news-detection/dfa2fb1fcbef49bc8d6da82fb0c93455
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [625]       : (6.593811576749431e-07, 1.6669036149978638)
COMET INFO:     train_loss [125] : (1.303862632084929e-06, 0.9222912192344666)
COMET INFO:     val_loss [5]     : (0.13477300107479095, 0.3404513895511627)
COMET INFO:   Others:
COMET INFO:     Name : SBERT Unrelated DL
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     model graph         : 1
COMET INFO:     notebook            : 1
COMET INFO:     os packages         : 1
COMET INFO:     source_code         : 1
COMET INFO: ---------------------------





COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)
COMET INFO: The Python SDK has 3600 seconds to finish before aborting...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/nikesh/fake-news-detection/dfa2fb1fcbef49bc8d6da82fb0c93455



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8705016374588013, 'test_loss': 0.4792836606502533}
--------------------------------------------------------------------------------


[{'test_acc': 0.8705016374588013, 'test_loss': 0.4792836606502533}]

###Training w/ TFIDF

In [None]:
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)

# model
train_tfidf_data = np.concatenate((np.unique(train_headlines), np.unique(train_bodies)))
tfidf = TfidfVectorizer(decode_error='ignore', lowercase=True, min_df=2, stop_words=stop, max_features=10000) 
train_tfidf = tfidf.fit(train_tfidf_data.astype('U')) 
model = UnrelatedClassifer(tfidf=train_tfidf)

#logging
comet_logger = CometLogger(
    api_key='5BHASx2Q9HBWhSzOPryQC92zu',
    workspace=os.environ.get('nikesh'),
    project_name='fake-news-detection',
    experiment_name='TFIDF Unrelated DL (E=10)'
)

# training
trainer = pl.Trainer(gpus=1, max_epochs=10, logger=comet_logger)
trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)

CometLogger will be initialized in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/nikesh/fake-news-detection/40155d2e2fd64a2bbbb206a0c894778a


  | Name    | Type    | Params
------------------------------------
0 | gru     | GRU     | 33.1 M
1 | relu    | ReLU    | 0     
2 | linear  | Linear  | 513   
3 | sigmoid | Sigmoid | 0     
4 | loss    | BCELoss | 0     
------------------------------------
33.1 M    Trainable params
0         Non-trainable params
33.1 M    Total params
132.344   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

tensor([0.4951, 0.4950, 0.4949, 0.4962, 0.4962, 0.4952, 0.4958, 0.4962, 0.4946,
        0.4961, 0.4959, 0.4949, 0.4945, 0.4959, 0.4944, 0.4946, 0.4954, 0.4962,
        0.4964, 0.4961, 0.4948, 0.4961, 0.4971, 0.4959, 0.4970, 0.4957, 0.4955,
        0.4959, 0.4961, 0.4965, 0.4962, 0.4958], device='cuda:0')
tensor([1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')
tensor([0.4960, 0.4953, 0.4957, 0.4963, 0.4958, 0.4963, 0.4954, 0.4960, 0.4955,
        0.4956, 0.4960, 0.4964, 0.4967, 0.4964, 0.4970, 0.4954, 0.4974, 0.4967,
        0.4951, 0.4967, 0.4961, 0.4976, 0.4969, 0.4973, 0.4963, 0.4971, 0.4958,
        0.4968, 0.4969, 0.4955, 0.4960, 0.4959], device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…


Detected KeyboardInterrupt, attempting graceful shutdown...

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.7245424389839172, 'test_loss': 0.5975430011749268}
--------------------------------------------------------------------------------


[{'test_acc': 0.7245424389839172, 'test_loss': 0.5975430011749268}]

#2b) Deep Learning for Agree/Disagree/Discuss

##Dataset

In [4]:
from random import sample
from random import random
from random import choice

indicies = []
truth_map = {0:[1.0,0.0,0.0],1:[0.0,1.0,0.0],2:[0.0,0.0,1.0]}

for i in range(len(train_truth)):
    if train_truth[i] < 3:
        indicies.append(i)
train_indicies = sample(indicies,int(len(indicies)*0.8))
val_indicies = set(indicies) - set(train_indicies)

train_data_headlines = []
train_data_bodies = []
train_data_stances = []
train_data_stance_location = {0:[], 1:[], 2:[]}

val_data_headlines = []
val_data_bodies = []
val_data_stances = []

for idx, i in enumerate(train_indicies):
    train_data_headlines.append(train_headlines[i])
    train_data_bodies.append(train_bodies[i])
    train_data_stances.append(train_truth[i])
    train_data_stance_location[train_truth[i]].append(idx)

train_data_headlines = np.array(train_data_headlines)
train_data_bodies = np.array(train_data_bodies)
train_data_stances = np.array(train_data_stances)

for i in val_indicies:
    val_data_headlines.append(train_headlines[i])
    val_data_bodies.append(train_bodies[i])
    val_data_stances.append(train_truth[i])

val_data_headlines = np.array(val_data_headlines)
val_data_bodies = np.array(val_data_bodies)
val_data_stances = np.array(val_data_stances)
#train_headlines = train_bodies = None


# test set
indicies = []
test_data_headlines = []
test_data_bodies = []
test_data_stances = []

for i in range(len(test_headlines)):
    if test_truth[i] < 3:
        indicies.append(i)

for i in indicies:
    test_data_headlines.append(test_headlines[i])
    test_data_bodies.append(test_bodies[i])
    test_data_stances.append(test_truth[i]) 

test_data_headlines = np.array(test_data_headlines)
test_data_bodies = np.array(test_data_bodies)
test_data_stances = np.array(test_data_stances)

class StanceDataset(Dataset):
    def __init__(self, headlines, bodies, stances, tokenizer, locations=None, loc=False):
        self.headlines = headlines
        self.bodies = bodies
        self.stances = stances
        self.tokenizer = tokenizer
        self.locations = locations
        self.loc = loc

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        if self.loc:
            r = random()
            if r < 0.1:
                idx = choice(self.locations[2])
            elif r < 0.3:
                idx = choice(self.locations[1])

        headline = self.headlines[idx]
        body = self.bodies[idx]
        headline_tokens = self.tokenizer.encode_plus(headline,
                                                    add_special_tokens=True,
                                                    max_length=512,
                                                    padding="max_length",
                                                    truncation=True,
                                                    return_token_type_ids=True,
                                                    return_tensors="pt")
        body_tokens = self.tokenizer.encode_plus(body,
                                                add_special_tokens=True,
                                                max_length=512,
                                                padding="max_length",
                                                truncation=True,
                                                return_token_type_ids=True,
                                                return_tensors="pt")

        headline_vals = (headline_tokens['input_ids'], headline_tokens['attention_mask'], headline_tokens["token_type_ids"])
        body_vals = (body_tokens['input_ids'], body_tokens['attention_mask'], body_tokens["token_type_ids"])
        x = (headline_vals, body_vals)
        y = self.stances[idx]
        return x, y

tokenizer = RobertaTokenizer.from_pretrained("distilroberta-base", truncation=True, do_lower_case=True)
train_dataset = StanceDataset(train_data_headlines, train_data_bodies, train_data_stances, tokenizer, locations=train_data_stance_location, loc=True)
val_dataset = StanceDataset(val_data_headlines, val_data_bodies, val_data_stances, tokenizer)
test_dataset = StanceDataset(test_data_headlines, test_data_bodies, test_data_stances, tokenizer)

print(len(train_data_headlines))
print(len(val_data_headlines))
print(len(test_data_headlines))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…


10741
2686
7064


##Agree/Discuss/Disagree Classification Model

###Model

In [7]:
# used pytorch lightning to make the model neater, code layout from here: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#minimal-example
class SentimentClassifer(pl.LightningModule):
    def __init__(self, transformer, train_dataset=None, learning_rate=1e-2):
        super().__init__()
        self.gru = nn.GRU(input_size=1536,
                            hidden_size=512,
                            num_layers=2,
                            dropout=0.2,
                            batch_first=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(512, 3)
        self.softmax = nn.Softmax(dim=2)
        self.loss = nn.CrossEntropyLoss() # change weights
        self.transformer = transformer
        self.learning_rate = learning_rate 
        self.train_dataset = train_dataset

    def forward(self, x):
        headline_vals, body_vals = x
        with torch.no_grad():
            headline_roberta = self.transformer(input_ids=headline_vals[0].squeeze(dim=1), 
                                                attention_mask=headline_vals[1].squeeze(dim=1), 
                                                token_type_ids=headline_vals[2].squeeze(dim=1))
            headline_roberta = headline_roberta[0][:,0]
            body_roberta = self.transformer(input_ids=body_vals[0].squeeze(dim=1), 
                                            attention_mask=body_vals[1].squeeze(dim=1), 
                                            token_type_ids=body_vals[2].squeeze(dim=1))
            body_roberta = body_roberta[0][:,0]
            x = torch.cat((headline_roberta,body_roberta), dim=1)
            headline_roberta = body_roberta = None
            x = x.unsqueeze(dim=1)

        x, _ = self.gru(x)
        x = self.relu(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x.squeeze()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        self.log("lr",self.learning_rate)
        return optimizer

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=128, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        x_hat = self.forward(x)
        loss = self.loss(x_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        with torch.no_grad():
            x_hat = self.forward(x)
            loss = self.loss(x_hat, y)
        self.log('val_loss', loss)

    def test_step(self, batch, idx):
        x, y = batch
        with torch.no_grad():
            x_hat = self.forward(x)
            loss = self.loss(x_hat, y)
        self.log('test_loss', loss)
        accuracy = torch.sum(x_hat.argmax(1) == y) / len(y)
        self.log('test_acc', accuracy)
        return loss

###Training

In [6]:
train_loader = DataLoader(train_dataset, batch_size=64, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=64, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)

# model
transformer = RobertaModel.from_pretrained("distilroberta-base")
add_model = SentimentClassifer(transformer, train_dataset)

#logging
comet_logger = CometLogger(
    api_key='5BHASx2Q9HBWhSzOPryQC92zu',
    workspace=os.environ.get('nikesh'),
    project_name='fake-news-detection',
    experiment_name='Roberta Sentiment DL')

# training
trainer = pl.Trainer(gpus=1, max_epochs=10, auto_lr_find=True, logger=comet_logger)
trainer.tune(add_model)
trainer.fit(add_model, train_loader, val_loader)

trainer.test(add_model, test_loader)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=331070498.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
CometLogger will be initialized in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores

you defined a validation_step but have no val_dataloader. Skipping val loop

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type             | Params
------

HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

Restored states from the checkpoint file at /content/lr_find_temp_model.ckpt
Learning rate set to 0.0013182567385564075
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/nikesh/fake-news-detection/de68bf4b600448d3a69502fa39fcff58


  | Name        | Type             | Params
-------------------------------------------------
0 | gru         | GRU              | 4.7 M 
1 | relu        | ReLU             | 0     
2 | linear      | Linear           | 1.5 K 
3 | softmax     | Softmax          | 0     
4 | loss        | CrossEntropyLoss | 0     
5 | transformer | RobertaModel     | 82.1 M
-------------------------------------------------
86.8 M    Trainable params
0         Non-trainable params
86.8 M    Total params
347.379   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.6389204263687134, 'test_loss': 0.8965885043144226}
--------------------------------------------------------------------------------


[{'test_acc': 0.6389204263687134, 'test_loss': 0.8965885043144226}]

In [7]:
from google.colab import drive
drive.mount('/content/gdrive')
path = F"/content/gdrive/My Drive/10epochCEOverSample.ckpt"
torch.save(add_model.state_dict(), path)

Mounted at /content/gdrive


###Test loop

In [10]:
test_loader = DataLoader(test_dataset, batch_size=128, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)

from google.colab import drive
drive.mount('/content/gdrive')
path = F"/content/gdrive/My Drive/10epochCEOverSample.ckpt"
transformer = RobertaModel.from_pretrained("distilroberta-base")
add_model = SentimentClassifer(transformer)
add_model.load_state_dict(torch.load(path))
add_model.train(mode=False)
add_model.to("cuda")

predicted_values = []
truth = []
for _ in range(53):
    x, y = next(iter(test_loader))
    truth += [int(i.numpy()) for i in y]
    x = ((x[0][0].to("cuda"), x[0][1].to("cuda"), x[0][2].to("cuda")), (x[1][0].to("cuda"), x[1][1].to("cuda"), x[1][2].to("cuda")))
    x = add_model.forward(x)
    predictions = [i.cpu().detach().numpy().argmax() for i in x]
    predicted_values += predictions

# categories = {"agree":0,"disagree":1,"discuss":2}
print("********************** Test  Metrics **********************")
print(classification_report(truth, predicted_values))
print("***********************************************************")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


********************** Test  Metrics **********************
              precision    recall  f1-score   support

           0       0.28      0.37      0.32      1855
           1       0.00      0.00      0.00        53
           2       0.72      0.64      0.68      4876

    accuracy                           0.56      6784
   macro avg       0.33      0.34      0.33      6784
weighted avg       0.59      0.56      0.58      6784

***********************************************************



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [9]:
print(predicted_values)
print(truth)

[array([7.0698610e-05, 7.0994097e-06, 9.9992216e-01], dtype=float32), array([4.9743545e-04, 4.0572827e-06, 9.9949849e-01], dtype=float32), array([1.2824926e-02, 2.2617305e-05, 9.8715252e-01], dtype=float32), array([7.5344363e-04, 1.1535004e-05, 9.9923503e-01], dtype=float32), array([6.3211223e-06, 7.8551584e-06, 9.9998581e-01], dtype=float32), array([8.8178031e-06, 9.2899145e-06, 9.9998188e-01], dtype=float32), array([9.5089963e-06, 2.4566052e-05, 9.9996591e-01], dtype=float32), array([4.1002771e-05, 4.4275836e-05, 9.9991477e-01], dtype=float32), array([1.2995692e-05, 1.7463750e-05, 9.9996948e-01], dtype=float32), array([2.1391736e-05, 2.7819953e-05, 9.9995089e-01], dtype=float32), array([2.0546144e-05, 3.6841018e-05, 9.9994266e-01], dtype=float32), array([3.1358864e-05, 3.6658836e-05, 9.9993193e-01], dtype=float32), array([5.7522982e-01, 5.6633864e-05, 4.2471355e-01], dtype=float32), array([2.3161124e-04, 5.0000763e-05, 9.9971837e-01], dtype=float32), array([3.471289e-04, 5.412235e-05

#End-to-End Runthrough

##Setup

In [4]:
def preprocess(headline, body, tokenizer):
        headline_tokens = tokenizer.encode_plus(headline,
                                                    add_special_tokens=True,
                                                    max_length=512,
                                                    padding="max_length",
                                                    truncation=True,
                                                    return_token_type_ids=True,
                                                    return_tensors="pt")
        body_tokens = tokenizer.encode_plus(body,
                                                add_special_tokens=True,
                                                max_length=512,
                                                padding="max_length",
                                                truncation=True,
                                                return_token_type_ids=True,
                                                return_tensors="pt")

        headline_vals = (headline_tokens['input_ids'], headline_tokens['attention_mask'], headline_tokens["token_type_ids"])
        body_vals = (body_tokens['input_ids'], body_tokens['attention_mask'], body_tokens["token_type_ids"])
        x = (headline_vals, body_vals)
        return x

In [5]:
sbert_encoder = SentenceTransformer("paraphrase-MiniLM-L6-v2")
sbert_test_bodies = sbert_encoder.encode(test_bodies, show_progress_bar=True)
sbert_test_headlines = sbert_encoder.encode(test_headlines, show_progress_bar=True)

sbert_similarity_vectors = []
for headline, body in zip(sbert_test_headlines, sbert_test_bodies):
    sbert_similarity_vectors.append(util.pytorch_cos_sim(headline, body))
sbert_similarity_vectors = np.array(sbert_similarity_vectors)

You try to use a model that was created with version 1.2.0, however, your version is 1.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





HBox(children=(FloatProgress(value=0.0, description='Batches', max=795.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=795.0, style=ProgressStyle(description_widt…




##Run

In [17]:
# sbert_model is unrelated/related classification
# add_model is agree/disagree/discuss classification
final_predictions = [3 for i in range(len(test_headlines))]
# Step 1: classify related/unrelated
# unrelated = 0, related = 1
# test set

from google.colab import drive
import pickle
drive.mount('/content/gdrive')
path = F"/content/gdrive/My Drive/ml_model.sav"
sbert_model = pickle.load(open(path, 'rb'))

test_predictions = sbert_model.predict(sbert_similarity_vectors.reshape(-1,1))

# Step 2: seperate out all "related" predictions to be fed into next model

indicies = []
for i in range(len(test_predictions)):
    if test_predictions[i] == 1:
        indicies.append(i)

# Step 3: classify agree/disagree/discuss

path = F"/content/gdrive/My Drive/10epochCEOverSample.ckpt"
transformer = RobertaModel.from_pretrained("distilroberta-base")
tokenizer = RobertaTokenizer.from_pretrained("distilroberta-base", truncation=True, do_lower_case=True)
add_model = SentimentClassifer(transformer)
add_model.load_state_dict(torch.load(path))
add_model.train(mode=False)
add_model.to("cuda")

for i in indicies:
    x, y = preprocess(test_headlines[i], test_bodies[i], tokenizer)
    x = ((x[0].to("cuda"), x[1].to("cuda"), x[2].to("cuda")), (y[0].to("cuda"), y[1].to("cuda"), y[2].to("cuda")))
    x = add_model.forward(x)
    prediction = x.cpu().detach().numpy().argmax()
    final_predictions[i] = prediction

# categories = {"agree":0,"disagree":1,"discuss":2}

# Step 4: calculate class accuracies
print(classification_report(test_truth, final_predictions))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


              precision    recall  f1-score   support

           0       0.46      0.54      0.50      1903
           1       0.30      0.25      0.27       697
           2       0.75      0.70      0.72      4464
           3       0.98      0.98      0.98     18349

    accuracy                           0.88     25413
   macro avg       0.62      0.62      0.62     25413
weighted avg       0.88      0.88      0.88     25413



In [12]:
final_predictions

[3,
 3,
 3,
 3,
 array(7.069875e-05, dtype=float32),
 3,
 3,
 3,
 array(0.00049744, dtype=float32),
 3,
 3,
 array(0.012825, dtype=float32),
 3,
 array(0.00075345, dtype=float32),
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 array(6.3211346e-06, dtype=float32),
 3,
 3,
 3,
 array(8.817862e-06, dtype=float32),
 3,
 3,
 3,
 3,
 array(9.509068e-06, dtype=float32),
 array(4.100332e-05, dtype=float32),
 3,
 3,
 3,
 3,
 array(1.2995704e-05, dtype=float32),
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 array(2.1391961e-05, dtype=float32),
 3,
 3,
 array(2.05463e-05, dtype=float32),
 3,
 3,
 3,
 3,
 array(3.135919e-05, dtype=float32),
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 array(0.5752316, dtype=float32),
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
