#Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import os

!pip install comet-ml &> /dev/null
# import comet_ml at the top of your file
import comet_ml


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report

#!pip install transformers &> /dev/null
!pip install sentence-transformers &> /dev/null
#from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

!pip install pytorch-lightning &> /dev/null
import pytorch_lightning as pl
from pytorch_lightning.loggers import CometLogger
import tensorboard
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


comet_ml is installed but `COMET_API_KEY` is not set.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Dataset Loading

In [2]:
HEADLINE_COLUMN = "Headline"
BODY_COLUMN = "articleBody"
STANCE_COLUMN = "Stance"

categories = {"agree":0,"disagree":1,"discuss":2,"unrelated":3}

train_bodies_csv = pd.read_csv("/content/drive/My Drive/Uni Work/train_bodies.csv")
train_stances_csv = pd.read_csv("/content/drive/My Drive/Uni Work/train_stances.csv")
train_dataset = train_stances_csv.merge(train_bodies_csv, on='Body ID', how="inner")

train_headlines = train_dataset[HEADLINE_COLUMN].values
train_bodies = train_dataset[BODY_COLUMN].values
train_truth = np.array([categories[stance] for stance in train_dataset[STANCE_COLUMN].values])
train_related_truth = np.array([int(stance<3) for stance in train_truth]) # unrelated = 0, related = 1

test_bodies_csv = pd.read_csv("/content/drive/My Drive/Uni Work/competition_test_bodies.csv")
test_stances_csv = pd.read_csv("/content/drive/My Drive/Uni Work/competition_test_stances.csv")
test_dataset = test_stances_csv.merge(test_bodies_csv, on='Body ID', how="inner")

test_headlines = test_dataset[HEADLINE_COLUMN].values
test_bodies = test_dataset[BODY_COLUMN].values
test_truth = np.array([categories[stance] for stance in test_dataset[STANCE_COLUMN].values])
test_related_truth = np.array([int(stance<3) for stance in test_truth]) # unrelated = 0, related = 1

train_bodies_csv = train_stances_csv = test_bodies_csv = test_stances_csv = None # free memory

# Data Cleaning

In [3]:
stop = set(stopwords.words('english'))

# from practicals
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    
    return url.sub('', text)

# from practicals
def remove_html(text):
    html = re.compile(r'<.*?>')
    
    return html.sub('', text)

# from practicals
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)

def data_cleanup(data):
    x = list(map(remove_URL, data))
    x = list(map(remove_html, data))
    x = list(map(remove_emoji, data))
    x = np.array(x)
    return x

def remove_stop_words(string):
    split_words = string.split()
    return " ".join([word for word in split_words if word.lower() not in stop])

for index, (headline, body) in enumerate(zip(train_headlines, train_bodies)):
    train_headlines[index] = remove_stop_words(headline)
    train_bodies[index] = remove_stop_words(body)

for index, (test_headline, test_body) in enumerate(zip(test_headlines, test_bodies)):
    test_headlines[index] = remove_stop_words(test_headline)
    test_bodies[index] = remove_stop_words(test_body)

#train_headlines = data_cleanup(train_headlines)
#train_bodies = data_cleanup(train_bodies)
#test_headlines = data_cleanup(test_headlines)
#test_bodies = data_cleanup(test_bodies)

# Feature Extraction

## TF-IDF

In [18]:
# tf-idf feature extraction 
train_tfidf_data = np.concatenate((np.unique(train_headlines), np.unique(train_bodies)))
tfidf = TfidfVectorizer(decode_error='ignore', lowercase=True, min_df=2, stop_words=stop, max_features=10000) 
train_tfidf = tfidf.fit(train_tfidf_data.astype('U')) 
tfidf_bodies = tfidf.transform(train_bodies.astype('U'))
tfidf_headlines = tfidf.transform(train_headlines.astype('U'))

tfidf_test_bodies = tfidf.transform(test_bodies.astype('U'))
tfidf_test_headlines = tfidf.transform(test_headlines.astype('U'))

In [None]:
print(np.shape(tfidf_bodies))
print(np.shape(tfidf_headlines))
print(np.shape(tfidf.transform([train_bodies[0]])))


## S-BERT

In [None]:
sbert_encoder = SentenceTransformer("paraphrase-MiniLM-L6-v2")

sbert_bodies = sbert_encoder.encode(train_bodies, show_progress_bar=True)
sbert_headlines = sbert_encoder.encode(train_headlines, show_progress_bar=True)

sbert_test_bodies = sbert_encoder.encode(test_bodies, show_progress_bar=True)
sbert_test_headlines = sbert_encoder.encode(test_headlines, show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, max=83426730.0), HTML(value='')))




You try to use a model that was created with version 1.2.0, however, your version is 1.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





HBox(children=(FloatProgress(value=0.0, description='Batches', max=1562.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=1562.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=795.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=795.0, style=ProgressStyle(description_widt…




In [None]:
print(np.shape(sbert_bodies))
print(np.shape(sbert_headlines))
print(np.shape(sbert_test_bodies))
print(np.shape(sbert_test_headlines))

(49972, 384)
(49972, 384)
(25413, 384)
(25413, 384)


#2ai) Machine Learning Method: Logistic Regression

##TF-IDF Embedding

In [None]:
# unrelated = 0, related = 1

tfidf_similarity_vectors = []
for headline, body in zip(tfidf_headlines, tfidf_bodies):
    tfidf_similarity_vectors.append(cosine_similarity(headline, body)[0])
tfidf_similarity_vectors = np.array(tfidf_similarity_vectors)

model = LogisticRegression().fit(tfidf_similarity_vectors, train_related_truth)
train_predictions = model.predict(tfidf_similarity_vectors)
print("********************** Train Metrics **********************")
print(classification_report(train_related_truth, train_predictions))
print("***********************************************************")
# test set
tfidf_similarity_vectors = []
for headline, body in zip(tfidf_test_headlines, tfidf_test_bodies):
    tfidf_similarity_vectors.append(cosine_similarity(headline, body)[0])
tfidf_similarity_vectors = np.array(tfidf_similarity_vectors)

test_predictions = model.predict(tfidf_similarity_vectors)
print("********************** Test  Metrics **********************")
print(classification_report(test_related_truth, test_predictions))
print("***********************************************************")

KeyboardInterrupt: ignored

##SBERT Embedding

In [None]:
# unrelated = 0, related = 1

sbert_similarity_vectors = []
for headline, body in zip(sbert_headlines, sbert_bodies):
    sbert_similarity_vectors.append(util.pytorch_cos_sim(headline, body))
sbert_similarity_vectors = np.array(sbert_similarity_vectors)

model = LogisticRegression().fit(sbert_similarity_vectors.reshape(-1,1), train_related_truth)
train_predictions = model.predict(sbert_similarity_vectors.reshape(-1,1))
print("********************** Train Metrics **********************")
print(classification_report(train_related_truth, train_predictions))
print("***********************************************************")
# test set
sbert_similarity_vectors = []
for headline, body in zip(sbert_test_headlines, sbert_test_bodies):
    sbert_similarity_vectors.append(util.pytorch_cos_sim(headline, body))
sbert_similarity_vectors = np.array(sbert_similarity_vectors)

test_predictions = model.predict(sbert_similarity_vectors.reshape(-1,1))
print("********************** Test  Metrics **********************")
print(classification_report(test_related_truth, test_predictions))
print("***********************************************************")

********************** Train Metrics **********************
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     36545
           1       0.95      0.93      0.94     13427

    accuracy                           0.97     49972
   macro avg       0.96      0.96      0.96     49972
weighted avg       0.97      0.97      0.97     49972

***********************************************************
********************** Test  Metrics **********************
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     18349
           1       0.96      0.95      0.96      7064

    accuracy                           0.98     25413
   macro avg       0.97      0.97      0.97     25413
weighted avg       0.98      0.98      0.98     25413

***********************************************************


#2aii) Deep Learning Method

##Dataset

In [4]:
# must do proper test train split
class NewsDataset(Dataset):
    def __init__(self, headlines, bodies, stances):
        self.headlines = headlines
        self.bodies = bodies
        self.stances = stances

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        x = (self.headlines[idx], self.bodies[idx])
        y = self.stances[idx]
        return x, y

train_dataset = NewsDataset(train_headlines[:40000], train_bodies[:40000], train_related_truth[:40000])
val_dataset = NewsDataset(train_headlines[40000:], train_bodies[40000:], train_related_truth[40000:])
test_dataset = NewsDataset(test_headlines, test_bodies, test_related_truth)

##Unrelated/Related Classification Model

In [8]:
class UnrelatedClassifer(pl.LightningModule):
    def __init__(self, transformer=None, tfidf=None):
        super().__init__()
        self.gru = nn.GRU(input_size=768 if transformer else 20000,
                            hidden_size=512,
                            num_layers =2,
                            dropout=0.2,
                            batch_first=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(512, 1)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()
        self.transformer = transformer
        self.tfidf = tfidf       

    def forward(self, x):
        headline, body = x
        with torch.no_grad():
            if self.transformer:
                headline = self.transformer.encode(headline, convert_to_tensor=True)
                body = self.transformer.encode(body, convert_to_tensor=True)
                x = torch.cat((headline,body), dim=1)
                x = x.unsqueeze(dim=1)
            else:
                headline = self.tfidf.transform(headline).todense()
                body = self.tfidf.transform(body).todense()
                x = np.concatenate((headline,body), axis=1)
                x = torch.Tensor(x).to("cuda")
                x = x.unsqueeze(dim=1)
    
        x, _ = self.gru(x)
        x = self.relu(x)
        x = self.linear(x)
        x = self.sigmoid(x)
        return x.squeeze()

    def configure_optimizers(self):
        lr = 1e-3
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.log("lr",lr)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        y = y.float()
        x_hat = self.forward(x)
        loss = self.loss(x_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        y = y.float()
        x_hat = self.forward(x)
        loss = self.loss(x_hat, y)
        self.log('val_loss', loss)

    def test_step(self, batch, idx):
        x, y = batch
        y = y.float()
        x_hat = self.forward(x)
        loss = self.loss(x_hat, y)
        self.log('test_loss', loss)
        accuracy = torch.sum(torch.round(x_hat) == y) / len(y)
        self.log('test_acc', accuracy)
        return loss

###Training w/ SBERT

In [None]:
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)

# model
transformer = SentenceTransformer("paraphrase-MiniLM-L6-v2")
model = UnrelatedClassifer(transformer=transformer)

#logging
comet_logger = CometLogger(
    api_key='5BHASx2Q9HBWhSzOPryQC92zu',
    workspace=os.environ.get('nikesh'),
    project_name='fake-news-detection',
    experiment_name='SBERT Unrelated DL'
)

# training
trainer = pl.Trainer(gpus=1, max_epochs=5, logger=comet_logger)
trainer.fit(model, train_loader, val_loader) 
trainer.test(model, test_loader)

You try to use a model that was created with version 1.2.0, however, your version is 1.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



CometLogger will be initialized in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/nikesh/fake-news-detection/dfa2fb1fcbef49bc8d6da82fb0c93455


  | Name        | Type                | Params
----------------------------------------------------
0 | gru         | GRU                 | 3.5 M 
1 | relu        | ReLU                | 0     
2 | linear      | Linear              | 513   
3 | sigmoid     | Sigmoid             | 0     
4 | loss        | BCELoss             | 0     
5 | transformer | SentenceTransformer | 22.7 M
----------------------------------------------------
26.3 M    Trainable params
0         Non-trainable params
26.3 M    Total params
10

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/nikesh/fake-news-detection/dfa2fb1fcbef49bc8d6da82fb0c93455
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [625]       : (6.593811576749431e-07, 1.6669036149978638)
COMET INFO:     train_loss [125] : (1.303862632084929e-06, 0.9222912192344666)
COMET INFO:     val_loss [5]     : (0.13477300107479095, 0.3404513895511627)
COMET INFO:   Others:
COMET INFO:     Name : SBERT Unrelated DL
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     model graph         : 1
COMET INFO:     notebook            : 1
COMET INFO:     os packages         : 1
COMET INFO:     source_code         : 1
COMET INFO: ---------------------------





COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)
COMET INFO: The Python SDK has 3600 seconds to finish before aborting...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/nikesh/fake-news-detection/dfa2fb1fcbef49bc8d6da82fb0c93455



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8705016374588013, 'test_loss': 0.4792836606502533}
--------------------------------------------------------------------------------


[{'test_acc': 0.8705016374588013, 'test_loss': 0.4792836606502533}]

###Training w/ TFIDF

In [10]:
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=2, pin_memory=True, shuffle=False, drop_last=True)

# model
train_tfidf_data = np.concatenate((np.unique(train_headlines), np.unique(train_bodies)))
tfidf = TfidfVectorizer(decode_error='ignore', lowercase=True, min_df=2, stop_words=stop, max_features=10000) 
train_tfidf = tfidf.fit(train_tfidf_data.astype('U')) 
model = UnrelatedClassifer(tfidf=train_tfidf)

#logging
comet_logger = CometLogger(
    api_key='5BHASx2Q9HBWhSzOPryQC92zu',
    workspace=os.environ.get('nikesh'),
    project_name='fake-news-detection',
    experiment_name='TFIDF Unrelated DL (E=8)'
)

# training
trainer = pl.Trainer(gpus=1, max_epochs=8, logger=comet_logger)
trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)

CometLogger will be initialized in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/nikesh/fake-news-detection/cb805f931761422d9e76243c84ffb6b9
COMET INFO:   Parameters [count]:
COMET INFO:     analyzer      : word
COMET INFO:     binary        : 1
COMET INFO:     decode_error  : ignore
COMET INFO:     dtype [2]     : <class 'numpy.float64'>
COMET INFO:     encoding      : utf-8
COMET INFO:     input         : content
COMET INFO:     lowercase     : True
COMET INFO:     max_df        : 1.0
COMET INFO:     max_features  : 10000
COMET INFO:     min_df        : 2
COMET INFO:     ngram_range   : (1, 1)
COMET INFO:     norm          : l2
COMET INFO:     preprocessor  : 1
COME

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/nikesh/fake-news-detection/87bf8b1534594dd884fda3c4dafa223b
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [1000]      : (1.3623934137285687e-05, 2.6242551803588867)
COMET INFO:     train_loss [200] : (1.0392019248683937e-05, 1.503598928451538)
COMET INFO:     val_loss [8]     : (0.40000513195991516, 0.6900495290756226)
COMET INFO:   Others:
COMET INFO:     Name : TFIDF Unrelated DL (E=8)
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     model graph         : 1
COMET INFO:     notebook            : 1
COMET INFO:     os packages         : 1
COMET INFO:     source_code         : 1
COMET INFO: ---------------------------





COMET INFO: Uploading 1 metrics, params and output messages
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/nikesh/fake-news-detection/87bf8b1534594dd884fda3c4dafa223b



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.7068177461624146, 'test_loss': 0.9404726624488831}
--------------------------------------------------------------------------------


[{'test_acc': 0.7068177461624146, 'test_loss': 0.9404726624488831}]

#2b) Deep Learning for Agree/Disagree/Discuss