In [None]:
'''This notebook experiments with RoBERTa and several RoBERTa-based ensemble models on the given dataset.'''

from google.colab import files
# Opens Colab's file-upload dialog. Later cells index `uploaded` by file name and
# call .decode('utf-8') on each value to read the CSVs.
uploaded = files.upload()

Saving RedditDevDataSrc.csv to RedditDevDataSrc.csv
Saving RedditTestDataSrc.csv to RedditTestDataSrc.csv
Saving RedditTrainDataSrc.csv to RedditTrainDataSrc.csv
Saving TwitterDevDataSrc.csv to TwitterDevDataSrc.csv
Saving TwitterTestDataSrc.csv to TwitterTestDataSrc.csv
Saving TwitterTrainDataSrc.csv to TwitterTrainDataSrc.csv


In [None]:
import tensorflow as tf
# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch
# Pick the torch device: CUDA when torch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
#Importing necessary libraries
!pip install transformers

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy
import seaborn        as sns

import transformers
from transformers                     import  RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch



from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer

from torch                            import nn, optim
from torch.utils                      import data
from sklearn.decomposition            import PCA

# Seed numpy and torch (CPU and, when present, CUDA) with one shared value so
# repeated runs start from the same random state.
RANDOM_SEED = 64
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

if torch.cuda.is_available():
    # Same seed for the current and all CUDA devices; cuDNN is asked for
    # deterministic behaviour with auto-tuning (benchmark) switched off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.cuda.manual_seed(RANDOM_SEED)


# Order matters: the position of each name equals the integer that label_to_int assigns to it.
CLASS_NAMES = ['support', 'deny', 'query', 'comment']
MAX_LENGTH = 200     # passed as max_len to the tokenizer call for every encoded sequence pair
BATCH_SIZE = 4       # batch size given to every data loader
EPOCHS = 6           # number of passes of the fine-tuning loop; also sizes the LR schedule
HIDDEN_UNITS = 128   # hidden width of the TF-IDF network; also sizes the ensemble output layers

tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-large')  #Use roberta-large or roberta-base

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 4.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 8.5MB/s 
[?25hCollecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 23.3MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)

  import pandas.util.testing as tm


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
#Converting labels to numbers
def label_to_int(label):
  """Map a stance label to its class index.

  'support' -> 0, 'deny' -> 1, 'query' -> 2, 'comment' -> 3.
  Any other value falls through and yields None.
  """
  for classIndex, className in enumerate(('support', 'deny', 'query', 'comment')):
    if label == className:
      return classIndex
  return None


#Pre-processing Twitter and Reddit Posts to handle URLs and Mentions. 
#Replaces URLs with $URL$ and mentions with $MENTION$
def processText(text):
  """Mask URLs and @-mentions in a post.

  Leading/trailing whitespace is stripped, every http/https/ftp URL is
  replaced by the literal token $URL$, and every '@' followed by ASCII
  letters or digits is replaced by $MENTION$. The mention match stops at any
  other character, so an underscore ends it ('@a_b' becomes '$MENTION$_b').
  """
  urlPattern     = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
  mentionPattern = r"(@[A-Za-z0-9]+)"

  withoutUrls = re.sub(urlPattern, "$URL$", text.strip())
  return re.sub(mentionPattern, "$MENTION$", withoutUrls.strip())

In [None]:
'''Processing all of Twitter and Reddit data frames to 
    1. Get rid of all NaN values
    2. Remove columns not useful for the Model
    3. Process text 
    4. Return a combined frame consisting of both Twitter and Reddit data'''

    
def processStanceData(twitterDf, RedditDf):
  """Merge the Twitter and Reddit frames of one split into a model-ready frame.

  The two frames are stacked (Twitter rows first, original row indices kept),
  NaN cells become empty strings, labels are converted with label_to_int, and
  the 'inreText' and 'sourceText' columns are joined with a single space.

  Returned columns (source column in parentheses):
     replyText           (text_x)     - the reply post whose stance needs to be learnt
     replyTextId         (id)         - the ID of the reply post
     previousText        (inre_x)     - the post that replyText replied to
                                        (NOTE(review): appears to hold that post's ID - verify)
     sourceText          (source_x)   - the source post of the conversation thread
                                        (NOTE(review): appears to hold that post's ID - verify)
     label               (label_x)    - the label assigned to each post
     previousPlusSrcText (SrcInre)    - inreText and sourceText joined by a space
     labelValue          (labelvalue) - the numeric value assigned to each label
     pReplyText                       - replyText after processText
     pPreviousPlusSrcText             - previousPlusSrcText after processText
  """
  combined = pd.concat([twitterDf, RedditDf])                 #Concatenating twitter and reddit data
  cleaned  = combined.replace(np.nan, '', regex=True)         #Getting rid of NaN values

  cleaned['labelvalue'] = cleaned['label_x'].apply(label_to_int)
  cleaned['SrcInre']    = cleaned['inreText'].str.cat(cleaned['sourceText'], sep=" ")

  #Keeping only the columns the model needs, under friendlier names
  keptColumns = ['text_x', 'id', 'inre_x', 'source_x' ,'label_x','SrcInre', 'labelvalue' ]
  data = cleaned[keptColumns].copy()
  data.columns = ['replyText', 'replyTextId', 'previousText', 'sourceText', 'label', 'previousPlusSrcText', 'labelValue']

  #Masking URLs and mentions in both text inputs
  for rawColumn, processedColumn in (('replyText', 'pReplyText'), ('previousPlusSrcText', 'pPreviousPlusSrcText')):
    data[processedColumn] = data[rawColumn].apply(processText)

  return data


In [None]:
#Reading the uploaded Twitter and Reddit CSVs (train, dev and test) into dataFrames
def _readUploadedCsv(fileName):
  """Decode one uploaded file as UTF-8 text and parse it as CSV."""
  csvText = uploaded[fileName].decode('utf-8')
  return pd.read_csv(io.StringIO(csvText))

twitterTrainDf  = _readUploadedCsv('TwitterTrainDataSrc.csv')
redditTrainDf   = _readUploadedCsv('RedditTrainDataSrc.csv')

twitterDevDf    = _readUploadedCsv('TwitterDevDataSrc.csv')
redditDevDf     = _readUploadedCsv('RedditDevDataSrc.csv')

twitterTestDf   = _readUploadedCsv('TwitterTestDataSrc.csv')
redditTestDf    = _readUploadedCsv('RedditTestDataSrc.csv')

#Processing the Twitter and Reddit dataframes containing the training data
trainDf = processStanceData(twitterTrainDf, redditTrainDf)
trainDf

Unnamed: 0,replyText,replyTextId,previousText,sourceText,label,previousPlusSrcText,labelValue,pReplyText,pPreviousPlusSrcText
0,Mike Brown was staying with his grandmother fo...,498280126254428160,,,support,,0,Mike Brown was staying with his grandmother fo...,
1,Witness: Police allegedly stopped Mike Brown a...,498430783699554305,,,support,,0,Witness: Police allegedly stopped Mike Brown a...,
2,Line of police cars with high beams on greets ...,499366666300846081,,,support,,0,Line of police cars with high beams on greets ...,
3,"Currently the #FoxNews website has zero, repea...",499368931367608320,,,support,,0,"Currently the #FoxNews website has zero, repea...",
4,St. Louis Co Police tell me ofcr shot a man wh...,499456140044824576,,,support,,0,St. Louis Co Police tell me ofcr shot a man wh...,
...,...,...,...,...,...,...,...,...,...
693,Quote:\n\n&gt; I was opening Turnberry the day...,e2by4oh,e2bxvw0,8yktu5,deny,[deleted] Jon Sopel: Bizarre. @realDonaldTrump...,1,Quote:\n\n&gt; I was opening Turnberry the day...,[deleted] Jon Sopel: Bizarre. $MENTION$ says h...
694,[deleted],e2bynsb,e2by4oh,8yktu5,comment,Quote:\n\n&gt; I was opening Turnberry the day...,3,[deleted],Quote:\n\n&gt; I was opening Turnberry the day...
695,He said he was opening it the day before Brexi...,e2bz0mz,e2bynsb,8yktu5,comment,[deleted] Jon Sopel: Bizarre. @realDonaldTrump...,3,He said he was opening it the day before Brexi...,[deleted] Jon Sopel: Bizarre. $MENTION$ says h...
696,"""Well if you remember I was opening Turnberry ...",e2c1gqf,e2btp0f,8yktu5,comment,[deleted] Jon Sopel: Bizarre. @realDonaldTrump...,3,"""Well if you remember I was opening Turnberry ...",[deleted] Jon Sopel: Bizarre. $MENTION$ says h...


In [None]:
#Processing the Twitter and Reddit dataframes containing the development data
devDf = processStanceData(twitterDevDf, redditDevDf)
devDf

Unnamed: 0,replyText,replyTextId,previousText,sourceText,label,previousPlusSrcText,labelValue,pReplyText,pPreviousPlusSrcText
0,Every 28 hours a black male is killed in the U...,498293668655423488,,,support,,0,Every 28 hours a black male is killed in the U...,
1,.@AP I demand you retract the lie that people ...,498486826269548545,,,deny,,1,.$MENTION$ I demand you retract the lie that p...,
2,Police name the officer who shot #Ferguson tee...,500280249629036544,,,support,,0,Police name the officer who shot #Ferguson tee...,
3,Police have named the cop who shot Michael Bro...,500298588992593920,,,support,,0,Police have named the cop who shot Michael Bro...,
4,At the war memorial in. Ottawa. A soldier has ...,524923293711998976,,,support,,0,At the war memorial in. Ottawa. A soldier has ...,
...,...,...,...,...,...,...,...,...,...
431,That's pretty interesting. \n\nThey were talki...,e3ckunl,e3beeop,934q6t,comment,It turns out that iodine does decrease in tabl...,3,That's pretty interesting. \n\nThey were talki...,It turns out that iodine does decrease in tabl...
432,Is iodized salt not INCREDIBLY common in the USA?,e3cnfmf,934q6t,934q6t,comment,Iodine increases IQ and is an essential part o...,3,Is iodized salt not INCREDIBLY common in the USA?,Iodine increases IQ and is an essential part o...
433,Who does the website state you can only buy on...,e3cr1ps,e3ar20p,934q6t,comment,"I take this daily, you'll notice a difference ...",3,Who does the website state you can only buy on...,"I take this daily, you'll notice a difference ..."
434,&gt;It’s estimated that nearly one-third of th...,e3cz12q,e3apatz,934q6t,comment,Your title says that iodine is absent from alm...,3,&gt;It’s estimated that nearly one-third of th...,Your title says that iodine is absent from alm...


In [None]:
#Processing the Twitter and Reddit dataframes containing the test data
testDf = processStanceData(twitterTestDf, redditTestDf)
testDf

Unnamed: 0,replyText,replyTextId,previousText,sourceText,label,previousPlusSrcText,labelValue,pReplyText,pPreviousPlusSrcText
0,Rep. Sheila Jackson Lee has no shame. I still ...,443938194715713536,,,support,,0,Rep. Sheila Jackson Lee has no shame. I still ...,
1,"ICYMI: ""Rep Sheila Jackson Lee (D-Tx) Wants Hu...",774165935041093633,,,support,,0,"ICYMI: ""Rep Sheila Jackson Lee (D-Tx) Wants Hu...",
2,Clinton camp delays Weather Channel ad buy aft...,784071228248109057,,,support,,0,Clinton camp delays Weather Channel ad buy aft...,
3,Clinton camp delays Weather Channel ad buy aft...,784118929799073793,,,support,,0,Clinton camp delays Weather Channel ad buy aft...,
4,Clinton camp delays Weather Channel ad buy aft...,784216706080178176,,,support,,0,Clinton camp delays Weather Channel ad buy aft...,
...,...,...,...,...,...,...,...,...,...
756,"Lawl. I'm assuming you're a troll. But if not,...",c5o445z,c5o2sto,xn2bn,comment,Doesn't mean it's not fake. You idiots voted a...,3,"Lawl. I'm assuming you're a troll. But if not,...",Doesn't mean it's not fake. You idiots voted a...
757,"not saying bush was the best, or palin was the...",c5o46c3,c5o445z,xn2bn,comment,"Lawl. I'm assuming you're a troll. But if not,...",3,"not saying bush was the best, or palin was the...","Lawl. I'm assuming you're a troll. But if not,..."
758,^^^^^\nthey would do that if someone hadn't se...,c5o47lt,c5nt4le,xn2bn,comment,When they bring up the college ID that shows h...,3,^^^^^\nthey would do that if someone hadn't se...,When they bring up the college ID that shows h...
759,"You are right about that, but we didn't know s...",c5o4cdk,c5o46c3,xn2bn,comment,"not saying bush was the best, or palin was the...",3,"You are right about that, but we didn't know s...","not saying bush was the best, or palin was the..."


In [None]:
#Creates a dataset which will be used to feed to RoBERTa
class StanceDataset(data.Dataset):
  """Dataset of (first sequence, second sequence) text pairs encoded for RoBERTa.

  Each item is a dict holding the two raw sequences, their concatenated text
  (used to compute TF-IDF features), the token ids and attention mask produced
  by the tokenizer, and the label as a long tensor.
  """

  def __init__(self, firstSeq, secondSeq, TextSrcInre, labelValue,  tokenizer, max_len):
    self.firstSeq    = firstSeq      #First input sequence that will be supplied to RoBERTa
    self.secondSeq   = secondSeq     #Second input sequence that will be supplied to RoBERTa
    self.TextSrcInre = TextSrcInre   #Concatenation of reply+ previous+ src text to get features from 1 training example
    self.labelValue  = labelValue    #label value for each training example in the dataset
    self.tokenizer   = tokenizer     #Tokenizer used to encode every sequence pair
    self.max_len     = max_len       #Length every encoded pair is truncated/padded to

  def __len__(self):
    """Number of examples (one per label)."""
    return len(self.labelValue)

  def __getitem__(self, item):
    """Encode example `item` and return it as a dict of texts and tensors."""
    firstSeq    = str(self.firstSeq[item])
    secondSeq   = str(self.secondSeq[item])
    TextSrcInre = str(self.TextSrcInre[item])

    #Encoding the first and the second sequence to a form accepted by RoBERTa.
    #RoBERTa does not use token_type_ids to distinguish the first sequence from the second sequence.
    #The tokenizer given to the constructor is used (not a notebook-level global).
    encoding = self.tokenizer.encode_plus(
        firstSeq,
        secondSeq,
        max_length = self.max_len,
        add_special_tokens= True,
        truncation = True,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'firstSeq' : firstSeq,
        'secondSeq' : secondSeq,
        'TextSrcInre': TextSrcInre,
        #flatten() drops the leading batch dimension added by return_tensors='pt'
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labelValue'  : torch.tensor(self.labelValue[item], dtype=torch.long)
    }


In [None]:
#Creates a data loader
def createDataLoader(dataframe, tokenizer, max_len, batch_size, shuffle = True, num_workers = 4):
  """Wrap a processed stance frame in a StanceDataset and a torch DataLoader.

  Args:
    dataframe:   frame with the columns pReplyText, pPreviousPlusSrcText,
                 TextSrcInre and labelValue.
    tokenizer:   tokenizer handed to StanceDataset.
    max_len:     sequence length handed to StanceDataset.
    batch_size:  number of examples per batch.
    shuffle:     whether the loader reshuffles the examples on every pass
                 (default True, the previous fixed behaviour). Pass False for
                 evaluation splits when a stable row order is wanted.
    num_workers: number of loader worker processes (default 4, the previous
                 fixed value).

  Returns:
    A torch.utils.data.DataLoader over the encoded examples.
  """
  ds = StanceDataset(
      firstSeq    = dataframe.pReplyText.to_numpy(),
      secondSeq   = dataframe.pPreviousPlusSrcText.to_numpy(),
      TextSrcInre = dataframe.TextSrcInre.to_numpy(),
      labelValue  = dataframe.labelValue.to_numpy(),
      tokenizer   = tokenizer,
      max_len     = max_len
  )

  return data.DataLoader(
      ds,
      batch_size  = batch_size,
      shuffle     = shuffle,
      num_workers = num_workers
  )


In [None]:
#Combining the reply, previous and source texts to get features for 1 training example
for stanceDf in (trainDf, devDf, testDf):
  stanceDf['TextSrcInre'] = stanceDf['pReplyText'].str.cat(stanceDf['pPreviousPlusSrcText'],sep=" ")

#Creating one data loader per split (training, development, test), all with the
#same tokenizer, maximum length and batch size
trainDataLoader, developmentDataLoader, testDataLoader = (
    createDataLoader(splitDf, tokenizer, MAX_LENGTH, BATCH_SIZE)
    for splitDf in (trainDf, devDf, testDf)
)

In [None]:
#Instantiating the tf-idf vectorizer object and fitting it on the training texts only
tfidf = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))

xtrain = trainDf['TextSrcInre'].tolist()
x_train_feats = tfidf.fit(xtrain)          #fit() returns the vectorizer, so this is another name for `tfidf`
print(x_train_feats)
#vocabulary_ holds one entry per learned feature; it is used instead of
#get_feature_names(), which newer scikit-learn releases no longer provide
print(len(x_train_feats.vocabulary_))


x_train_transform = x_train_feats.transform(xtrain)
tfidf_transform_tensor = torch.tensor(x_train_transform.toarray()).float()
print(x_train_transform.shape)


#Reducing the TF-IDF vectors to HIDDEN_UNITS dimensions: StanceClassifier.out_pca is
#sized as RoBERTa hidden size + HIDDEN_UNITS, so the two must agree
pca = PCA(n_components=HIDDEN_UNITS)
p = pca.fit(tfidf_transform_tensor.numpy())  #fit() returns the PCA object; later cells call p.transform
X = torch.from_numpy(p.transform(tfidf_transform_tensor.numpy()))


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=10, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)
5814
(5217, 5814)


In [None]:
#This class defines the model that was used to pre-train a SNN on TF-IDF features
class Tfidf_Nn(nn.Module):
    """One-hidden-layer network over TF-IDF features.

    Layout: Linear(n_features -> hidden_units) -> tanh -> dropout(0.1)
    -> Linear(hidden_units -> 4) -> softmax.

    Args:
        n_features:   size of the TF-IDF input vector. Defaults to the size of
                      the vocabulary learned by the notebook-level `tfidf`
                      vectorizer.
        hidden_units: width of the hidden layer. Defaults to HIDDEN_UNITS.

    Attribute names are unchanged so previously saved state_dicts still load.
    """

    def __init__(self, n_features=None, hidden_units=None):
        super().__init__()

        if n_features is None:
            # vocabulary_ has one entry per feature; it replaces
            # get_feature_names(), which newer scikit-learn releases dropped.
            n_features = len(tfidf.vocabulary_)
        if hidden_units is None:
            hidden_units = HIDDEN_UNITS

        # Inputs to hidden layer linear transformation
        self.hidden  = nn.Linear(n_features, hidden_units)
        # Output layer (4 stance classes)
        self.output  = nn.Linear(hidden_units, 4)
        self.dropout = nn.Dropout(0.1)

        # Defining tanh activation and softmax output
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return (hidden activations, class probabilities) for a batch `x`.

        The first element is the tanh output of the hidden layer taken before
        dropout; the second is the softmax over the 4 output units.
        """
        hidden_activation = self.tanh(self.hidden(x))
        class_probs = self.softmax(self.output(self.dropout(hidden_activation)))

        #Returning the outputs from the hidden layer and the final output layer
        return hidden_activation, class_probs
    

In [None]:
#Loading the already trained MLP model that was trained on TF-IDF features. 

from google.colab import drive
# Mount Google Drive, where the weights of the pre-trained TF-IDF network are stored.
drive.mount('/content/gdrive')
snnmodel = Tfidf_Nn()

model_save_name = 'pre-trainedTfidf.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"

# map_location='cpu': snnmodel has not been moved to any device yet, so load the
# weights onto the CPU. This also keeps the CPU fallback working if the checkpoint
# was written from a GPU.
snnmodel.load_state_dict(torch.load(path, map_location='cpu'))
snnmodel.eval()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


Tfidf_Nn(
  (hidden): Linear(in_features=5814, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=4, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (tanh): Tanh()
  (softmax): Softmax(dim=1)
)

In [None]:
class StanceClassifier(nn.Module):
  """Model used for training and testing on the dataset (adapted from huggingface).

  The huggingface RoBERTa model outputs the last hidden states and the pooled
  output by default. Pooled output is the classification token (1st token of
  the last hidden state) further processed by a Linear layer and a Tanh
  activation function.

  The pre-trained RoBERTa model is the primary model; `modelType` selects how
  it is combined with TF-IDF features:

  'roberta-only'            No ensembling, RoBERTa is simply fine-tuned. The
                            pooled output goes through dropout and a linear
                            layer.

  'roberta-tfIdf'           The pooled output is concatenated with the raw
                            TF-IDF vector. (Using the 1st token of the last
                            hidden layer instead was also experimented with.)

  'roberta-pcaTfidf'        The pooled output is concatenated with the
                            PCA-transformed TF-IDF vector.

  'roberta-TrainedTfIdf'    The pooled output is concatenated with the hidden
                            layer output of the pre-trained network that was
                            trained on TF-IDF features.
                            'roberta-preTrainedTfIdf' is accepted as an alias.

  Dropout is used to prevent over-fitting. forward() returns softmax
  probabilities for every mode.
  """

  #Every modelType accepted by forward()
  _MODEL_TYPES = ('roberta-only', 'roberta-tfIdf', 'roberta-pcaTfidf',
                  'roberta-TrainedTfIdf', 'roberta-preTrainedTfIdf')

  def __init__(self,  n_classes):
    super(StanceClassifier, self).__init__()
    self.robertaModel              = RobertaModel.from_pretrained('roberta-large')    #use roberta-large or roberta-base
    self.model_TFIDF               = snnmodel                                        #Pre-trained network trained with TF-IDF features

    self.drop                      = nn.Dropout(p = 0.3)

    #Head for 'roberta-only'
    self.output                    = nn.Linear(self.robertaModel.config.hidden_size, n_classes)

    #vocabulary_ has one entry per TF-IDF feature; it replaces get_feature_names(),
    #which newer scikit-learn releases no longer provide
    self.input_size_tfidf_only     = self.robertaModel.config.hidden_size + len(tfidf.vocabulary_)
    self.input_size_tfidf_pca      = self.robertaModel.config.hidden_size + HIDDEN_UNITS

    #`dense` is not used by forward(); it stays because it is part of the state_dict
    #written by the save cell, so dropping it would change the checkpoint layout
    self.dense                     = nn.Linear( self.input_size_tfidf_only,  self.input_size_tfidf_only)
    self.out_proj                  = nn.Linear( self.input_size_tfidf_only, n_classes)   #Head for 'roberta-tfIdf'
    self.out_pca                   = nn.Linear( self.input_size_tfidf_pca, n_classes)    #Head for 'roberta-pcaTfidf'

    self.input_size_preTrain_tfidf = self.robertaModel.config.hidden_size +  HIDDEN_UNITS
    self.out                       = nn.Linear(self.input_size_preTrain_tfidf, n_classes) #Head for 'roberta-TrainedTfIdf'

    self.softmax                   = nn.Softmax(dim = 1)

  def forward(self, input_ids, attention_mask, inputs_tfidf_feats, pca_transformed_feats, modelType):
    """Return class probabilities for a batch.

    Args:
      input_ids:             input sequence tokens.
      attention_mask:        mask to avoid performing attention on padding tokens.
      inputs_tfidf_feats:    TF-IDF vectors; read by 'roberta-tfIdf' and
                             'roberta-TrainedTfIdf'.
      pca_transformed_feats: PCA-reduced TF-IDF vectors; read by 'roberta-pcaTfidf'.
      modelType:             one of the modes listed in the class docstring.

    Raises:
      ValueError: if modelType is not a known mode.
    """
    #Fail before the expensive RoBERTa pass instead of hitting an unbound variable later
    if modelType not in self._MODEL_TYPES:
      raise ValueError(
          'Unknown modelType {!r}; expected one of {}'.format(modelType, ', '.join(self._MODEL_TYPES))
      )

    roberta_output     = self.robertaModel(
        input_ids      = input_ids,
        attention_mask = attention_mask )
    pooled_output      = roberta_output[1]        #Using pooled output in every mode

    if modelType == 'roberta-only':
      features, head = pooled_output, self.output

    elif modelType == 'roberta-tfIdf':
      features, head = torch.cat((pooled_output, inputs_tfidf_feats), dim=1), self.out_proj

    elif modelType == 'roberta-pcaTfidf':
      features, head = torch.cat((pooled_output, pca_transformed_feats), dim=1), self.out_pca

    else:
      #Concatenating pooled output from RoBERTa with the hidden layer (not the
      #softmax output) of the pre-trained TF-IDF network
      tfidf_hiddenLayer, _ = self.model_TFIDF(inputs_tfidf_feats)
      features, head = torch.cat((pooled_output, tfidf_hiddenLayer), dim=1), self.out

    return self.softmax(head(self.drop(features)))



In [None]:
'''from google.colab import drive
drive.mount('/content/gdrive')
snnmodel = Tfidf_Nn()

model_save_name = 'pre-trainedTfidf.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"

snnmodel.load_state_dict(torch.load(path))
snnmodel.eval()
model = StanceClassifier(len(CLASS_NAMES))

#Loading fine-trained RoBERTa model on the same dataset
model_save_name = 'RoBERTaLarge_TFIDFV2.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
model.load_state_dict(torch.load(path))
model.eval()
model = model.to(device)


# = StanceClassifier(len(CLASS_NAMES))
#model = model.to(device)
print(model)

print(snnmodel)'''



'from google.colab import drive\ndrive.mount(\'/content/gdrive\')\nsnnmodel = Tfidf_Nn()\n\nmodel_save_name = \'pre-trainedTfidf.pt\'\npath = F"/content/gdrive/My Drive/{model_save_name}"\n\nsnnmodel.load_state_dict(torch.load(path))\nsnnmodel.eval()\nmodel = StanceClassifier(len(CLASS_NAMES))\n\n#Loading fine-trained RoBERTa model on the same dataset\nmodel_save_name = \'RoBERTaLarge_TFIDFV2.pt\'\npath = F"/content/gdrive/My Drive/{model_save_name}"\nmodel.load_state_dict(torch.load(path))\nmodel.eval()\nmodel = model.to(device)\n\n\n# = StanceClassifier(len(CLASS_NAMES))\n#model = model.to(device)\nprint(model)\n\nprint(snnmodel)'

In [None]:
#Building the StanceClassifier and moving it onto the selected device (GPU when available) in one step.
model = StanceClassifier(len(CLASS_NAMES)).to(device)
#print(model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




In [None]:
# Same optimiser as used in the BERT paper, with a different learning rate.
optimizer = AdamW(
    model.parameters(),
    lr=2e-6,
    correct_bias=False,
)

# The schedule spans every optimisation step of the run (batches per epoch x epochs),
# with no warm-up steps.
totalSteps = len(trainDataLoader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=totalSteps,
)

# Class weights accommodate the heavily imbalanced data. They were chosen by running
# several experiments with other weights; the set that produced the best results is
# used here. Order follows CLASS_NAMES.
weights      = [8.0, 84.0, 8.0, 1.0]
classWeights = torch.tensor(weights, dtype=torch.float32)
lossFunction = nn.CrossEntropyLoss(weight=classWeights).to(device)


In [None]:
#This function is used for training the model. 
def train_epoch(
  model,
  dataLoader,
  lossFunction,
  optimizer,
  device,
  scheduler,
  n_examples,
  modelType = 'roberta-TrainedTfIdf'
):
  """Run one training pass over `dataLoader` and update the model.

  Args:
    model:        StanceClassifier to train; its forward() returns softmax
                  probabilities.
    dataLoader:   loader yielding the dicts produced by StanceDataset.
    lossFunction: loss called as lossFunction(log_probabilities, labels).
    optimizer:    optimizer stepped once per batch.
    device:       device the batch tensors are moved to.
    scheduler:    learning-rate scheduler stepped once per batch.
    n_examples:   number of examples, used as the accuracy denominator.
    modelType:    ensemble mode passed to the model (default is the mode this
                  function previously hard-coded).

  Returns:
    (mean batch loss, accuracy as a 0-dim double tensor).
  """

  model = model.train()
  losses = []
  #Tensor accumulator so .double() below also works when the loader is empty
  correctPredictions = torch.zeros((), dtype=torch.long, device=device)

  for d in dataLoader:

    input_ids              = d["input_ids"].to(device)                           #Loading input ids to GPU
    attention_mask         = d["attention_mask"].to(device)                      #Loading attention mask to GPU
    labelValues            = d["labelValue"].to(device)                          #Loading label value to GPU

    #TF-IDF features of the batch texts and their PCA projection (computed on CPU)
    tfidf_transform        = x_train_feats.transform(d["TextSrcInre"])
    tfidf_transform_tensor = torch.tensor(tfidf_transform.toarray()).float()
    pca_tensor             = torch.from_numpy(p.transform(tfidf_transform_tensor.numpy())).float()

    pca_tensor = pca_tensor.to(device)
    tfidf_transform_tensor = tfidf_transform_tensor.to(device)

    #Getting the output from our model (Object of StanceClassifier class) for train data
    outputs = model(
      input_ids             = input_ids,
      attention_mask        = attention_mask,
      inputs_tfidf_feats    = tfidf_transform_tensor,
      pca_transformed_feats = pca_tensor,
      modelType             = modelType
    )

    #Determining the model predictions
    _, predictionIndices = torch.max(outputs, dim=1)

    #The model already returns probabilities, while CrossEntropyLoss applies
    #log-softmax to its input. Feeding log-probabilities makes that inner
    #log-softmax a no-op, so the result is the true (weighted) cross-entropy
    #rather than a softmax-of-softmax. The clamp avoids log(0).
    loss = lossFunction(torch.log(outputs.clamp(min=1e-12)), labelValues)

    #Calculating the correct predictions for accuracy
    correctPredictions += torch.sum(predictionIndices == labelValues)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return np.mean(losses), correctPredictions.double() / n_examples


In [None]:
#This function is used for evaluating the model on the development and test set
def eval_model(
    model,
    dataLoader,
    lossFunction,
    device,
    n_examples,
    modelType = 'roberta-TrainedTfIdf'
    ):
  """Evaluate the model on `dataLoader` without updating it.

  Args:
    model:        StanceClassifier to evaluate; its forward() returns softmax
                  probabilities.
    dataLoader:   loader yielding the dicts produced by StanceDataset.
    lossFunction: loss called as lossFunction(log_probabilities, labels).
    device:       device the batch tensors are moved to.
    n_examples:   number of examples, used as the accuracy denominator.
    modelType:    ensemble mode passed to the model (default is the mode this
                  function previously hard-coded).

  Returns:
    (mean batch loss, accuracy as a 0-dim double tensor).
  """

  model = model.eval()
  losses = []
  #Tensor accumulator so .double() below also works when the loader is empty
  correctPredictions = torch.zeros((), dtype=torch.long, device=device)

  with torch.no_grad():
    for d in dataLoader:

      input_ids              = d["input_ids"].to(device)                          #Loading input ids to GPU
      attention_mask         = d["attention_mask"].to(device)                     #Loading attention mask to GPU
      labelValues            = d["labelValue"].to(device)                         #Loading label values to GPU

      #TF-IDF features of the batch texts and their PCA projection (computed on CPU)
      tfidf_transform        = x_train_feats.transform(d["TextSrcInre"])
      tfidf_transform_tensor = torch.tensor(tfidf_transform.toarray()).float()
      pca_tensor             = torch.from_numpy(p.transform(tfidf_transform_tensor.numpy())).float()

      pca_tensor = pca_tensor.to(device)
      tfidf_transform_tensor = tfidf_transform_tensor.to(device)

      #Getting the softmax output from model for dev data
      outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        inputs_tfidf_feats    = tfidf_transform_tensor,
        pca_transformed_feats = pca_tensor,
        modelType             = modelType
      )

      #Determining the model predictions
      _, predictionIndices = torch.max(outputs, dim=1)

      #The model already returns probabilities, while CrossEntropyLoss applies
      #log-softmax to its input. Feeding log-probabilities makes that inner
      #log-softmax a no-op, so the result is the true (weighted) cross-entropy
      #rather than a softmax-of-softmax. The clamp avoids log(0).
      loss = lossFunction(torch.log(outputs.clamp(min=1e-12)), labelValues)

      #Calculating the correct predictions for accuracy
      correctPredictions += torch.sum(predictionIndices == labelValues)
      losses.append(loss.item())

  return np.mean(losses), correctPredictions.double() / n_examples


In [None]:
#fine tuning ROBERTa and validating it 

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}')

  #One full pass over the training data; the weights are updated here
  trainLoss, trainAccuracy = train_epoch(
    model        = model,
    dataLoader   = trainDataLoader,
    lossFunction = lossFunction,
    optimizer    = optimizer,
    device       = device,
    scheduler    = scheduler,
    n_examples   = len(trainDf)
  )
  print(f'Training loss {trainLoss} Training accuracy {trainAccuracy}')

  #Scoring the updated weights on the development split
  devLoss, devAccuracy = eval_model(
    model        = model,
    dataLoader   = developmentDataLoader,
    lossFunction = lossFunction,
    device       = device,
    n_examples   = len(devDf)
  )
  print(f'Development loss {devLoss} Development accuracy {devAccuracy}')

  #Two blank lines separate consecutive epochs in the log
  print()
  print()


Epoch 1
Training loss 1.298501740378895 Training accuracy 0.6289055012459267
Development loss 1.2253233229601255 Development accuracy 0.7919191919191919


Epoch 2
Training loss 1.2555404903788219 Training accuracy 0.7230208932336591
Development loss 1.2016303798844736 Development accuracy 0.8020202020202021


Epoch 3
Training loss 1.2344279885748795 Training accuracy 0.7243626605328732
Development loss 1.1308476554770623 Development accuracy 0.818855218855219


Epoch 4
Training loss 1.190851000716403 Training accuracy 0.7400805060379528
Development loss 1.1160803852222299 Development accuracy 0.8316498316498316


Epoch 5
Training loss 1.1655783090554892 Training accuracy 0.757715161970481
Development loss 1.0835556969527276 Development accuracy 0.8154882154882155


Epoch 6
Training loss 1.1490712403794359 Training accuracy 0.7648073605520413
Development loss 1.0594400346920054 Development accuracy 0.8195286195286196




In [None]:
#This function gets the predictions from the model after it is trained.
def get_predictions(model, data_loader, modelType = 'roberta-TrainedTfIdf'):
  """Run the trained model over `data_loader` and collect its predictions.

  Args:
    model:       trained StanceClassifier.
    data_loader: loader yielding the dicts produced by StanceDataset.
    modelType:   ensemble mode passed to the model (default is the mode this
                 function previously hard-coded).

  Returns:
    (first sequences, second sequences, predicted class indices,
     class probabilities, true labels). The two text lists are Python lists;
    the other three are CPU tensors. All five are in the order the loader
    produced the examples, so they stay aligned with one another.

  Uses the notebook-level `device`, `tfidf` and `p` objects.
  """

  model = model.eval()
  review_texta = []
  review_textb = []
  #One tensor per batch; joined once at the end instead of stacking per-sample tensors
  predictionBatches  = []
  probabilityBatches = []
  labelBatches       = []

  with torch.no_grad():
    for d in data_loader:

      input_ids              = d["input_ids"].to(device)
      attention_mask         = d["attention_mask"].to(device)

      #TF-IDF features of the batch texts and their PCA projection (computed on CPU)
      tfidf_transform        = tfidf.transform(d["TextSrcInre"])
      tfidf_transform_tensor = torch.tensor(tfidf_transform.toarray()).float()
      pca_tensor             = torch.from_numpy(p.transform(tfidf_transform_tensor.numpy())).float()

      pca_tensor = pca_tensor.to(device)
      tfidf_transform_tensor = tfidf_transform_tensor.to(device)

      #Getting the softmax output from model
      outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        inputs_tfidf_feats    = tfidf_transform_tensor,
        pca_transformed_feats = pca_tensor,
        modelType             = modelType
      )

      _, preds = torch.max(outputs, dim=1)     #Determining the model predictions

      review_texta.extend(d["firstSeq"])
      review_textb.extend(d["secondSeq"])
      #Moving results off the GPU per batch keeps device memory flat
      predictionBatches.append(preds.cpu())
      probabilityBatches.append(outputs.cpu())
      labelBatches.append(d["labelValue"].cpu())

  predictions = torch.cat(predictionBatches)
  prediction_probs = torch.cat(probabilityBatches)
  real_values = torch.cat(labelBatches)

  return review_texta, review_textb, predictions, prediction_probs, real_values

In [None]:
#Getting model predictions on dev dataset
# createDataLoader builds its loaders with shuffle=True, so the rows come back in
# shuffled order; the returned texts, predictions, probabilities and labels are
# collected batch by batch and stay aligned with one another.
firstSeq_dev, secondSeq_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
  model,
  developmentDataLoader
)

In [None]:
  #Printing classification report for dev dataset (Evaluating the model on Dev set)
# target_names follows CLASS_NAMES order, which matches the integer labels 0-3 from label_to_int.
print(classification_report(yTest_dev, yHat_dev, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

     support       0.48      0.31      0.38       102
        deny       0.38      0.33      0.35        82
       query       0.62      0.70      0.66       120
     comment       0.89      0.91      0.90      1181

    accuracy                           0.82      1485
   macro avg       0.59      0.56      0.57      1485
weighted avg       0.81      0.82      0.81      1485



In [None]:
#Saving the model onto the drive
from google.colab import drive
# Mounts Drive at the same path used when the pre-trained TF-IDF network was loaded,
# then stores only the fine-tuned weights (the state_dict), not the model object.
drive.mount('/content/gdrive')

model_save_name = 'RoBERTaLarge_TFIDFV2.pt'
path = F"/content/gdrive/My Drive/{model_save_name}" 
torch.save(model.state_dict(), path)

In [None]:
#Getting model predictions on test dataset
# createDataLoader builds its loaders with shuffle=True, so the rows come back in
# shuffled order; the returned texts, predictions, probabilities and labels are
# collected batch by batch and stay aligned with one another.
firstSeq_test, secondSeq_test, yHat_test, predProbs_test, yTest_test = get_predictions(
  model,
  testDataLoader
)

In [None]:
#Printing classification report for test dataset (Evaluating the model on test set)
# target_names follows CLASS_NAMES order, which matches the integer labels 0-3 from label_to_int.
print(classification_report(yTest_test, yHat_test, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

     support       0.81      0.31      0.44       157
        deny       0.68      0.53      0.60       101
       query       0.60      0.57      0.58        93
     comment       0.89      0.97      0.93      1476

    accuracy                           0.87      1827
   macro avg       0.74      0.59      0.64      1827
weighted avg       0.86      0.87      0.85      1827



In [None]:
#Saving the predictions onto a CSV file for error analysis
# The tensors are converted to plain Python numbers/lists first. Zipping the tensors
# directly would put 0-dim tensor objects in the frame, and their repr (for example
# "tensor(3)") would end up in the CSV instead of the values.
zippedList =  list(zip(firstSeq_test, secondSeq_test, yHat_test.tolist(), predProbs_test.tolist(), yTest_test.tolist()))
dfObj = pd.DataFrame(zippedList, columns = ['Texta' , 'Textb', 'Ypred', 'YpredsProbs', 'label'])

from google.colab import drive
# Mounts Drive again, this time at the relative path 'drive' that the cp command below copies into.
drive.mount('drive')

# Write the CSV next to the notebook first, then copy it into "My Drive".
dfObj.to_csv('dataPredsFromRoberta_TFIDFV2.csv')
!cp dataPredsFromRoberta_TFIDFV2.csv "drive/My Drive/"