In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

In [2]:
import tensorflow as tf
import tensorflow_hub as hub

In [3]:
# Load the training CSV and the test CSV; the test file also holds
# paraphrased versions of each text (see the preview of test_df below).
train_df=pd.read_csv('../input/ire-major-data-v2/train_df.csv')
test_df=pd.read_csv('../input/ire-major-data-v2/test_set_with_paraphrases.csv')

In [4]:
# Peek at the first two training rows to check the column layout.
train_df.head(2)

Unnamed: 0.1,Unnamed: 0,SNO,Intensity,Text,label,#tokens
0,0,839551577044234245,0.533333,Perspective: These agencies helped make Americ...,clickbait,26
1,1,844843703847944192,0.133333,Seven arrests made in raids after London attac...,no-clickbait,21


In [5]:
# Peek at the first two test rows, including the two paraphrase columns.
test_df.head(2)

Unnamed: 0.1,Unnamed: 0,SNO,Intensity,Text,label,#tokens,Paraphrased_Text,Paraphrased_Text_Pegasus
0,0,833654544412340224,0.0,Abu Dhabi awards China’s CNPC a stake in its l...,no-clickbait,25,Abu Dhabi awarded China's CNPC a stake in its ...,The largest oil concession in Abu Dhabi has a ...
1,1,821482229213728768,0.066667,Former CIA rendition agent issues plea to Dona...,no-clickbait,17,Former CIA rendition agent issues plea to Dona...,The former CIA rendition agent is facing jail ...


In [6]:
# Fall back to the original text wherever the Pegasus paraphrase is missing.
# Series.fillna pairs rows through the shared index, replacing the row-by-row
# .iloc loop that detected missing values with a fragile str(text) == 'nan'
# comparison. The result stays a plain list so it can be assigned back as a column.
fill_na = test_df['Paraphrased_Text_Pegasus'].fillna(test_df['Text']).tolist()

In [7]:
# Overwrite the paraphrase column with the gap-filled values held in fill_na.
test_df['Paraphrased_Text_Pegasus']=fill_na

In [8]:
# Report how many rows each split contributes and the combined total.
n_train_rows = len(train_df)
n_test_rows = len(test_df)
print('Length of train set\t:', n_train_rows)
print('Length of test set\t:', n_test_rows)
print('Total data size\t\t:', n_train_rows + n_test_rows)

Length of train set	: 17497
Length of test set	: 4337
Total data size		: 21834


In [9]:
# Recompute the binary label from the intensity score: values of 0.5 and
# above are 'clickbait', everything else is 'no-clickbait'.
train_df['label'] = np.where(train_df['Intensity'] >= 0.5, 'clickbait', 'no-clickbait')
test_df['label'] = np.where(test_df['Intensity'] >= 0.5, 'clickbait', 'no-clickbait')

In [10]:
# Class balance of the training labels.
train_df['label'].value_counts()

no-clickbait    13221
clickbait        4276
Name: label, dtype: int64

In [11]:
# Class balance of the test labels.
test_df['label'].value_counts()

no-clickbait    3235
clickbait       1102
Name: label, dtype: int64

In [12]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the pretrained roberta-base tokenizer and the bare RobertaModel encoder.
# The checkpoint's lm_head.* weights are not used by RobertaModel, which is
# what the "weights ... were not used" message in the output refers to.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Move the encoder to the first CUDA device.
# NOTE(review): the device string is hard-coded, so this cell requires a GPU — confirm that is intended.
model.to('cuda:0')

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [14]:
def read_word_embedding(sentence):
    """Return a fixed-size RoBERTa embedding for ``sentence``.

    The text is tokenized, passed through the notebook-level ``model``, and
    the final-layer hidden states of the first (only) sequence in the batch
    are averaged over every token position.

    Parameters
    ----------
    sentence : str
        Text to embed. Uses the module-level ``tokenizer`` and ``model``.

    Returns
    -------
    numpy.ndarray
        1-D vector with one value per hidden dimension of ``model``.
    """
    # truncation=True caps the input at the tokenizer's maximum length so an
    # over-long text cannot overflow the model's position embeddings.
    # Inputs follow whatever device the model currently lives on.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True).to(model.device)
    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state.cpu()
    # Mean-pool across the token axis of the single sequence in the batch.
    embedding = np.mean(last_hidden_states[0].numpy(), axis=0)
    return embedding

In [15]:
# Embed every training text, one forward pass per row.
x_train_embeddings = []
for i in tqdm(range(len(train_df))):
    try:
        x_train_embeddings.append(read_word_embedding(train_df['Text'].iloc[i]))
    except Exception as exc:
        # Fail loudly with the offending row. Breaking out silently would
        # leave fewer embeddings than targets and hide the real error.
        raise RuntimeError(f"Embedding failed for train_df row {i}") from exc

100%|██████████| 17497/17497 [02:47<00:00, 104.57it/s]


In [16]:
# Convert the list of per-row embeddings to a NumPy array and collect the
# intensity scores that serve as regression targets.
train_intensity_values = train_df['Intensity'].tolist()
x_train_embeddings = np.array(x_train_embeddings)
y_train_intensity = np.array(train_intensity_values)

In [17]:
# Embed every original (non-paraphrased) test text, one forward pass per row.
x_test_embeddings = []
for i in tqdm(range(len(test_df))):
    try:
        x_test_embeddings.append(read_word_embedding(test_df['Text'].iloc[i]))
    except Exception as exc:
        # Fail loudly with the offending row. Breaking out silently would
        # leave fewer embeddings than targets and hide the real error.
        raise RuntimeError(f"Embedding failed for test_df['Text'] row {i}") from exc

100%|██████████| 4337/4337 [00:40<00:00, 106.19it/s]


In [18]:
# Convert the test embeddings to a NumPy array and collect the reference
# intensity scores and class labels as arrays.
test_intensity_values = test_df['Intensity'].tolist()
test_label_values = test_df['label'].tolist()
x_test_embeddings = np.array(x_test_embeddings)
y_test_intensity = np.array(test_intensity_values)
y_test_actual_label = np.array(test_label_values)

In [19]:
# Embed the Pegasus paraphrase of every test text, one forward pass per row.
x_test_embeddings_new = []
for i in tqdm(range(len(test_df))):
    try:
        x_test_embeddings_new.append(read_word_embedding(test_df['Paraphrased_Text_Pegasus'].iloc[i]))
    except Exception as exc:
        # Fail loudly with the offending row. Breaking out silently would
        # leave fewer embeddings than targets and hide the real error.
        raise RuntimeError(
            f"Embedding failed for test_df['Paraphrased_Text_Pegasus'] row {i}"
        ) from exc

x_test_embeddings_new = np.array(x_test_embeddings_new)

100%|██████████| 4337/4337 [00:41<00:00, 105.01it/s]


In [20]:
# Spot-check test row 386: its text is a single emoji (see the output below).
test_df['Text'].iloc[386]

'🤔'

# Linear Regression

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [22]:
# Fit a linear regression from the sentence embeddings to the intensity score.
linear_reg = LinearRegression().fit(x_train_embeddings, y_train_intensity)

In [23]:
# Predict intensity for the original test texts, then threshold at 0.5 to
# obtain class labels as a plain list of strings.
y_test_pred = linear_reg.predict(x_test_embeddings)
y_test_pred_classes = np.where(y_test_pred >= 0.5, 'clickbait', 'no-clickbait').tolist()

# Same two steps for the paraphrased test texts.
y_test_pred_new = linear_reg.predict(x_test_embeddings_new)
y_test_pred_classes_new = np.where(y_test_pred_new >= 0.5, 'clickbait', 'no-clickbait').tolist()

# Reference labels, shared by both evaluations.
y_test_actual_classes = test_df['label'].tolist()

In [24]:
from sklearn.metrics import median_absolute_error,mean_squared_error, f1_score, accuracy_score

In [25]:
def results(y_true,y_pred,y_actual_classes,y_pred_classes):
    """Print regression and classification metrics for one set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted intensity scores (regression view).
    y_actual_classes, y_pred_classes : array-like of str
        Reference and predicted class labels; 'clickbait' is treated as the
        positive class for the F1 score.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    Medae = median_absolute_error(y_true,y_pred)
    mse = mean_squared_error(y_true,y_pred)
    # Derive RMSE from the MSE already computed. The `squared=False` keyword
    # of mean_squared_error is deprecated in scikit-learn 1.4 and removed in
    # 1.6, so relying on it breaks on current releases.
    rmse = np.sqrt(mse)
    accuracy=accuracy_score(y_actual_classes,y_pred_classes)
    f1=f1_score(y_actual_classes,y_pred_classes,pos_label='clickbait')

    print("MedAE:",Medae)
    print("MSE:",mse)
    print("RMSE:",rmse)
    print("Accuracy:",accuracy)
    print("f1-score:",f1)

In [26]:
# Metrics for predictions made on the original test texts.
results(y_test_intensity,y_test_pred,y_test_actual_classes,y_test_pred_classes)

MedAE: 0.10626551307554688
MSE: 0.027650374498232728
RMSE: 0.16628401756703115
Accuracy: 0.8547382983629237
f1-score: 0.6732365145228216


In [27]:
# Metrics for predictions made on the Pegasus paraphrases, scored against the same reference values.
results(y_test_intensity,y_test_pred_new,y_test_actual_classes,y_test_pred_classes_new)

MedAE: 0.14225667318375002
MSE: 0.04147182429491902
RMSE: 0.20364632158455262
Accuracy: 0.8070094535393129
f1-score: 0.6258381761287439


In [28]:
# Attach both sets of predicted intensities to the test frame.
# NOTE: the column names are spelled 'intesity'; the later filtering cells
# index by these exact names, so the spelling must stay in sync with them.
test_df['predicted_intesity']=y_test_pred
test_df['predicted_intesity_paraphrased']=y_test_pred_new

In [29]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# CSV export — verify against the data source).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: the in-place version raised KeyError on a second
# execution because the column was already gone.
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')
test_df.head()

Unnamed: 0,SNO,Intensity,Text,label,#tokens,Paraphrased_Text,Paraphrased_Text_Pegasus,predicted_intesity,predicted_intesity_paraphrased
0,833654544412340224,0.0,Abu Dhabi awards China’s CNPC a stake in its l...,no-clickbait,25,Abu Dhabi awarded China's CNPC a stake in its ...,The largest oil concession in Abu Dhabi has a ...,0.075497,0.161243
1,821482229213728768,0.066667,Former CIA rendition agent issues plea to Dona...,no-clickbait,17,Former CIA rendition agent issues plea to Dona...,The former CIA rendition agent is facing jail ...,0.201714,0.329372
2,828736281328906241,0.133333,Eddie Jones warns England players not to use C...,no-clickbait,25,Eddie Jones warns England players not to use C...,Eddie Jones warned England players not to use ...,0.165798,0.213936
3,831914725311016960,0.466667,"Rachel Matthews: Sorry, no refunds... why do m...",no-clickbait,25,"Rachel Matthews: sorry, no refunds... why do m...",Men think they should get their money back aft...,0.583281,0.582115
4,829615888076828672,0.133333,Thordis Elva was sixteen when she was raped by...,no-clickbait,19,Thordis Elva was sixteen when she was raped by...,"When she was sixteen, she was raped by her boy...",0.219322,0.440407


In [30]:
# Save the test set together with both prediction columns.
# NOTE(review): to_csv writes the row index by default, which adds an unnamed
# leading column to the file — confirm that is wanted.
test_df.to_csv('test_set_outputs.csv')

In [31]:
# Rows where the prediction for the paraphrase is less than or equal to the
# prediction for the original text (ties are included).
pplp_df=test_df[test_df['predicted_intesity_paraphrased']<=test_df['predicted_intesity']]
len(pplp_df)

1123

In [32]:
# Save pplp_df (paraphrase prediction <= original-text prediction; ties are included despite the file name).
pplp_df.to_csv('paraphrase intensity less than predicted intensity.csv')

In [33]:
# Rows where the prediction for the paraphrase is strictly greater than the
# prediction for the original text.
ppgp_df=test_df[test_df['predicted_intesity_paraphrased']>test_df['predicted_intesity']]
len(ppgp_df)

3214

In [34]:
# Save ppgp_df (paraphrase prediction > original-text prediction).
ppgp_df.to_csv('paraphrase intensity greater than predicted intensity.csv')

In [35]:
# Rows where the prediction for the paraphrase is less than or equal to the
# reference value in the Intensity column (ties are included).
ppla_df=test_df[test_df['predicted_intesity_paraphrased']<=test_df['Intensity']]
len(ppla_df)

1492

In [36]:
# Save ppla_df (paraphrase prediction <= Intensity column; ties are included despite the file name).
ppla_df.to_csv('paraphrase intensity less than actual intensity.csv')

In [37]:
# Rows where the prediction for the paraphrase is strictly greater than the
# reference value in the Intensity column.
ppga_df=test_df[test_df['predicted_intesity_paraphrased']>test_df['Intensity']]
len(ppga_df)

2845

In [38]:
# Save ppga_df (paraphrase prediction > Intensity column).
ppga_df.to_csv('paraphrase intensity greater than actual intensity.csv')