In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

In [2]:
import tensorflow as tf
import tensorflow_hub as hub

In [3]:
# train_df=pd.read_excel('../input/ire-major-project-data/train.xlsx')
# dev_df=pd.read_excel('../input/ire-major-project-data/dev.xlsx')
# test_df=pd.read_excel('../input/ire-major-project-data/test.xlsx')

# Read the tab-separated train/test splits. Column names are supplied
# explicitly, so pandas treats the first line of each file as data.
train_df = pd.read_csv(
    '../input/ire-major-data-v2/en_train_pt.tsv',
    sep='\t',
    names=['SNO', 'Intensity', 'Text'],
)
test_df = pd.read_csv(
    '../input/ire-major-data-v2/en_test_pt.tsv',
    sep='\t',
    names=['SNO', 'Intensity', 'Text'],
)

In [4]:
# Preview the first two training rows to check that the columns parsed as expected.
train_df.head(2)

Unnamed: 0,SNO,Intensity,Text
0,839551577044234245,0.533333,Perspective: These agencies helped make Americ...
1,844843703847944192,0.133333,Seven arrests made in raids after London attac...


In [5]:
# Report the size of each split and of the combined dataset.
for caption, size in (
    ('Length of train set\t:', len(train_df)),
    ('Length of test set\t:', len(test_df)),
    ('Total data size\t\t:', len(train_df) + len(test_df)),
):
    print(caption, size)

Length of train set	: 17506
Length of test set	: 4341
Total data size		: 21847


In [6]:
# Binarise the continuous intensity score: 0.5 and above is 'clickbait',
# everything else is 'no-clickbait'.
train_df['label'] = np.where(train_df['Intensity'] >= 0.5, 'clickbait', 'no-clickbait')
test_df['label'] = np.where(test_df['Intensity'] >= 0.5, 'clickbait', 'no-clickbait')

In [7]:
# Class balance of the training split after thresholding intensity at 0.5.
train_df['label'].value_counts()

no-clickbait    13225
clickbait        4281
Name: label, dtype: int64

In [8]:
# Class balance of the test split after thresholding intensity at 0.5.
test_df['label'].value_counts()

no-clickbait    3237
clickbait       1104
Name: label, dtype: int64

In [9]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the pretrained `roberta-base` tokenizer and the bare RoBERTa encoder.
# RobertaModel carries no language-model head, which is why the checkpoint's
# `lm_head.*` weights are reported as unused when it is initialised.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Tokenize every training text and record how many token ids it produces,
# so that over-long texts can be filtered out later.
length = [
    tokenizer(text, return_tensors="pt")['input_ids'].shape[-1]
    for text in train_df['Text']
]
train_df['#tokens'] = length

Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors


In [11]:
# Tokenize every test text and record how many token ids it produces,
# so that over-long texts can be filtered out later.
length = [
    tokenizer(text, return_tensors="pt")['input_ids'].shape[-1]
    for text in test_df['Text']
]
test_df['#tokens'] = length

In [12]:
# Keep only short texts: rows whose token count is strictly below 50.
train_df = train_df.loc[train_df['#tokens'].lt(50)]
test_df = test_df.loc[test_df['#tokens'].lt(50)]

In [13]:
# Training rows remaining after dropping texts with 50 or more tokens.
len(train_df)

17497

In [14]:
# Test rows remaining after dropping texts with 50 or more tokens.
len(test_df)

4337

In [15]:
# Move the encoder to the first GPU when CUDA is available; otherwise keep it
# on the CPU instead of raising on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [16]:
# import re
# def preprocess(text):
#     text=re.sub(r'\W+',' ',text)
#     text=text.strip()
#     return text

In [17]:
# train_df['preprocessed_text']=train_df['text'].apply(lambda x:preprocess(x))
# dev_df['preprocessed_text']=dev_df['text'].apply(lambda x:preprocess(x))
# test_df['preprocessed_text']=test_df['text'].apply(lambda x:preprocess(x))

In [18]:
def read_word_embedding(sentence):
    """Return a mean-pooled RoBERTa embedding for a single sentence.

    The sentence is tokenized with the module-level ``tokenizer``, passed
    through the module-level ``model``, and the last-layer hidden states are
    averaged over the token axis.

    Args:
        sentence: The text to embed.

    Returns:
        A 1-D ``numpy.ndarray`` holding the mean of the last hidden states
        across all tokens of the sentence.
    """
    # Send the inputs to whichever device the model lives on rather than
    # hard-coding 'cuda:0', so the function also works on CPU-only machines.
    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
    # Inference only: without no_grad() every call would build and retain an
    # autograd graph, wasting memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    # [0] selects the only sequence in the batch; axis 0 is then the token axis.
    token_states = outputs.last_hidden_state[0].cpu().numpy()
    return np.mean(token_states, axis=0)

In [19]:
# Embed every training text. A failure is re-raised with the offending row
# position: the previous bare `except: ... break` left a silently truncated
# list that no longer lined up with the targets, and also swallowed
# KeyboardInterrupt.
x_train_embeddings = []
for i in tqdm(range(len(train_df))):
    try:
        x_train_embeddings.append(read_word_embedding(train_df['Text'].iloc[i]))
    except Exception as err:
        raise RuntimeError(f"Embedding failed for training row {i}") from err

100%|██████████| 17497/17497 [02:50<00:00, 102.35it/s]


In [20]:
# Convert the list of per-sentence vectors into one NumPy array and pull the
# regression targets out of the frame as a NumPy array as well.
x_train_embeddings = np.asarray(x_train_embeddings)
y_train_intensity = np.array(train_df['Intensity'].tolist())

In [21]:
# x_dev_embeddings=[read_word_embedding(sentence) for sentence in tqdm(dev_df['preprocessed_text'])]
# y_dev_intensity=list(dev_df['intensity'].values)
# y_dev_actual_label=list(dev_df['class'].values)

In [22]:
# Embed every test text. A failure is re-raised with the offending row
# position: the previous bare `except: ... break` left a silently truncated
# list that no longer lined up with the targets, and also swallowed
# KeyboardInterrupt.
x_test_embeddings = []
for i in tqdm(range(len(test_df))):
    try:
        x_test_embeddings.append(read_word_embedding(test_df['Text'].iloc[i]))
    except Exception as err:
        raise RuntimeError(f"Embedding failed for test row {i}") from err

100%|██████████| 4337/4337 [00:41<00:00, 104.62it/s]


In [23]:
# Convert the test embeddings, intensity targets and class labels to NumPy arrays.
x_test_embeddings = np.asarray(x_test_embeddings)
y_test_intensity = np.array(test_df['Intensity'].tolist())
y_test_actual_label = np.array(test_df['label'].tolist())

# Linear Regression

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [25]:
# Fit an ordinary least-squares regressor from sentence embeddings to intensity.
linear_reg = LinearRegression()
linear_reg.fit(x_train_embeddings, y_train_intensity)

In [26]:
# Predict test intensities, then threshold at 0.5 to obtain class labels
# that can be compared with the ground-truth labels.
y_test_pred = linear_reg.predict(x_test_embeddings)
y_test_pred_classes = np.where(y_test_pred >= 0.5, 'clickbait', 'no-clickbait').tolist()
y_test_actual_classes = test_df['label'].tolist()

In [27]:
from sklearn.metrics import median_absolute_error,mean_squared_error, f1_score, accuracy_score

In [28]:
def results(y_true,y_pred,y_actual_classes,y_pred_classes):
    """Print regression and classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth intensity scores.
        y_pred: Predicted intensity scores, aligned with ``y_true``.
        y_actual_classes: Ground-truth class labels.
        y_pred_classes: Predicted class labels; ``'clickbait'`` is treated
            as the positive class for the F1 score.

    Returns:
        None. The metrics are written to stdout.
    """
    medae = median_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # Derive RMSE from the MSE instead of passing `squared=False`, which is
    # deprecated in scikit-learn 1.4 and removed in 1.6. This also avoids
    # computing the MSE twice.
    rmse = np.sqrt(mse)
    accuracy = accuracy_score(y_actual_classes, y_pred_classes)
    f1 = f1_score(y_actual_classes, y_pred_classes, pos_label='clickbait')

    print("MedAE:", medae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("Accuracy:", accuracy)
    print("f1-score:", f1)

In [29]:
# Report test-set metrics for the linear-regression predictions.
results(y_test_intensity,y_test_pred,y_test_actual_classes,y_test_pred_classes)

MedAE: 0.10626551307554688
MSE: 0.027650374498232728
RMSE: 0.16628401756703115
Accuracy: 0.8547382983629237
f1-score: 0.6732365145228216


# Ridge Regression

In [30]:
from sklearn.linear_model import Ridge

In [31]:
# Fit an L2-regularised linear model (alpha=2), predict test intensities and
# threshold them at 0.5 to obtain class labels.
ridge_reg = Ridge(alpha=2)
ridge_reg.fit(x_train_embeddings, y_train_intensity)
y_test_pred = ridge_reg.predict(x_test_embeddings)
y_test_pred_classes = np.where(y_test_pred >= 0.5, 'clickbait', 'no-clickbait').tolist()
y_test_actual_classes = test_df['label'].tolist()

In [32]:
# Report test-set metrics for the ridge-regression predictions.
results(y_test_intensity,y_test_pred,y_test_actual_classes,y_test_pred_classes)

MedAE: 0.10668419998230713
MSE: 0.027413629698537584
RMSE: 0.1655706184639581
Accuracy: 0.8563523172700023
f1-score: 0.6753517457008859


# Gradient Boosted Regression

In [33]:
from sklearn.ensemble import GradientBoostingRegressor

In [34]:
# Gradient-boosted trees on the sentence embeddings. The estimator gets its
# own name so it no longer overwrites the fitted Ridge model in `ridge_reg`,
# and `random_state` is pinned so repeated runs give the same scores.
gb_reg = GradientBoostingRegressor(random_state=0).fit(x_train_embeddings, y_train_intensity)
y_test_pred = gb_reg.predict(x_test_embeddings)
# Threshold the predicted intensity at 0.5 to obtain class labels.
y_test_pred_classes = ['clickbait' if i >= 0.5 else 'no-clickbait' for i in y_test_pred]
y_test_actual_classes = list(test_df['label'].values)
results(y_test_intensity, y_test_pred, y_test_actual_classes, y_test_pred_classes)

MedAE: 0.11500446947761096
MSE: 0.029832788492144123
RMSE: 0.17272170822494815
Accuracy: 0.844362462531704
f1-score: 0.6349378042184964


# Random Forest Regression

In [35]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
# Shallow random forest (max_depth=3) on the sentence embeddings. The
# estimator gets its own name instead of reusing `ridge_reg`, which would
# mislabel it and overwrite the previously fitted model.
rf_reg = RandomForestRegressor(max_depth=3, random_state=0).fit(x_train_embeddings, y_train_intensity)
y_test_pred = rf_reg.predict(x_test_embeddings)
# Threshold the predicted intensity at 0.5 to obtain class labels.
y_test_pred_classes = ['clickbait' if i >= 0.5 else 'no-clickbait' for i in y_test_pred]
y_test_actual_classes = list(test_df['label'].values)
results(y_test_intensity, y_test_pred, y_test_actual_classes, y_test_pred_classes)

MedAE: 0.1434751878687867
MSE: 0.04243003617381256
RMSE: 0.2059855241851052
Accuracy: 0.7890246714318654
f1-score: 0.41533546325878595


# AdaBoost Regression

In [37]:
from sklearn.ensemble import AdaBoostRegressor

In [38]:
# AdaBoost with 100 estimators on the sentence embeddings. The estimator gets
# its own name instead of reusing `ridge_reg`, which would mislabel it and
# overwrite the previously fitted model.
ada_reg = AdaBoostRegressor(random_state=0, n_estimators=100).fit(x_train_embeddings, y_train_intensity)
y_test_pred = ada_reg.predict(x_test_embeddings)
# Threshold the predicted intensity at 0.5 to obtain class labels.
y_test_pred_classes = ['clickbait' if i >= 0.5 else 'no-clickbait' for i in y_test_pred]
y_test_actual_classes = list(test_df['label'].values)
results(y_test_intensity, y_test_pred, y_test_actual_classes, y_test_pred_classes)

MedAE: 0.15568692764027522
MSE: 0.03933567908290634
RMSE: 0.19833224418360806
Accuracy: 0.842979017754208
f1-score: 0.6316928069226608
