In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

In [2]:
import tensorflow as tf
import tensorflow_hub as hub

In [3]:
# Disabled alternative loader (Excel files, including a dev split):
# train_df=pd.read_excel('../input/ire-major-project-data/train.xlsx')
# dev_df=pd.read_excel('../input/ire-major-project-data/dev.xlsx')
# test_df=pd.read_excel('../input/ire-major-project-data/test.xlsx')

# Load the tab-separated train/test splits. `names` is passed without `header`,
# so pandas reads the first line of each file as data (assumes the files carry
# no header row -- TODO confirm).
tsv_columns=['SNO','Intensity','Text']
train_df=pd.read_csv('../input/ire-major-data-v2/en_train_pt.tsv',sep='\t',names=tsv_columns)
test_df=pd.read_csv('../input/ire-major-data-v2/en_test_pt.tsv',sep='\t',names=tsv_columns)

In [4]:
from transformers import RobertaTokenizer

# Pretrained RoBERTa tokenizer; return_length() calls it to count tokens.
roberta_checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [5]:
def return_length(df, text_tokenizer=None):
    """Return the number of token ids produced for every entry of ``df['Text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Text'`` column; each value is passed to the tokenizer.
    text_tokenizer : callable, optional
        Callable mapping one text to a mapping with an ``'input_ids'`` sequence.
        Defaults to the module-level ``tokenizer``.

    Returns
    -------
    list of int
        One length per row, in row order. The count is whatever the tokenizer
        puts in ``input_ids`` (no truncation is requested here).
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer
    # Only the length is needed, so ask for plain id lists instead of building
    # a framework tensor (return_tensors="pt") for every single text.
    return [len(tok(text)['input_ids']) for text in df['Text']]

In [6]:
 train_df['#tokens']=return_length(train_df)

Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors


In [7]:
 test_df['#tokens']=return_length(test_df)

In [8]:
# Rows whose text has MAX_TOKENS or more tokens are dropped from both splits.
MAX_TOKENS=50
# .copy() gives each filtered frame its own data, so columns added to it later
# are unambiguous writes and not assignments into a slice of the larger frame.
train_df=train_df[train_df['#tokens']<MAX_TOKENS].copy()
test_df=test_df[test_df['#tokens']<MAX_TOKENS].copy()
print('Length of Train_df:',len(train_df))
print('Length of Test_df:',len(test_df))

Length of Train_df: 17497
Length of Test_df: 4337


In [9]:
# Binarise the intensity score: 0.5 and above is 'clickbait', anything else
# is 'no-clickbait'.
train_df['label']=np.where(train_df['Intensity']>=0.5,'clickbait','no-clickbait')
test_df['label']=np.where(test_df['Intensity']>=0.5,'clickbait','no-clickbait')

In [10]:
# Universal Sentence Encoder v4 from TF Hub; `embed` is called on lists of strings.
use_module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(use_module_url)

2022-10-20 11:44:54.186546: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-20 11:44:54.187642: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-20 11:44:54.188351: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-20 11:44:54.189255: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [11]:
def read_word_embedding(sentence):
    """Embed a single sentence with the module-level ``embed`` callable.

    The sentence is wrapped in a one-element list, passed to ``embed``, and
    the first (only) entry of the result is returned.
    """
    batch_output = embed([sentence])
    return batch_output[0]

In [12]:
# Embed every training text, in row order.
x_train_embeddings=[]
for row_idx, text in enumerate(tqdm(train_df['Text'], total=len(train_df))):
    try:
        x_train_embeddings.append(read_word_embedding(text))
    except Exception as err:
        # Fail loudly with the offending row instead of silently stopping early:
        # a truncated list would no longer line up with the training targets.
        raise RuntimeError(f"Embedding failed for training row {row_idx}: {text!r}") from err

100%|██████████| 17497/17497 [01:56<00:00, 150.59it/s]


In [13]:
# Stack the per-text embeddings into one array and pull the regression
# targets out of the frame as a NumPy array.
x_train_embeddings=np.array(x_train_embeddings)
y_train_intensity=np.array(train_df['Intensity'].tolist())

In [14]:
# x_dev_embeddings=[read_word_embedding(sentence)[0] for sentence in tqdm(dev_df['text'])]
# y_dev_intensity=list(dev_df['intensity'].values)
# y_dev_actual_label=list(dev_df['class'].values)

In [15]:
# Embed every test text, in row order.
x_test_embeddings=[]
for row_idx, text in enumerate(tqdm(test_df['Text'], total=len(test_df))):
    try:
        x_test_embeddings.append(read_word_embedding(text))
    except Exception as err:
        # Fail loudly with the offending row instead of silently stopping early:
        # a truncated list would no longer line up with the test targets.
        raise RuntimeError(f"Embedding failed for test row {row_idx}: {text!r}") from err

100%|██████████| 4337/4337 [00:28<00:00, 150.16it/s]


In [16]:
# Stack the per-text embeddings and extract the test targets (scores and
# string labels) as NumPy arrays.
x_test_embeddings=np.array(x_test_embeddings)
y_test_intensity=np.array(test_df['Intensity'].tolist())
y_test_actual_label=np.array(test_df['label'].tolist())

#### Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import median_absolute_error,mean_squared_error, f1_score, accuracy_score

In [18]:
def results(y_true,y_pred,y_actual_classes,y_pred_classes):
    """Print regression and classification metrics for one set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted scores; scored with median absolute error,
        MSE and RMSE.
    y_actual_classes, y_pred_classes : array-like
        Reference and predicted class labels; scored with accuracy and F1,
        where ``'clickbait'`` is the positive class.

    Returns
    -------
    None
        The five metrics are printed, not returned.
    """
    Medae = median_absolute_error(y_true,y_pred)
    mse = mean_squared_error(y_true,y_pred)
    # Derive RMSE from the MSE already computed instead of passing
    # `squared=False`, which newer scikit-learn releases deprecate/remove.
    rmse = np.sqrt(mse)
    accuracy=accuracy_score(y_actual_classes,y_pred_classes)
    f1=f1_score(y_actual_classes,y_pred_classes,pos_label='clickbait')

    print("MedAE:",Medae)
    print("MSE:",mse)
    print("RMSE:",rmse)
    print("Accuracy:",accuracy)
    print("f1-score:",f1)

In [19]:
# Ordinary least squares from the text embeddings to the intensity score.
linear_reg = LinearRegression()
linear_reg = linear_reg.fit(x_train_embeddings, y_train_intensity)

In [20]:
# Predict test intensities, then threshold them at 0.5 to get class labels
# comparable with the reference labels stored in test_df.
y_test_pred = linear_reg.predict(x_test_embeddings)
y_test_pred_classes = np.where(y_test_pred>=0.5,'clickbait','no-clickbait').tolist()
y_test_actual_classes = test_df['label'].tolist()

In [21]:
# Report the test-set metrics of the linear regression model.
results(
    y_true=y_test_intensity,
    y_pred=y_test_pred,
    y_actual_classes=y_test_actual_classes,
    y_pred_classes=y_test_pred_classes,
)

MedAE: 0.11350554425177795
MSE: 0.0310764670912838
RMSE: 0.1762851868175083
Accuracy: 0.842979017754208
f1-score: 0.6356340288924559


#### Ridge Regression

In [22]:
from sklearn.linear_model import Ridge

In [23]:
# Ridge regression (alpha=2) on the embeddings, then threshold the predicted
# intensities at 0.5 to obtain class labels.
ridge_reg = Ridge(alpha=2)
ridge_reg.fit(x_train_embeddings, y_train_intensity)
y_test_pred = ridge_reg.predict(x_test_embeddings)
y_test_pred_classes = np.where(y_test_pred>=0.5,'clickbait','no-clickbait').tolist()
y_test_actual_classes = test_df['label'].tolist()

In [24]:
# Report the test-set metrics of the ridge regression model.
results(
    y_true=y_test_intensity,
    y_pred=y_test_pred,
    y_actual_classes=y_test_actual_classes,
    y_pred_classes=y_test_pred_classes,
)

MedAE: 0.1142291108500525
MSE: 0.030738736215767264
RMSE: 0.1753246594628584
Accuracy: 0.8418261471062947
f1-score: 0.632762312633833


#### Gradient Boosted Regression

In [25]:
from sklearn.ensemble import GradientBoostingRegressor

In [26]:
# Gradient-boosted regression trees with default hyper-parameters.
# The model gets its own name (it is not a ridge model) and a fixed
# random_state, like the other seeded ensemble cells, so reruns are repeatable.
gb_reg = GradientBoostingRegressor(random_state=0).fit(x_train_embeddings, y_train_intensity)
y_test_pred = gb_reg.predict(x_test_embeddings)
# Threshold predicted intensities at 0.5 to obtain class labels.
y_test_pred_classes=['clickbait' if score>=0.5 else 'no-clickbait' for score in y_test_pred]
y_test_actual_classes=list(test_df['label'].values)
results(y_test_intensity,y_test_pred,y_test_actual_classes,y_test_pred_classes)

MedAE: 0.11891341195347027
MSE: 0.03146910043147234
RMSE: 0.17739532246221246
Accuracy: 0.8367535162554761
f1-score: 0.6217948717948718


#### Random Forest Regression

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [28]:
# Random forest of depth-2 trees with a fixed seed. Stored under its own name
# so it does not overwrite the ridge model held in `ridge_reg`.
rf_reg = RandomForestRegressor(max_depth=2, random_state=0).fit(x_train_embeddings, y_train_intensity)
y_test_pred = rf_reg.predict(x_test_embeddings)
# Threshold predicted intensities at 0.5 to obtain class labels.
y_test_pred_classes=['clickbait' if score>=0.5 else 'no-clickbait' for score in y_test_pred]
y_test_actual_classes=list(test_df['label'].values)
results(y_test_intensity,y_test_pred,y_test_actual_classes,y_test_pred_classes)

MedAE: 0.15534439529352834
MSE: 0.044830509962275604
RMSE: 0.21173216562977767
Accuracy: 0.7897163938206133
f1-score: 0.4123711340206186


#### AdaBoost Regression

In [29]:
from sklearn.ensemble import AdaBoostRegressor

In [30]:
# AdaBoost regressor with 100 estimators and a fixed seed. Stored under its own
# name so it does not overwrite the ridge model held in `ridge_reg`.
ada_reg = AdaBoostRegressor(random_state=0, n_estimators=100).fit(x_train_embeddings, y_train_intensity)
y_test_pred = ada_reg.predict(x_test_embeddings)
# Threshold predicted intensities at 0.5 to obtain class labels.
y_test_pred_classes=['clickbait' if score>=0.5 else 'no-clickbait' for score in y_test_pred]
y_test_actual_classes=list(test_df['label'].values)
results(y_test_intensity,y_test_pred,y_test_actual_classes,y_test_pred_classes)

MedAE: 0.16709814784708554
MSE: 0.04396743016103345
RMSE: 0.20968411995435765
Accuracy: 0.8298362923679963
f1-score: 0.5601907032181168
