In [1]:
#!pip install simpletransformers

In [2]:
#!pip install torch

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
from sklearn.model_selection import *

from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax

from simpletransformers.classification.classification_model import ClassificationModel
from sklearn.metrics import mean_squared_error as mse

In [4]:
TRAIN_PATH = 'Train.csv'
TEST_PATH = 'Test.csv'
SAMPLE_SUB_PATH = 'SampleSubmission.csv'

In [5]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
ID_COL, TARGET_COL = sample_sub.columns.tolist()

In [6]:
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0,1.0


In [7]:

train['safe_text'].apply(lambda x: len(x)).describe()

count    10001.000000
mean        99.902810
std         29.893888
min          1.000000
25%         79.000000
50%        107.000000
75%        122.000000
max        153.000000
Name: safe_text, dtype: float64

In [8]:
train['label'].value_counts()

 0    4910
 1    4053
-1    1038
Name: label, dtype: int64

In [9]:
train['label'][~train['label'].isin([0, -1, 1])] = -1

In [10]:
train.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [11]:
train = train.dropna()

In [12]:
train.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [13]:
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [14]:
test.isnull().sum()

tweet_id     0
safe_text    0
dtype: int64

In [15]:
test['safe_text'] = test['safe_text'].fillna('xxxxxx')

In [16]:
test.isnull().sum()

tweet_id     0
safe_text    0
dtype: int64

**Lets see some of the tweets of each kind**

**1. Positive**

In [17]:
train[train['label'] == 1]['safe_text'].values[:5]

array(["I'm 100% thinking of devoting my career to proving autism isn't caused by vaccines due to the IDIOTIC posts I've seen about World Autism Day",
       '<user> a nearly 67 year old study when mental health studies and vaccines were relatively in their infancies that has been refuted?',
       'Study of more than 95,000 kids finds no link between MMR vaccine and autism <url>',
       'psa: VACCINATE YOUR FUCKING KIDS',
       'Coughing extra on the shuttle and everyone thinks I have the measles. ðŸ˜‚ #VaccinateYourKids'],
      dtype=object)

**2. Neutral**

In [18]:
train[train['label'] == 0]['safe_text'].values[:5]

array(['Me &amp; The Big Homie meanboy3000 #MEANBOY #MB #MBS #MMR #STEGMANLIFE @ Stegman St. <url>',
       'Thanks to <user> Catch me performing at La Nuit NYC 1134 1st ave. Show starts at 6! #jennifair #mmrâ€¦ <url>',
       "<user> @ this point I have 2 text, butw/Bon Jovi cover playin @ Alibi's hope U can come out 2 MMR BBQ<user> will b there!",
       'My prediction, vaccine exemption in Arizona will end soon. To much money is being lost by big pharma.',
       '1$Mug Noche <user> #mmr #mixmasterrod #dcdj #mmr   @ Mad Hatter <url>'],
      dtype=object)

**3. Negative**

In [19]:
train[train['label'] == -1]['safe_text'].values[:5]

array(['#whatcausesautism VACCINES, DO NOT VACCINATE YOUR CHILD',
       "I mean if they immunize my kid with something that won't secretly kill him years down the line then I'm all for it, but I don't trust that",
       '<user> #CDC lied and hid data that black boys have an 340% uncreased risk of developing autism after MMR #CDCwhistleblower help!?!?',
       '<user> vaccines causing autism',
       '<user> <user> Other than that, his defense is not against vaccines being harmful - but the American life being unhealthy.'],
      dtype=object)

Lets go to the modelling part. **Simple Transformers** is extremely simple to use, and switching architectures requires only the change in argument name.

In [20]:
def get_model(model_type, model_name, n_epochs = 2, train_batch_size = 112, eval_batch_size = 144, seq_len = 134, lr = 2e-5):
    model = ClassificationModel(model_type, model_name,num_labels=1, use_cuda=False, args={'train_batch_size':train_batch_size,
                                                                         "eval_batch_size": eval_batch_size,
                                                                         'reprocess_input_data': True,
                                                                         'overwrite_output_dir': True,
                                                                         'fp16': False,
                                                                         'do_lower_case': False,
                                                                         'num_train_epochs': n_epochs,
                                                                         'max_seq_length': seq_len,
                                                                         'regression': True,
                                                                         'manual_seed': 2,
                                                                         "learning_rate":lr,
                                                                         "use_cuda": False,
                                                                         "save_eval_checkpoints": False,
                                                                         "save_model_every_epoch": False,})
    return model 

In [21]:
tmp = pd.DataFrame()
tmp['text'] = train['safe_text']
tmp['labels'] = train['label']
tmp_test = test[['safe_text']].rename({'safe_text': 'text'}, axis=1)
tmp_test['labels'] = 0
tmp_trn, tmp_val = train_test_split(tmp, test_size=0.15, random_state=2)

In [22]:
cc = ClassificationModel()
cc

TypeError: __init__() missing 2 required positional arguments: 'model_type' and 'model_name'

**Model A: Roberta Base 3 Epochs**

In [24]:
model = get_model('roberta', 'roberta-base', n_epochs=3)
model.train_model(tmp_trn)
preds_val = model.eval_model(tmp_val)[1]
preds_val = np.clip(preds_val, -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")
test_preds = model.eval_model(tmp_test)[1]
test_preds = np.clip(test_preds, -1, 1)
pv_1 = preds_val
pt_1 = test_preds

OSError: Unable to load weights from pytorch checkpoint file. If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. 

**Model B: Roberta Large 1 Epoch**

In [0]:
model = get_model('roberta', 'roberta-large', n_epochs=1, train_batch_size=16, eval_batch_size=16)
model.train_model(tmp_trn)
preds_val = model.eval_model(tmp_val)[1]
preds_val = np.clip(preds_val, -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")
test_preds = model.eval_model(tmp_test)[1]
test_preds = np.clip(test_preds, -1, 1)
pv_2 = preds_val
pt_2 = test_preds

HBox(children=(IntProgress(value=0, description='Downloading', max=482, style=ProgressStyle(description_width=â€¦




HBox(children=(IntProgress(value=0, description='Downloading', max=1425941629, style=ProgressStyle(descriptionâ€¦




HBox(children=(IntProgress(value=0, description='Downloading', max=898823, style=ProgressStyle(description_widâ€¦




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_widâ€¦




HBox(children=(IntProgress(value=0, max=8499), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initialâ€¦

HBox(children=(IntProgress(value=0, description='Current iteration', max=532, style=ProgressStyle(description_â€¦

Running loss: 0.019545



HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


RMSE: 0.4923375270272207


HBox(children=(IntProgress(value=0, max=5177), HTML(value='')))




HBox(children=(IntProgress(value=0, max=324), HTML(value='')))




**Model C: Roberta Large 2 Epochs**

In [0]:
model = get_model('roberta', 'roberta-large', n_epochs=2, train_batch_size=16, eval_batch_size=16, lr = 2e-5)
model.train_model(tmp_trn)
preds_val = model.eval_model(tmp_val)[1]
preds_val = np.clip(preds_val, -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")
test_preds = model.eval_model(tmp_test)[1]
test_preds = np.clip(test_preds, -1, 1)
pv_3 = preds_val
pt_3 = test_preds

HBox(children=(IntProgress(value=0, max=8499), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initialâ€¦

HBox(children=(IntProgress(value=0, description='Current iteration', max=532, style=ProgressStyle(description_â€¦

Running loss: 0.208412


HBox(children=(IntProgress(value=0, description='Current iteration', max=532, style=ProgressStyle(description_â€¦

Running loss: 0.071644



HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


RMSE: 0.46936953979596896


HBox(children=(IntProgress(value=0, max=5177), HTML(value='')))




HBox(children=(IntProgress(value=0, max=324), HTML(value='')))




**Model D: Roberta Large 3 Epochs**

In [0]:
model = get_model('roberta', 'roberta-large', n_epochs=3, train_batch_size=16, eval_batch_size=16, lr = 1e-5)
model.train_model(tmp_trn)
preds_val = model.eval_model(tmp_val)[1]
preds_val = np.clip(preds_val, -1, 1)
print(f"RMSE: {mse(tmp_val['labels'], preds_val)**0.5}")
test_preds = model.eval_model(tmp_test)[1]
test_preds = np.clip(test_preds, -1, 1)
pv_4 = preds_val
pt_4 = test_preds

HBox(children=(IntProgress(value=0, max=8499), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initialâ€¦

HBox(children=(IntProgress(value=0, description='Current iteration', max=532, style=ProgressStyle(description_â€¦

Running loss: 0.338560


HBox(children=(IntProgress(value=0, description='Current iteration', max=532, style=ProgressStyle(description_â€¦

Running loss: 0.239610


HBox(children=(IntProgress(value=0, description='Current iteration', max=532, style=ProgressStyle(description_â€¦

Running loss: 0.331663



HBox(children=(IntProgress(value=0, max=1500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


RMSE: 0.4817205289380705


HBox(children=(IntProgress(value=0, max=5177), HTML(value='')))




HBox(children=(IntProgress(value=0, max=324), HTML(value='')))




**Blending of Different Models**

In [0]:
pv = ((pv_1 * 0.3 + pv_2 * 0.7) * 0.3 + pv_3*0.7)*0.65 + pv_4*0.35
print(f"RMSE: {mse(tmp_val['labels'], pv)**0.5}")

RMSE: 0.4574547440807413


In [0]:
tp = ((pt_1 * 0.3 + pt_2 * 0.7) * 0.3 + pt_3*0.7)*0.65 + pt_4*0.35

In [0]:
pd.Series(tp).describe()

count    5177.000000
mean        0.350946
std         0.509943
min        -0.998366
25%         0.016573
50%         0.236499
75%         0.885619
max         1.000000
dtype: float64

In [0]:
final_test_preds = tp
preds_df_final = test[[ID_COL]]
preds_df_final[TARGET_COL] = final_test_preds
SUB_FILE_NAME = 'roberta_ensemble.csv'
preds_df_final.to_csv(SUB_FILE_NAME, index=False)

In [0]:
preds_df_final.head()

Unnamed: 0,tweet_id,label
0,00BHHHP1,-0.650443
1,00UNMD0E,0.37428
2,01AXPTJF,0.056176
3,01HOEQJW,0.92869
4,01JUKMAO,0.255095


In [0]:
preds_df_final.tail()

Unnamed: 0,tweet_id,label
5172,ZXVVNC5O,0.957115
5173,ZYIANVI8,0.080267
5174,ZYITEHAH,0.529573
5175,ZZ3BMBTG,1.0
5176,ZZIYCVNH,-0.421558


In [0]:
preds_df_final.shape

(5177, 2)

In [0]:
preds_df_final['label'].describe()

count    5177.000000
mean        0.350946
std         0.509943
min        -0.998366
25%         0.016573
50%         0.236499
75%         0.885619
max         1.000000
Name: label, dtype: float64

In [0]:
preds_df_final.isnull().sum()

tweet_id    0
label       0
dtype: int64

In [0]:
from google.colab import files
files.download(SUB_FILE_NAME)