In [2]:
def load_dataset(
    file_name,
    base_dir='/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-clickbait-spoiling/',
):
    """Load one JSON-lines split and turn it into a binary classification frame.

    Each non-empty line must be a JSON object with the keys ``postText``
    (list with exactly one string), ``targetTitle`` (string),
    ``targetParagraphs`` (list of strings) and ``tags`` (list with exactly one
    of ``'phrase'``, ``'phrases'``, ``'passage'``, ``'multi'``).

    Entries tagged ``'multi'`` are dropped. For every other entry one row is
    produced:

    * ``text``   -- ``postText[0] + ' - ' + targetTitle + ' '.join(targetParagraphs)``
    * ``labels`` -- ``True`` for ``'phrase'``/``'phrases'``, ``False`` for ``'passage'``

    Args:
        file_name: Name of the ``.jsonl`` file inside ``base_dir``.
        base_dir: Directory that contains the split files. Defaults to the
            location the notebook was originally run against.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``labels`` (also
        when the file contains no usable entry).

    Raises:
        AssertionError: if an entry has not exactly one post text / tag, or
            carries an unknown tag.
    """
    import json
    import os

    import pandas as pd

    known_labels = ('phrase', 'phrases', 'passage', 'multi')
    positive_labels = ('phrase', 'phrases')

    records = []
    # Explicit UTF-8: the posts contain non-ASCII characters, so the result
    # must not depend on the platform's default encoding.
    with open(os.path.join(base_dir, file_name), encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue

            entry = json.loads(line)
            tweet = entry['postText']
            article_title = entry['targetTitle']
            article = ' '.join(entry['targetParagraphs'])
            label = entry['tags']

            assert len(tweet) == 1, 'line %d: expected exactly one postText, got %d' % (line_no, len(tweet))
            tweet = tweet[0]

            assert len(label) == 1, 'line %d: expected exactly one tag, got %d' % (line_no, len(label))
            label = label[0]

            assert label in known_labels, 'line %d: unexpected tag %r' % (line_no, label)

            if label == 'multi':
                continue

            # NOTE(review): there is no separator between the article title and
            # the article body. Kept byte-identical so the input format matches
            # whatever existing checkpoints were trained on -- confirm whether
            # the missing space is intended.
            records.append({
                'text': tweet + ' - ' + article_title + article,
                'labels': label in positive_labels,
            })

    # Passing the columns explicitly keeps the schema stable for empty inputs.
    return pd.DataFrame(records, columns=['text', 'labels'])
            
    
# Load the three splits (test, train, validation -- in that order) in one go.
test_dataset, train_dataset, validation_dataset = map(
    load_dataset,
    ('test.jsonl', 'train.jsonl', 'validation.jsonl'),
)

In [2]:
# Display the test split produced by load_dataset (columns: text, labels).
test_dataset

Unnamed: 0,text,labels
0,This simple household item saves lives - The S...,True
1,You'll Never Believe Who Robert Downey Jr.'s '...,True
2,THE NEWEST THEORY ON HIDDLESWIFT'S ROMANCE IS ...,False
3,"The little girl from ""Titanic"" is all grown up...",True
4,Why we never really get over that first love -...,False
...,...,...
821,Inside our three-month effort to attend Apple'...,False
822,Dad And Son Are Seconds From Assassination By ...,False
823,Kanye West Reveals His Favorite Song... Of All...,False
824,"""Pippi Longstocking"" star arrested - 'Pippi Lo...",True


In [3]:
# Display the training split produced by load_dataset (columns: text, labels).
train_dataset

Unnamed: 0,text,labels
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",False
1,NASA sets date for full recovery of ozone hole...,True
2,This is what makes employees happy -- and it's...,True
3,The perfect way to cook rice so that it's perf...,True
4,What happens if your new AirPods get lost or s...,False
...,...,...
2636,If You See A Purple Butterfly Sticker At The H...,False
2637,Has Facebook's video explosion completely shak...,False
2638,Cop Is Eating At A Chili's When Teen Hands Him...,False
2639,You need to see this Twitter account that pred...,True


In [4]:
# Display the validation split produced by load_dataset (columns: text, labels).
validation_dataset

Unnamed: 0,text,labels
0,Five Nights at Freddy’s Sequel Delayed for Wei...,False
1,Here’s how much you should be tipping your hai...,True
2,A man swallowed a microSD card and you won't b...,False
3,This popular soda could cure your hangovers sc...,True
4,The anytime snack you won't feel guilty about ...,True
...,...,...
652,"Dog Dies One Hour After Hiking With His Owner,...",False
653,This is what happens when you leave a hotel cl...,False
654,This Texas GOP elector announces that he won't...,True
655,WikiLeaks' Julian Assange Reported Dead - Wiki...,False


# Training

In [1]:
# Hyper-parameter grid: one argument dict per combination of learning rate,
# warm-up ratio, batch size and maximum sequence length (1 * 2 * 2 * 3 = 12).
# Everything else is identical across the configurations.
configurations = []

for learn_rate in [4e-5]:
    for warmup_ratio in [0.02, 0.06]:
        for batch_size in [8, 16]:
            for seq_length in [256, 384, 512]:
                configurations.append(dict(
                    overwrite_output_dir=True,
                    num_train_epochs=10,
                    fp16=False,
                    train_batch_size=batch_size,
                    gradient_accumulation_steps=4,
                    evaluate_during_training=True,
                    max_seq_length=seq_length,
                    learning_rate=learn_rate,
                    early_stopping_consider_epochs=True,
                    early_stopping_delta=0.01,
                    early_stopping_metric="acc",
                    early_stopping_metric_minimize=False,
                    early_stopping_patience=3,
                    evaluate_during_training_steps=331,
                    output_dir="outputs/",
                    warmup_ratio=warmup_ratio,
                    # 'warmup_steps': 0 would be the usual recommendation;
                    # warmup_ratio is its percentage-based equivalent and the
                    # two settings override each other, so only one is set.
                    save_steps=2000,
                    manual_seed=12345,
                ))

In [3]:
def train_model(config, num):
    """Fine-tune ``microsoft/deberta-large`` for one configuration.

    The run writes to ``outputs/deberta_concat_<num>``. If that directory
    already exists the configuration is treated as done and skipped.
    NOTE(review): an interrupted run also leaves the directory behind and is
    therefore skipped as well -- confirm that this is acceptable.

    Training uses the notebook-level ``train_dataset`` and evaluates on
    ``validation_dataset``; accuracy is passed as the additional ``acc``
    metric, which the configurations reference as ``early_stopping_metric``.

    Args:
        config: Argument dict for ``ClassificationModel``. It is NOT modified;
            a copy with the per-run ``output_dir`` is passed to the model, so
            the shared entries of ``configurations`` stay as they were defined.
        num: Index of the configuration, used to derive the output directory.

    Returns:
        None.
    """
    import os.path

    output_dir = "outputs/deberta_concat_" + str(num)

    if os.path.isdir(output_dir):
        print('Skip ', num)
        return

    # Deferred until after the skip check so that skipping stays cheap.
    from simpletransformers.classification import ClassificationModel
    # Import the metric explicitly: a bare ``import sklearn`` does not
    # guarantee that the ``sklearn.metrics`` submodule has been loaded.
    from sklearn.metrics import accuracy_score

    # Copy instead of writing into the caller's dict.
    run_config = dict(config, output_dir=output_dir)

    model = ClassificationModel("deberta", "microsoft/deberta-large", args=run_config)
    model.train_model(train_dataset, eval_df=validation_dataset, acc=accuracy_score)

In [4]:
# Train every configuration in order; train_model decides itself whether a
# configuration is skipped.
for num, config in enumerate(configurations):
    print('Running configuration number', num)

    train_model(config, num)

Running configuration number 0
Skip  0
Running configuration number 1
Skip  1
Running configuration number 2
Skip  2
Running configuration number 3
Skip  3
Running configuration number 4
Skip  4
Running configuration number 5
Skip  5
Running configuration number 6
Skip  6
Running configuration number 7
Skip  7
Running configuration number 8
Skip  8
Running configuration number 9
Skip  9
Running configuration number 10
Skip  10
Running configuration number 11
Skip  11


In [38]:
# Evaluate `model` on the validation split, passing accuracy as the extra `acc` metric.
# NOTE(review): none of the cells above this one defines `model` or `sklearn` at
# notebook level (train_model keeps both local) and the execution count (38) is
# out of sequence, so this cell appears to rely on leftover kernel state --
# confirm which checkpoint `model` referred to before trusting the numbers below.
model.eval_model(validation_dataset, acc=sklearn.metrics.accuracy_score)

  0%|          | 0/657 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/83 [00:00<?, ?it/s]

({'mcc': 0.5189907816932701,
  'tp': 264,
  'tn': 235,
  'fp': 87,
  'fn': 71,
  'auroc': 0.8332344488736442,
  'auprc': 0.8547997162455769,
  'acc': 0.7595129375951294,
  'eval_loss': 1.5449147986592473},
 array([[-4.50758696,  4.8231678 ],
        [-4.62452698,  4.63890123],
        [ 4.56198597, -4.33566332],
        ...,
        [ 0.98726308, -0.87757879],
        [ 3.27670145, -3.22399449],
        [-4.55029869,  4.87138462]]),
 [['Five Nights at Freddy’s Sequel Delayed for Weird Reason',
   'Here’s how much you should be tipping your hairdresser',
   "A man swallowed a microSD card and you won't believe what happened next!",
   'This popular soda could cure your hangovers scientists say:',
   "The anytime snack you won't feel guilty about eating",
   'You won\'t believe this stunning "Harry Potter" revelation about Professor McGonagall',
   "J.J. Abrams has an answer on if there will be a post-credits scene in the new 'Star Wars'",
   'Kristin Cavallari (@KristinCav) opens up abo

In [37]:
# Evaluate `model` on the test split, passing accuracy as the extra `acc` metric.
# NOTE(review): none of the cells above this one defines `model` or `sklearn` at
# notebook level (train_model keeps both local) and the execution count (37) is
# out of sequence, so this cell appears to rely on leftover kernel state --
# confirm which checkpoint `model` referred to before trusting the numbers below.
model.eval_model(test_dataset, acc=sklearn.metrics.accuracy_score)

  0%|          | 0/826 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/104 [00:00<?, ?it/s]

({'mcc': 0.5662373938122638,
  'tp': 338,
  'tn': 309,
  'fp': 94,
  'fn': 85,
  'auroc': 0.83468548533751,
  'auprc': 0.8406305749647641,
  'acc': 0.7832929782082324,
  'eval_loss': 1.4019140609235854},
 array([[-4.73007059,  4.75768614],
        [-4.66822481,  4.89313078],
        [ 4.12303448, -3.80747771],
        ...,
        [-4.3079958 ,  4.59369469],
        [-4.60104084,  4.89709044],
        [-4.45188284,  4.57753611]]),
 [])

# Model Selection with Validation data

In [4]:
from glob import glob
from simpletransformers.classification import ClassificationModel
import sklearn
import pandas as pd
from tqdm import tqdm
import os

# Score every saved checkpoint of every configuration on the validation and
# test splits and cache the result per configuration in
# <output dir>/eval-results.jsonl. Configurations that already have such a
# file are skipped.
for num, config in enumerate(configurations):
    directory = "outputs/deberta_concat_" + str(num) + "/"
    results_file = directory + 'eval-results.jsonl'

    if os.path.isfile(results_file):
        print('Skip ', num)
        continue

    checkpoints = glob(directory + "checkpoint-*")
    if not checkpoints:
        # Do not write an empty results file: it would make every later run
        # skip this configuration although nothing has been evaluated.
        print('No checkpoints found for', num)
        continue

    eval_rows = []
    for checkpoint in tqdm(checkpoints):
        model = ClassificationModel("deberta", checkpoint)

        valid_acc = model.eval_model(validation_dataset, acc=sklearn.metrics.accuracy_score)[0]['acc']
        test_acc = model.eval_model(test_dataset, acc=sklearn.metrics.accuracy_score)[0]['acc']

        eval_rows.append({
            "checkpoint": checkpoint,
            "valid_acc": valid_acc,
            "test_acc": test_acc,
            "config": config,
        })

    # Despite the .jsonl extension this is written with to_json's defaults
    # (no lines=True); the cell that reads the file back uses read_json's
    # defaults as well, so the format is left unchanged.
    pd.DataFrame(eval_rows).to_json(results_file)
        
#df = pd.DataFrame(df)
#df

Skip  0
Skip  1
Skip  2
Skip  3
Skip  4
Skip  5
Skip  6
Skip  7
Skip  8
Skip  9
Skip  10
Skip  11


In [6]:
# Read the cached evaluation results of all configurations and stack them
# into one frame (row labels repeat, since each file keeps its own index).
df = pd.concat([
    pd.read_json("outputs/deberta_concat_" + str(num) + "/eval-results.jsonl")
    for num in range(len(configurations))
])

df

Unnamed: 0,checkpoint,valid_acc,test_acc,config
0,outputs/deberta_concat_0/checkpoint-246-epoch-3,0.490107,0.487893,"{'overwrite_output_dir': True, 'num_train_epoc..."
1,outputs/deberta_concat_0/checkpoint-656-epoch-8,0.509893,0.512107,"{'overwrite_output_dir': True, 'num_train_epoc..."
2,outputs/deberta_concat_0/checkpoint-820-epoch-10,0.509893,0.512107,"{'overwrite_output_dir': True, 'num_train_epoc..."
3,outputs/deberta_concat_0/checkpoint-410-epoch-5,0.509893,0.512107,"{'overwrite_output_dir': True, 'num_train_epoc..."
4,outputs/deberta_concat_0/checkpoint-331,0.509893,0.512107,"{'overwrite_output_dir': True, 'num_train_epoc..."
...,...,...,...,...
6,outputs/deberta_concat_9/checkpoint-331,0.782344,0.785714,"{'overwrite_output_dir': True, 'num_train_epoc..."
7,outputs/deberta_concat_9/checkpoint-164-epoch-4,0.777778,0.794189,"{'overwrite_output_dir': True, 'num_train_epoc..."
8,outputs/deberta_concat_9/checkpoint-369-epoch-9,0.786910,0.790557,"{'overwrite_output_dir': True, 'num_train_epoc..."
9,outputs/deberta_concat_9/checkpoint-205-epoch-5,0.785388,0.797821,"{'overwrite_output_dir': True, 'num_train_epoc..."


In [None]:
# Prints a completion marker; the cell has no execution count (In [None]),
# i.e. it had not been run when the notebook was saved.
print('finished!')

In [8]:
# Keep only checkpoints whose path contains 'epoch' and rank them by
# validation accuracy, best first.
epoch_checkpoint_mask = df['checkpoint'].str.contains('epoch')
df[epoch_checkpoint_mask].sort_values('valid_acc', ascending=False)

Unnamed: 0,checkpoint,valid_acc,test_acc,config
10,outputs/deberta_concat_9/checkpoint-246-epoch-6,0.808219,0.790557,"{'overwrite_output_dir': True, 'num_train_epoc..."
0,outputs/deberta_concat_1/checkpoint-246-epoch-3,0.806697,0.792978,"{'overwrite_output_dir': True, 'num_train_epoc..."
11,outputs/deberta_concat_6/checkpoint-164-epoch-2,0.803653,0.791768,"{'overwrite_output_dir': True, 'num_train_epoc..."
2,outputs/deberta_concat_1/checkpoint-820-epoch-10,0.802131,0.788136,"{'overwrite_output_dir': True, 'num_train_epoc..."
2,outputs/deberta_concat_7/checkpoint-820-epoch-10,0.800609,0.803874,"{'overwrite_output_dir': True, 'num_train_epoc..."
0,outputs/deberta_concat_9/checkpoint-123-epoch-3,0.799087,0.776029,"{'overwrite_output_dir': True, 'num_train_epoc..."
9,outputs/deberta_concat_1/checkpoint-492-epoch-6,0.799087,0.786925,"{'overwrite_output_dir': True, 'num_train_epoc..."
8,outputs/deberta_concat_1/checkpoint-738-epoch-9,0.797565,0.784504,"{'overwrite_output_dir': True, 'num_train_epoc..."
7,outputs/deberta_concat_7/checkpoint-328-epoch-4,0.797565,0.789346,"{'overwrite_output_dir': True, 'num_train_epoc..."
1,outputs/deberta_concat_1/checkpoint-656-epoch-8,0.797565,0.791768,"{'overwrite_output_dir': True, 'num_train_epoc..."


In [3]:
# Based on the validation data above, we select outputs/deberta_concat_9/checkpoint-246-epoch-6
from simpletransformers.classification import ClassificationModel
import sklearn

# Load the checkpoint chosen on the validation data and score it on the test split.
selected_checkpoint = "outputs/deberta_concat_9/checkpoint-246-epoch-6"
model = ClassificationModel("deberta", selected_checkpoint)
model.eval_model(test_dataset, acc=sklearn.metrics.accuracy_score)

  0%|          | 0/826 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/104 [00:00<?, ?it/s]

({'mcc': 0.5841171980338219,
  'tp': 318,
  'tn': 335,
  'fp': 68,
  'fn': 105,
  'auroc': 0.8526242307985616,
  'auprc': 0.8662257961187407,
  'acc': 0.7905569007263923,
  'eval_loss': 1.378462234868913},
 array([[-4.72781277,  4.20952988],
        [-4.04980993,  3.58672714],
        [ 4.18360662, -3.76707053],
        ...,
        [-4.8248167 ,  4.21302795],
        [-4.78792763,  4.20505524],
        [-4.62253475,  3.96327543]]),
 [])

In [7]:
# Print validation accuracy and configuration of the five best
# 'epoch' checkpoints (ranked by validation accuracy, best first).
top_epoch_checkpoints = (
    df[df['checkpoint'].str.contains('epoch')]
    .sort_values('valid_acc', ascending=False)
    .head(5)
)
for _, row in top_epoch_checkpoints.iterrows():
    print(row['valid_acc'], row['config'])

0.8082191781 {'overwrite_output_dir': True, 'num_train_epochs': 10, 'fp16': False, 'train_batch_size': 16, 'gradient_accumulation_steps': 4, 'evaluate_during_training': True, 'max_seq_length': 256, 'learning_rate': 4e-05, 'early_stopping_consider_epochs': True, 'early_stopping_delta': 0.01, 'early_stopping_metric': 'acc', 'early_stopping_metric_minimize': False, 'early_stopping_patience': 3, 'evaluate_during_training_steps': 331, 'output_dir': 'outputs/', 'warmup_ratio': 0.06, 'save_steps': 2000, 'manual_seed': 12345}
0.8066971081000001 {'overwrite_output_dir': True, 'num_train_epochs': 10, 'fp16': False, 'train_batch_size': 8, 'gradient_accumulation_steps': 4, 'evaluate_during_training': True, 'max_seq_length': 384, 'learning_rate': 4e-05, 'early_stopping_consider_epochs': True, 'early_stopping_delta': 0.01, 'early_stopping_metric': 'acc', 'early_stopping_metric_minimize': False, 'early_stopping_patience': 3, 'evaluate_during_training_steps': 331, 'output_dir': 'outputs/deberta_concat