In [1]:
def load_dataset(file_name):
    import pandas as pd
    import json
    
    df = []
    with open('/kaggle/input/dataset/' + file_name) as f:
        for i in f:
            i = json.loads(i)
            tweet = i['postText']
            label = i['tags']
            
            assert len(tweet) == 1
            tweet = tweet[0]
            
            assert len(label) == 1
            label = label[0]
            
            if label not in ['phrase', 'phrases', 'passage', 'multi']:
                print(label)
                
            assert label in ['phrase', 'phrases', 'passage', 'multi']
            
            if label == 'multi':
                continue
            
            df += [{'text': tweet, 'labels': (label == 'phrase' or label == 'phrases')}]

    return pd.DataFrame(df)   
            
    
test_dataset = load_dataset('test.jsonl')
train_dataset = load_dataset('train.jsonl')
validation_dataset = load_dataset('validation.jsonl')

In [2]:
import sklearn

args = {
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "fp16": False,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "evaluate_during_training": True,
    "max_seq_length": 64,
    "learning_rate": 2e-5,
    "early_stopping_consider_epochs": True,
    "early_stopping_delta": 0.01,
    "early_stopping_metric": "acc",
    "early_stopping_metric_minimize": False,
    "early_stopping_patience": 3,
    "evaluate_during_training_steps": 331,
    "output_dir": "outputs/",
    'warmup_ratio': 0.06,
    # 'warmup_steps': 0, # usually empfohlen, warmup_ratio ist prozentuales äquivalent 
    #                    # <> überschreiben sich gegenseitig
    'save_steps': 2000,
    "manual_seed": 12345
}

# Training

In [9]:
configurations = []

for learn_rate in [1e-5, 4e-5, 1e-4]:
    for warumup_ratio in [0.02, 0.06, 0.1]:
        for batch_size in [8, 16, 32]:
            configurations += [{
                "overwrite_output_dir": True,
                "num_train_epochs": 2,
                "fp16": False,
                "train_batch_size": batch_size,
                "gradient_accumulation_steps": 4,
                "evaluate_during_training": True,
                "max_seq_length": 64,
                "learning_rate": learn_rate,
                "early_stopping_consider_epochs": True,
                "early_stopping_delta": 0.01,
                "early_stopping_metric": "acc",
                "early_stopping_metric_minimize": False,
                "early_stopping_patience": 3,
                "evaluate_during_training_steps": 331,
                "output_dir": "outputs/",
                'warmup_ratio': warumup_ratio,
                # 'warmup_steps': 0, # usually empfohlen, warmup_ratio ist prozentuales äquivalent 
                #                    # <> überschreiben sich gegenseitig
                'save_steps': 2000,
                "manual_seed": 12345
            }]

In [10]:
def train_model(config, num):
    from simpletransformers.classification import ClassificationModel
    import sklearn
    
    config["output_dir"] = "outputs/bert_" + str(num)
    
    model = ClassificationModel("bert", "bert-large-cased", args = config)
    # model.train_model(train_dataset)
    model.train_model(train_dataset, eval_df=validation_dataset, acc=sklearn.metrics.accuracy_score)

In [13]:
for config, num in zip(configurations, range(len(configurations))):
    print('Running configuration number', num)
    
    train_model(config, num)

Running configuration number 0


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

  0%|          | 0/2641 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

Running configuration number 1


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

  0%|          | 0/2641 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/166 [00:00<?, ?it/s]

RuntimeError: [enforce fail at inline_container.cc:300] . unexpected pos 2247958336 vs 2247958224

In [3]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit
  Downloading streamlit-1.16.0-py2.py3-none-any.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pydeck>=0.1.dev5
  Downloading pydeck-0.8.0-py2.py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
Collecting watchdog
  Downloading watchdog-2.2.1-py3-none-manylinux2014_x86_64.whl (78 k

In [12]:
!rm -rf /kaggle/working/*

In [5]:
from simpletransformers.classification import ClassificationModel
model = ClassificationModel("bert", "bert-large-cased", args = args)
# model.train_model(train_dataset)
model.train_model(train_dataset, eval_df=validation_dataset, acc=sklearn.metrics.accuracy_score)

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

  0%|          | 0/2641 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

(246,
 defaultdict(list,
             {'global_step': [82, 164, 246],
              'train_loss': [0.8940210938453674,
               0.5410407185554504,
               0.19847697019577026],
              'mcc': [0.4077844354538989,
               0.39317414650060994,
               0.5148255065876252],
              'tp': [124, 175, 151],
              'tn': [156, 103, 153],
              'fp': [38, 91, 41],
              'fn': [84, 33, 57],
              'auroc': [0.7545350911974624,
               0.810889175257732,
               0.8257335448057097],
              'auprc': [0.7605866702929839,
               0.8061887862514157,
               0.8246494646205694],
              'acc': [0.6965174129353234,
               0.6915422885572139,
               0.7562189054726368],
              'eval_loss': [0.5923164486885071,
               0.5571625472283831,
               0.5141485575951782]}))

In [6]:
model.eval_model(validation_dataset, acc=sklearn.metrics.accuracy_score)

  0%|          | 0/402 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/51 [00:00<?, ?it/s]

({'mcc': 0.5148255065876252,
  'tp': 151,
  'tn': 153,
  'fp': 41,
  'fn': 57,
  'auroc': 0.8257335448057097,
  'auprc': 0.8246494646205694,
  'acc': 0.7562189054726368,
  'eval_loss': 0.5141485575951782},
 array([[ 9.46247637e-01, -4.57936734e-01],
        [-1.09892476e+00,  7.86266148e-01],
        [ 9.77843404e-01, -5.77914298e-01],
        [-1.11956739e+00,  9.54282224e-01],
        [-1.08632398e+00,  9.53849614e-01],
        [ 5.58940291e-01, -2.90020257e-01],
        [-1.61085844e-01, -1.88955098e-01],
        [-4.83724862e-01, -2.66330302e-01],
        [-1.19590652e+00,  9.42048728e-01],
        [ 1.30341744e+00, -7.66175449e-01],
        [ 1.09556937e+00, -6.41015053e-01],
        [ 1.35152268e+00, -6.78265095e-01],
        [-5.14256358e-02, -3.89870971e-01],
        [ 1.11067164e+00, -4.68116969e-01],
        [-1.48599565e-01, -4.23772514e-01],
        [-9.43289757e-01,  6.82071030e-01],
        [ 1.43682849e+00, -6.62687242e-01],
        [ 1.34732747e+00, -6.71936929e-01],
  

In [7]:
model.eval_model(test_dataset, acc=sklearn.metrics.accuracy_score)

  0%|          | 0/255 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/32 [00:00<?, ?it/s]

({'mcc': 0.39856084799171543,
  'tp': 81,
  'tn': 97,
  'fp': 31,
  'fn': 46,
  'auroc': 0.7720841535433071,
  'auprc': 0.7931830996136036,
  'acc': 0.6980392156862745,
  'eval_loss': 0.6080340659245849},
 array([[-5.68755805e-01,  2.26945952e-02],
        [ 7.62211442e-01, -5.39016902e-01],
        [-9.39116597e-01,  3.25912297e-01],
        [ 7.76860714e-01, -8.16382945e-01],
        [-1.10611570e+00,  1.01324689e+00],
        [ 1.32492995e+00, -6.30910218e-01],
        [ 1.20069873e+00, -5.97305119e-01],
        [-1.51504651e-01, -1.46450460e-01],
        [ 1.38103449e+00, -5.65858603e-01],
        [ 1.20355344e+00, -6.62168324e-01],
        [-8.97327662e-01,  2.21008435e-01],
        [-1.20680177e+00,  1.00803769e+00],
        [ 1.14105535e+00, -6.98316693e-01],
        [-1.14934814e+00,  1.01342440e+00],
        [-9.51572776e-01,  1.79424092e-01],
        [ 1.33166182e+00, -5.28566360e-01],
        [ 3.01265508e-01, -6.65626943e-01],
        [-1.09362435e+00,  1.01619995e+00],
   

# Model Selection with Validation data

In [None]:
from glob import glob
from simpletransformers.classification import ClassificationModel
import sklearn
import pandas as pd
from tqdm import tqdm

df = []

for config, num in zip(configurations, range(len(configurations))):
    directory = "outputs/bert_" + str(num) + "/"
    
    for checkpoint in tqdm(glob(directory + "checkpoint-*")):
        # print(checkpoint)
        model = ClassificationModel("bert", checkpoint)
        
        valid_acc = model.eval_model(validation_dataset, acc=sklearn.metrics.accuracy_score)[0]['acc']
        test_acc = model.eval_model(test_dataset, acc=sklearn.metrics.accuracy_score)[0]['acc']
        
        df += [{"checkpoint": checkpoint, "valid_acc": valid_acc, "test_acc": test_acc,
               "config": config}]
        
df = pd.DataFrame(df)
df

In [None]:
df.to_json('outputs/bert_eval_results.csv')

In [None]:
print('finished!')

In [None]:
df[df['checkpoint'].str.contains('epoch')].sort_values('valid_acc', ascending=False)

In [None]:
df.sort_values('test_acc', ascending=False)

In [None]:
for _, i in [i for i in df[df['checkpoint'].str.contains('epoch')].sort_values('valid_acc', ascending=False).iterrows()][:5]:
    print(i['valid_acc'], i['config'])