In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 4.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 76.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 51.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [2]:
# Mount Google Drive into the Colab filesystem so that files under
# /content/drive (data, helper modules, saved models) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Project root on the mounted Drive. Other cells build paths by plain string
# concatenation (PATH + "train.csv", PATH + 'utils', ...), so the trailing
# slash is required.
PATH = '/content/drive/MyDrive/NLP Projects/disaster_tweets/'

In [4]:
import sys
# Put the project's `utils` folder first on the module search path; the helper
# modules imported in the following cell are presumably located there.
sys.path.insert(0, PATH+'utils')

In [5]:
import model_utils as model_utils
import preprocessing as pre
import tokenize_utils as tkn
import train_utils as train_utils
import inference

cuda
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

cuda


In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch 
import transformers

In [7]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Fixed settings shared by every run of the search below.
# Batch size used for both the training and validation DataLoaders.
batch_size = 16
# Number of epochs passed to train_utils.run_automodel for each fold.
epochs = 3
# Seed applied to Python, NumPy and PyTorch random number generators.
seed_val = 42
# lr, dropout_rate, weight_decay and maxlen are no longer fixed here; they are
# set per configuration inside the search loop. Old fixed values kept for reference.
#lr = 6e-6
#dropout_rate = 0.0
# Number of cross-validation folds.
num_folds = 5
#weight_decay=0
#maxlen=50

In [9]:
import random

# Seed every random number generator in play (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value, in that order.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
  seed_fn(seed_val)

In [10]:
# Read the training and test CSVs from the project folder.
train, test = (pd.read_csv(PATH + csv_name) for csv_name in ("train.csv", "test.csv"))

In [11]:
# Run the project's text preprocessing over the `text` column of both splits,
# overwriting the column in place.
train['text'] = train['text'].apply(pre.preprocess_all)
test['text'] = test['text'].apply(pre.preprocess_all)

# Plain arrays of the cleaned training tweets and their targets, used by the
# tokenisation and cross-validation cells.
sentences = train['text'].values
labels = train['target'].values

In [None]:
# Mean of the training targets (the share of 1s if the labels are 0/1).
print(labels.sum() / len(labels))

0.4296597924602653


In [30]:
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Plain (unshuffled, unstratified) K-fold splitter reused for every configuration.
# NOTE(review): the name `skf` and the labels passed to split() suggest that
# StratifiedKFold may have been intended. Confirm before switching, because it
# changes which rows land in each fold.
skf = KFold(n_splits=num_folds)

# Maps mean cross-validated accuracy -> the configuration that produced it.
# NOTE(review): two configurations with identical mean accuracy would collide.
configs = {}
for i in range(1):
  # Random search: draw one hyperparameter configuration per iteration.
  config = {'lr':5.8e-6+random.randint(0,20)*1e-7,
          'dropout_rate': 0.0,
          'weight_decay': 0.0,
          'maxlen': 50 + random.randint(0, 40)}

  # Per-fold [training losses, validation losses] and per-fold accuracy.
  overall_loss_vals = []
  accuracies = []

  # Best single-fold accuracy seen so far for THIS configuration. It has to be
  # initialised once per configuration, not once per fold: resetting it inside
  # the fold loop makes every fold "the best", so the saved checkpoint would
  # always be the last fold's model instead of the best one.
  best_acc = 0

  # Tokenise the training tweets, pad/truncate them to this configuration's
  # maxlen and build the matching attention masks.
  all_sequences = tkn.get_input_ids(sentences)
  all_sequences = tkn.tf_pad_sequences(all_sequences, maxlen=config['maxlen'])
  all_masks = np.array(tkn.get_masks(all_sequences))

  for train_index, test_index in skf.split(all_sequences, labels):
    # Slice ids, masks and labels into this fold's training and validation parts.
    X_train, X_test = all_sequences[train_index], all_sequences[test_index]
    mask_train, mask_test = all_masks[train_index], all_masks[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    train_inputs = torch.tensor(X_train)
    validation_inputs = torch.tensor(X_test)

    train_labels = torch.tensor(y_train)
    validation_labels = torch.tensor(y_test)

    train_masks = torch.tensor(mask_train)
    validation_masks = torch.tensor(mask_test)

    # Create the DataLoader for our training set (batches drawn in random order).
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create the DataLoader for our validation set (fixed, sequential order).
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    # Fresh model and optimizer for every fold so no weights carry over.
    auto_model = model_utils.get_automodel(rate=config['dropout_rate'])

    auto_optimizer = torch.optim.Adam(auto_model.parameters(),
                                      lr=config['lr'],
                                      weight_decay=config['weight_decay']) # args.learning_rate - default is 5e-5, our notebook had 2e-5
    criterion = torch.nn.CrossEntropyLoss()
    # `acc` is presumably the fold's validation accuracy; confirm in train_utils.run_automodel.
    loss_values, val_loss_values, acc = train_utils.run_automodel(auto_model,
                                                                  auto_optimizer,
                                                                  criterion,
                                                                  epochs,
                                                                  train_dataloader,
                                                                  validation_dataloader,
                                                                  scheduler=None,
                                                                  verbose=False)
    overall_loss_vals.append([loss_values, val_loss_values])
    accuracies.append(acc)

    # Keep only the best-scoring fold's model on disk for this configuration.
    # The whole model object is pickled because the inference cell loads it
    # back with a bare torch.load.
    if acc > best_acc:
      torch.save(auto_model,  PATH + f"saved_models/model_{config['lr']}")
      best_acc = acc

    # The optimizer holds references to the model's parameters, so both must be
    # dropped for the memory to be reclaimable before the next fold builds a
    # new model.
    del auto_model, auto_optimizer

  config_accuracy = np.mean(np.array(accuracies))
  print("Reporting average accuracy and std")
  print("Learning rate:", config['lr'])
  print("Avg accuracy:", config_accuracy)
  print("St dev:", np.std(np.array(accuracies)))

  configs[config_accuracy] = config
  print(config)


Padding/truncating all sentences to 55 values...

Padding token: "[PAD]", ID: 0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Reporting average accuracy and std
Learning rate: 7.5e-06
Avg accuracy: 0.8172743055555556
St dev: 0.013324016565520746
{'lr': 7.5e-06, 'dropout_rate': 0.0, 'weight_decay': 0.0, 'maxlen': 55}


In [None]:
# pd_dict = {'lr':[], 'dropout_rate':[]
#                   , 'accuracy': []
#                   , 'maxlen': []}

# for acc in configs:
#   pd_dict['accuracy'].append(acc)
#   pd_dict['dropout_rate'].append(configs[acc]['dropout_rate'])
#   pd_dict['maxlen'].append(configs[acc]['maxlen'])
#   pd_dict['lr'].append(configs[acc]['lr'])


In [None]:
# df = pd.DataFrame.from_dict(pd_dict)

In [None]:
# import sklearn
# from sklearn.preprocessing import MaxAbsScaler
# scaler = MaxAbsScaler()
# d4 = pd.DataFrame(scaler.fit_transform(df[['lr', 'maxlen']]),
#                    columns=['lr', 'maxlen'])
# d4['accuracy'] = df['accuracy']


In [None]:
# y = d4.accuracy
# Xs = d4[['lr', 'maxlen']]
# train_Xs, train_y = Xs[:16], y[:16]
# val_Xs, val_y = Xs[16:], y[16:]

In [None]:
# from sklearn.linear_model import LinearRegression
# reg = LinearRegression().fit(train_Xs, train_y)
# reg.coef_

0.10817507873410659

In [None]:
# import matplotlib.pyplot as plt
# % matplotlib inline

# import seaborn as sns

# fig, axs = plt.subplots(5, 2)
# axs[0, 0].plot(overall_loss_vals[0][0], 'b-o')
# axs[0, 0].plot(overall_loss_vals[0][1], 'r-o')
# axs[0, 0].set_title('Fold 1')
# axs[0, 1].plot(overall_loss_vals[1][0], 'b-o')
# axs[0, 1].plot(overall_loss_vals[1][1], 'r-o')
# axs[0, 1].set_title('Fold 2')
# axs[1, 0].plot(overall_loss_vals[2][0], 'b-o')
# axs[1, 0].plot(overall_loss_vals[2][1], 'r-o')
# axs[1, 0].set_title('Fold 3')
# axs[1, 1].plot(overall_loss_vals[3][0], 'b-o')
# axs[1, 1].plot(overall_loss_vals[3][1], 'r-o')
# axs[1, 1].set_title('Fold 4')
# axs[2, 0].plot(overall_loss_vals[4][0], 'b-o')
# axs[2, 0].plot(overall_loss_vals[4][1], 'r-o')
# axs[2, 0].set_title('Fold 5')
# axs[2, 1].plot(overall_loss_vals[4][0], 'b-o')
# axs[2, 1].plot(overall_loss_vals[4][1], 'r-o')
# axs[2, 1].set_title('Fold 6')
# axs[3, 0].plot(overall_loss_vals[4][0], 'b-o')
# axs[3, 0].plot(overall_loss_vals[4][1], 'r-o')
# axs[3, 0].set_title('Fold 7')
# axs[3, 1].plot(overall_loss_vals[4][0], 'b-o')
# axs[3, 1].plot(overall_loss_vals[4][1], 'r-o')
# axs[3, 1].set_title('Fold 8')
# axs[4, 0].plot(overall_loss_vals[4][0], 'b-o')
# axs[4, 0].plot(overall_loss_vals[4][1], 'r-o')
# axs[4, 0].set_title('Fold 9')
# axs[4, 1].plot(overall_loss_vals[4][0], 'b-o')
# axs[4, 1].plot(overall_loss_vals[4][1], 'r-o')
# axs[4, 1].set_title('Fold 10')

# # Use plot styling from seaborn.
# sns.set(style='darkgrid')
# plt.show()

In [23]:
# Tokenise the preprocessed test tweets, pad/truncate them to the current
# configuration's maxlen, and derive the attention masks from the padded ids.
test_sentences = test.text.values
test_sequences = tkn.tf_pad_sequences(
    tkn.get_input_ids(test_sentences),
    maxlen=config['maxlen'],
)
test_masks = np.array(tkn.get_masks(test_sequences))

# Tensor versions of the padded ids and their masks.
prediction_inputs = torch.tensor(test_sequences)
prediction_masks = torch.tensor(test_masks)

# One example per batch at inference time.
inference_batch_size = 1

# Each dataset item carries (tweet id, input ids, attention mask); the default
# DataLoader order keeps the rows in their original sequence.
prediction_data = TensorDataset(
    torch.tensor(test.id.values),
    prediction_inputs,
    prediction_masks,
)
prediction_dataloader = DataLoader(prediction_data, batch_size=inference_batch_size)



Padding/truncating all sentences to 56 values...

Padding token: "[PAD]", ID: 0


In [33]:
# Load the checkpoint that the cross-validation cell saved for the current
# configuration. The path is built from PATH and config['lr'] exactly as it is
# when saving, so it follows the project folder and stays in step with the
# config['maxlen'] used to pad the test set.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
# NOTE: torch.load unpickles a full model object, so only load files you trust.
model = torch.load(PATH + f"saved_models/model_{config['lr']}", map_location=device)

In [39]:
# Run the loaded model over the test DataLoader. Predictions are presumably
# written to 'submission.csv' relative to the current working directory;
# confirm in inference.run_inference.
inference.run_inference(model, prediction_dataloader, filename='submission.csv')
