# Deep Learning models
## XLM-RoBERTa model (BERT embeddings)

This notebook was run on a Google Cloud Deep Learning VM with 2 vCPUs, 52 GB of memory, and an Nvidia Tesla P100 GPU. As with the other notebooks, we recommend installing the requirements first (`pip install -r requirements.txt`) and running it on a suitable GPU to ensure fast training.
With the Google Cloud VM it took 7 hours to fit.
We used tensorflow, transformers, and pretrained models from https://huggingface.co/.

This model gave us an accuracy of 0.877 and an F1-score of 0.878.

### Relevant imports and checking environment:

In [2]:
import tensorflow as tf
# Print the installed TensorFlow version so the recorded run can be tied to an environment.
print(tf.__version__)

2.3.1


In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import GlobalAveragePooling1D
from keras.layers import Convolution1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.layers import LSTM
from keras.preprocessing import sequence

Using TensorFlow backend.


In [6]:
import numpy as np
# Let's import a fast tokenizer that can work on batched inputs
# (the 'Fast' tokenizers in HuggingFace)
from transformers import RobertaTokenizerFast, logging as transformers_logging
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Let's load a pretrained TF2 Roberta model and a simple optimizer
from transformers import TFXLMRobertaForSequenceClassification


In [7]:
# Check the current GPU infos
!nvidia-smi

Mon Dec 14 22:06:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    35W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [24]:
from tensorflow.python.client import device_lib 
# List every compute device TensorFlow can see; the recorded output includes the Tesla P100 GPU.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5210981556642462867
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 16454849342812487283
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 5025661241410285274
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15703311680
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6551095889574330453
physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0"
]


### Preprocessing data:

In [8]:
def _read_tweets(path):
    """Load a cleaned tweet file and rebuild each tweet as a space-separated string.

    Every line is stripped, blank lines are skipped, and the comma-separated
    words of each remaining line are re-joined with single spaces.

    Args:
        path: location of a UTF-8 text file with one tweet per line.

    Returns:
        A list of strings, one per non-empty line. Each string ends with a
        trailing space, exactly as the previous word-by-word concatenation
        produced, so the tokenizer input is unchanged.
    """
    # The context manager closes the file handle as soon as the lines are read.
    with open(path, "r", encoding='utf-8') as handle:
        stripped_lines = [line.strip() for line in handle]
    return [' '.join(line.split(',')) + ' ' for line in stripped_lines if line != '']


# Positive and negative training tweets, kept in two separate lists so that
# labels can later be built from their lengths.
x_pos = _read_tweets("Project/cleaned_data/cleaned_train_pos_full.txt")
x_neg = _read_tweets("Project/cleaned_data/cleaned_train_neg_full.txt")

In [10]:
# Only show warnings (and above) from transformers while the tokenizer is loaded.
transformers_logging.set_verbosity_warning()
# Fast tokenizer of the `roberta-base` checkpoint; the model cell of this notebook
# loads its weights from the same checkpoint name.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [11]:
# Switch transformers logging to INFO; the model-loading cell below prints its configuration.
transformers_logging.set_verbosity_info()

# NOTE(review): max_length is not referenced by any cell visible in this notebook
# (the tokenizer call passes max_length=None) — confirm whether it is still needed.
max_length = 512
# Mini-batch size handed to model.fit.
batch_size = 64

In [12]:
def convert_example_to_feature(tweet):
    """Tokenize one tweet or a batch of tweets with the notebook-level `tokenizer`.

    Args:
        tweet: a string, or a list of strings, to encode.

    Returns:
        The tokenizer output, containing `input_ids` and `attention_mask`
        (token type ids are explicitly not requested).
    """
    return tokenizer(
        tweet,
        add_special_tokens=True,
        # `padding='longest'` replaces the deprecated `pad_to_max_length=True`:
        # with no explicit max_length both pad every sequence to the longest
        # sequence of the batch.
        padding='longest',
        # Cut inputs that exceed the tokenizer's model maximum length so they
        # cannot overflow the model's position embeddings; shorter inputs
        # (such as tweets) are unaffected.
        truncation=True,
        max_length=None,
        return_attention_mask=True,
        return_token_type_ids=False,
    )

In [13]:
# Tokenize the whole training corpus in one batched call: positive tweets first,
# then negative tweets (the label vector is built in the same order).
bert_x = convert_example_to_feature((x_pos + x_neg))



In [14]:
# Labels aligned with `x_pos + x_neg`: 1.0 for each positive tweet, 0.0 for each negative tweet.
y = np.concatenate([np.ones(len(x_pos)), np.zeros(len(x_neg))])

### Model:

In [17]:
# NOTE(review): the XLM-RoBERTa classification class is instantiated from the
# `roberta-base` checkpoint rather than an `xlm-roberta-*` one (the load log
# reports vocab_size 50265) — confirm this pairing is intended.
model= TFXLMRobertaForSequenceClassification.from_pretrained("roberta-base")
# Labels are class indices (0/1) and the loss is told to expect logits (from_logits=True).
# NOTE(review): reduction=NONE hands Keras a per-sample loss vector instead of a
# batch mean — confirm this is the intended setting for model.fit.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE, from_logits=True)
# Adam with a small fine-tuning learning rate.
opt = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(optimizer=opt,
              loss=loss_fn,
              metrics=['accuracy'])

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jupyter/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config XLMRobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/resolve/main/tf_model.h5 from cache at /home/jupyter/.cache/huggingface/transformers/22fef2e3c5012c1a8f8d7f024e302

In [18]:
# Convert each tokenizer field to its own NumPy array. Stacking both fields into
# one array and unpacking it would leave `input_ids` and `attention_mask` as views
# of a single shared buffer, so the mask's memory could never be released on its
# own (e.g. after it is down-cast to a smaller dtype).
input_ids = np.asarray(bert_x['input_ids'])
attention_mask = np.asarray(bert_x['attention_mask'])

In [20]:
# Rebind the name so the (large) tokenizer output can be garbage-collected.
bert_x = 0 # to optimize memory

In [21]:
# Attention-mask entries are expected to be 0/1, so the smallest integer dtype suffices.
attention_mask = attention_mask.astype(np.int8) # to optimize memory

In [22]:
# Fixed seed so that the shuffled train/validation split is identical on every run.
SPLIT_SEED = 42
# Number of shuffled rows used for training; every remaining row forms the
# validation split. If the corpus has no more rows than this, the validation
# split is empty.
TRAIN_SIZE = 2000000

indices = np.random.RandomState(SPLIT_SEED).permutation(input_ids.shape[0])
training_idx, test_idx = indices[:TRAIN_SIZE], indices[TRAIN_SIZE:]

In [23]:
# Training split: token ids and attention mask rows, plus float32 labels.
train_input_ids, train_att_mask = (
    tf.convert_to_tensor(matrix[training_idx, :])
    for matrix in (input_ids, attention_mask)
)
y_train = tf.convert_to_tensor(y[training_idx], dtype=tf.float32)

# Held-out split (used as validation data during fitting), built the same way.
test_input_ids, test_att_mask = (
    tf.convert_to_tensor(matrix[test_idx, :])
    for matrix in (input_ids, attention_mask)
)
y_test = tf.convert_to_tensor(y[test_idx], dtype=tf.float32)

In [25]:
# Fine-tune for a single epoch on the training split, evaluating on the held-out
# split at the end of the epoch; the returned History object is kept in `roberta_history`.
roberta_history = model.fit([train_input_ids,train_att_mask], 
      y_train,
      validation_data=([test_input_ids,test_att_mask], y_test),
      epochs=1, batch_size=batch_size, verbose=1 )



In [27]:
# Persist the fine-tuned weights next to the notebook.
# NOTE(review): the file name says "1M" while the split above trains on the first
# 2,000,000 shuffled rows — confirm the name is intentional.
model.save_weights("./xlm_roberta_weights_1M.h5")

In [29]:
# Read the cleaned test tweets; the context manager closes the file handle once
# the lines have been read.
with open("Project/cleaned_data/cleaned_test_data.txt", "r", encoding='utf-8') as test_file:
    tesst = [line.strip() for line in test_file]

# Re-join the comma-separated words of each tweet with spaces. Each string keeps
# a trailing space, exactly as the previous word-by-word concatenation produced.
# NOTE(review): blank lines are dropped, so predictions are positional over the
# non-empty tweets only — confirm submission ids still line up if the file can
# contain blank lines.
test = [' '.join(line.split(',')) + ' ' for line in tesst if line != '']

In [30]:
# Tokenize the test tweets with the same function used for the training tweets.
bert_test = convert_example_to_feature(test)

In [31]:
# Test-set tensors. Note that this rebinds `input_ids` and `attention_mask`,
# which previously held the training arrays.
input_ids = tf.convert_to_tensor(bert_test.get('input_ids'))
attention_mask =tf.convert_to_tensor(bert_test.get('attention_mask'))

In [32]:
# Run inference on the test tweets; element 0 of the result holds one row of
# two scores per tweet (see the sample printed in the next cell).
y_pred = model.predict([input_ids,attention_mask])

In [33]:
# Inspect the two raw scores predicted for the first test tweet.
y_pred[0][0]

array([ 3.1326149, -3.456763 ], dtype=float32)

In [34]:
def softmax(x):
    """Return the softmax of `x`, normalised along its first axis.

    The largest score is subtracted before exponentiating so that large
    scores cannot overflow to `inf` (which would turn the result into `nan`);
    the shift cancels out in the ratio, so the result is otherwise unchanged.

    Args:
        x: array-like of scores, e.g. the two class scores of one tweet.

    Returns:
        numpy.ndarray with the shape of `x` whose entries along axis 0 sum to 1.
        An empty input yields an empty array.
    """
    scores = np.asarray(x)
    if scores.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(scores)
    exps = np.exp(scores - np.max(scores))
    return exps / np.sum(exps, axis=0)

In [35]:
# Turn each row of scores in y_pred[0] into class probabilities.
predictions = [softmax(x) for x in y_pred[0]]

In [36]:
# Map each probability pair to a submission label: -1 when class 0 is strictly
# more likely than class 1, otherwise 1 (ties therefore go to 1).
output = [-1 if probs[0] > probs[1] else 1 for probs in predictions]

In [38]:
from helpers import create_csv_submission

In [40]:
# Write the -1/1 predictions to a submission CSV.
# Score recorded for this submission: 0.877 accuracy, 0.878 F1.
create_csv_submission(output, './xlm_roberta_FULL.csv') # 0.877 ACC 0.878 F1