# Machine Learning Project

Article: *Multi-Task Deep Neural Networks for Natural Language Understanding*

<a href=https://arxiv.org/abs/1901.11504> https://arxiv.org/abs/1901.11504</a>

In [1]:
#!cd ..
#!python scripts/download_glue_data.py --data_dir data --tasks all

#!pip install python-csv
#!pip install psutil
#!pip install torch
#!pip install transformers #version 3.4.0 
#!pip install path

#!cd notebooks

In [2]:
import os
from path import Path
# Resolve the project root only once per kernel session. This cell ends with
# os.chdir(PROJ_DIR), so recomputing "parent of the current directory" on a
# re-run would climb one directory higher every time the cell is executed.
if "PROJ_DIR" not in globals():
    PROJ_DIR = Path().getcwd().parent
DATA_DIR = PROJ_DIR / "data"
MODELS_DIR = PROJ_DIR / "models"
# Switch the process-wide working directory to the project root
# (presumably so that the `scripts` package can be imported afterwards).
os.chdir(PROJ_DIR)
print(DATA_DIR)
print(MODELS_DIR)


E:\Documenti\Magistrale\PycharmProject\Progetto_ML\data
E:\Documenti\Magistrale\PycharmProject\Progetto_ML\models


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import BertTokenizer
import torch

from scripts.glue import *

## System Information

In [4]:
import psutil
# Single snapshot of system memory usage; the "Memory Info" lines printed
# further down in this cell all read from this one object.
svmem = psutil.virtual_memory()

def get_size(bytes, suffix="B"):
    """
    Scale a byte count to a human-readable string, stepping by 1024.
    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'

    Parameters
    ----------
    bytes : int or float
        Quantity to format. The name shadows the builtin ``bytes`` inside this
        function; it is kept so existing keyword callers keep working.
    suffix : str
        Text appended after the unit prefix (default ``"B"``).

    Returns
    -------
    str
        The value with two decimals, followed by the unit prefix and
        ``suffix``. A string is always returned: values too large for the
        biggest prefix are expressed in that prefix instead of falling off
        the loop and yielding ``None``.
    """
    factor = 1024
    units = ["", "K", "M", "G", "T", "P", "E", "Z", "Y"]
    for unit in units[:-1]:
        # Compare the magnitude so negative quantities are scaled as well.
        if abs(bytes) < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    # Larger than every intermediate prefix: report in the largest one.
    return f"{bytes:.2f}{units[-1]}{suffix}"

# --- Memory: values come from the `svmem` snapshot, formatted by get_size ---
print("="*38, "Memory Info", "="*39)
print(f"Total: {get_size(svmem.total)}")
print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}")
print(f"Percentage: {svmem.percent}%")

# --- CPU: physical vs. logical core counts as reported by psutil ---
print("="*40, "CPU Info", "="*40)
print("Physical cores:", psutil.cpu_count(logical=False))
print("Total cores:", psutil.cpu_count(logical=True))

# --- GPU: device count first, then details only when CUDA is usable ---
print("="*40, "GPU Info", "="*40)
print(f"Number of CUDA devices: {torch.cuda.device_count()}")
if not torch.cuda.is_available():
    print("Status: Not available")
else:
    print("Status: Available")
    cuda_device = torch.cuda.current_device()
    print(f"CUDA device ID: {cuda_device}")
    print(f"CUDA device name: {torch.cuda.get_device_name(cuda_device)}")
    

Total: 3.89GB
Available: 220.37MB
Used: 3.67GB
Percentage: 94.5%
Physical cores: 2
Total cores: 4
Number of CUDA devices: 0
Status: Not available


## Import Data

In [5]:
# Build one task object per dataset on the shared data directory, in the same
# order as the individual names below (the classes presumably come from the
# `scripts.glue` star-import), then bind each object to its own name.
tasks = [
    task_cls(DATA_DIR)
    for task_cls in (CoLA, SST_2, MNLI, RTE, WNLI, QQP, MRPC, SNLI, STS_B, QNLI)
]
cola, sst_2, mnli, rte, wnli, qqp, mrpc, snli, sts_b, qnli = tasks


b'Skipping line 24810: expected 12 fields, saw 13\nSkipping line 33961: expected 12 fields, saw 13\n'
b'Skipping line 75911: expected 12 fields, saw 13\nSkipping line 100114: expected 12 fields, saw 13\n'
b'Skipping line 150638: expected 12 fields, saw 13\nSkipping line 158834: expected 12 fields, saw 13\nSkipping line 173104: expected 12 fields, saw 13\nSkipping line 178252: expected 12 fields, saw 13\n'
b'Skipping line 221951: expected 12 fields, saw 13\n'
b'Skipping line 286845: expected 12 fields, saw 13\nSkipping line 314110: expected 12 fields, saw 13\n'
b'Skipping line 370: expected 5 fields, saw 6\n'
b'Skipping line 93: expected 5 fields, saw 6\nSkipping line 590: expected 5 fields, saw 6\nSkipping line 778: expected 5 fields, saw 6\nSkipping line 790: expected 5 fields, saw 6\nSkipping line 882: expected 5 fields, saw 6\nSkipping line 1296: expected 5 fields, saw 6\nSkipping line 1324: expected 5 fields, saw 6\nSkipping line 1640: expected 5 fields, saw 6\nSkipping line 1759: 

## Tokenization


The `BertTokenizer` provided by the Hugging Face Transformers library encodes each sequence as token IDs. It adds the special tokens [CLS] and [SEP], pads with [PAD], and maps out-of-vocabulary tokens to [UNK].


In [6]:
# Name of the pretrained checkpoint handed to BertTokenizer.from_pretrained.
PRE_TRAINED_MODEL_NAME='bert-large-cased'
# Maximum sequence length in tokens: used as `max_length` for encode_plus and
# forwarded to each task's tokenization call.
MAX_LEN=512
# Forwarded to each task's tokenization call (presumably the batch size of the
# resulting data loaders; confirm in scripts/glue).
BATCH_SIZE=32
# Not referenced in the visible part of this notebook; presumably the number
# of training epochs. TODO confirm where it is consumed.
EPOCHS=5

In [7]:
# Load the tokenizer that matches the pretrained checkpoint named above; the
# rest of the notebook uses this single `tokenizer` object.
tokenizer=BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME) 

The special token encodings are:

In [8]:
# Show each special token next to its vocabulary id, one (token, id) pair per line.
print(
    (tokenizer.sep_token, tokenizer.sep_token_id),
    (tokenizer.cls_token, tokenizer.cls_token_id),
    (tokenizer.pad_token, tokenizer.pad_token_id),
    (tokenizer.unk_token, tokenizer.unk_token_id),
    sep="\n",
)

('[SEP]', 102)
('[CLS]', 101)
('[PAD]', 0)
('[UNK]', 100)


The ``encode_plus`` method produces both the token IDs of a sentence and its attention mask in a single call.


In [9]:
# Encode one sample sentence to illustrate what the tokenizer produces.
sample_txt="When was I last outside? I am stuck at home for 2 weeks."
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=MAX_LEN,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  # Explicit padding/truncation replace the deprecated `pad_to_max_length=True`
  # and the implicit 'longest_first' fallback, so the same encoding is produced
  # without the truncation warning.
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)
# `return_tensors='pt'` yields a batch dimension; take the single row.
tokens_ids=encoding['input_ids'][0]
tokens=tokenizer.convert_ids_to_tokens(tokens_ids)
attention_mask=encoding['attention_mask']
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {tokens_ids.tolist()}')
print(f'Attention mask: {attention_mask.tolist()}')

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 Sentence: When was I last outside? I am stuck at home for 2 weeks.
   Tokens: ['[CLS]', 'When', 'was', 'I', 'last', 'outside', '?', 'I', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [10]:
# Worker count forwarded to each task's tokenization step (presumably the
# number of data-loading worker processes; confirm in scripts/glue).
NUM_WORKERS=0
# Run the tokenization step of every task with the shared tokenizer and the
# settings defined above. NOTE(review): this is presumably what populates the
# `*_tokenized_data` attributes read by later cells; verify in scripts/glue.
for task in tasks:
    task.tokenization(tokenizer, MAX_LEN, BATCH_SIZE, NUM_WORKERS)

In [11]:
def print_tokenization(data, has_sequence2=False, max_examples=None):
    """Print the tokenization details of the examples stored in ``data``.

    For every example the index, the original sequence(s), the tokens, the
    token ids, the attention mask, the positional encoding and the token type
    ids are printed. Tokens are obtained from the ids through the notebook's
    global ``tokenizer``.

    Parameters
    ----------
    data : mapping
        Must provide ``"sequence1"`` (indexable, supports ``len``) and the
        entries ``"input_ids"``, ``"attention_mask"``,
        ``"positional_encoding"`` and ``"token_type_ids"``, each exposing
        ``.tolist()``. ``"sequence2"`` is read only when ``has_sequence2``
        is true.
    has_sequence2 : bool
        When true, print both sequences of a pair as "Sequence 1" and
        "Sequence 2"; otherwise print the single "Sequence".
    max_examples : int or None
        Optional cap on how many examples are printed. ``None`` (default)
        prints every example, as before.

    Returns
    -------
    None
    """
    first_sequences = data["sequence1"]
    count = len(first_sequences)
    if max_examples is not None:
        count = min(count, max(max_examples, 0))
    if count == 0:
        # Nothing to print; the other columns are not touched in this case.
        return

    second_sequences = data["sequence2"] if has_sequence2 else None
    # Convert each column to plain lists once, instead of converting the
    # whole column again for every printed example.
    all_token_ids = data['input_ids'].tolist()
    all_attention_masks = data['attention_mask'].tolist()
    all_positions = data['positional_encoding'].tolist()
    all_token_types = data["token_type_ids"].tolist()

    for i in range(count):
        tokens_ids = all_token_ids[i]
        tokens = tokenizer.convert_ids_to_tokens(tokens_ids)
        print(f"     Index: {i}")
        if has_sequence2:
            print(f'Sequence 1: {first_sequences[i]}')
            print(f'Sequence 2: {second_sequences[i]}')
        else:
            print(f'  Sequence: {first_sequences[i]}')
        print(f'    Tokens: {tokens}')
        print(f' Token IDs: {tokens_ids}')
        print(f'     Attention mask: {all_attention_masks[i]}')
        print(f'Positional encoding: {all_positions[i]}')
        print(f'     Token type IDs: {all_token_types[i]}\n\n')

In [12]:
# MNLI is passed with has_sequence2=True, so both sequences of each pair are shown.
# NOTE(review): this prints every example returned by viewData() at full padded
# length, which makes for a very large cell output.
print_tokenization(mnli.dev_tokenized_data.viewData(), has_sequence2=True)

    

     Index: 0
Sequence 1: The new rights are nice enough
Sequence 2: Everyone really likes the newest benefits 
    Tokens: ['[CLS]', 'The', 'new', 'rights', 'are', 'nice', 'enough', '[SEP]', 'Everyone', 'really', 'likes', 'the', 'newest', 'benefits', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

In [14]:
# CoLA is passed without has_sequence2, so only the single sequence is shown.
# NOTE(review): this prints every example returned by viewData() at full padded
# length, which makes for a very large cell output.
print_tokenization(cola.dev_tokenized_data.viewData())





     Index: 0
  Sequence: The sailors rode the breeze clear of the rocks.
    Tokens: ['[CLS]', 'The', 'sailors', 'rode', 'the', 'breeze', 'clear', 'of', 'the', 'rocks', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]