In [14]:
!nvidia-smi

Sat Nov 28 01:07:02 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.104      Driver Version: 410.104      CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    43W / 300W |    417MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [5]:
#Check that tensorflow recognizes the GPU
import tensorflow as tf
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Cannot recognize GPU")

Default GPU Device: /device:GPU:0


In [3]:
def regex_filter(df):
    df["final_utterance"] = df["final_utterance"].str.lower()

    regex_patterns={
      # replace "uh.," "um.," "ah.," "mm.,", "oh.,!?" with blank but we leave uh-hum/mm-hmm in there
      r'(\buh(\,|\.|\s|\?))': ' ',
      r'(\bum(\,|\.|\s|\?))': ' ',
      r'(\bah\W)': ' ',
      r'(\bmm(\,|\.|\s|\!|\?))': ' ',
      r'(\boh(\,|\.|\s|\!|\?))': ' ',

      r'(\s*(?:\+?(\d{1,3}))?[-. (]*(\d{3})?[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*)': ' ', # remove 10 digit and 7 digit numbers  e.g., 456-123-0000 and 123-0000
      r'(\d{3,})': '', # remove any numbers that has length 3 or more 
      r'\b(\w+)((?:\W+?)(\s+)?\1\b)+': r'\1', # replaces duplicate words with a single instance of that word. "yup, yup thank thank you. bye-bye" -> "yup thank you. bye" 
      r'([a-zA-Z0-9._-]+(\s+)?@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)': '[email]',
      r'(\bok\b)': 'okay',  #replaces "ok" with "okay"
      r'(ma\'am)': 'madam', #replaces ma'am with madam 
      r'(\s{2,})': ' '#replaces 2 or more whitespaces with a single one
    }

    df['final_utterance'].replace(regex=regex_patterns, inplace=True)
    df['final_utterance'] = df['final_utterance'].str.strip() 

    return df

In [4]:
from google.cloud import bigquery
import os
import pandas as pd
import numpy as np
import sentencepiece as spm
%pwd

'/home/jupyter/xlnet_pretrain'

In [255]:
project_id = 'talkiq-data'
client = bigquery.Client(project=project_id)

query = """
SELECT call_id, time_end, final_utterance
FROM `talkiq-data.ai_research.dialpad_xlnet_lm_pretrain`
"""

df = client.query(query).to_dataframe() 
df = regex_filter(df)

df.head()

Unnamed: 0,call_id,time_end,final_utterance
0,4503643975647232,10.5,"thank you for calling dialpad support, this is..."
1,4503643975647232,14.558,"hey, good morning."
2,4503643975647232,16.23,"hi, may i have your name, madam?"
3,4503643975647232,18.368,until the.
4,4503643975647232,21.15,"hi, toby. how can i help?"


In [128]:
#np.savetxt('train_sentencepiece.txt', df.final_utterance.values, fmt='%s')

## Train SentencePiece model

In [5]:
!pwd

/home/jupyter/xlnet_pretrain


In [131]:
spm.SentencePieceTrainer.train('--input=/home/jupyter/xlnet_pretrain/train_sentencepiece.txt \
                                --model_prefix=spiece \
                                --vocab_size=16000 \
                                --model_type=unigram \
                                --control_symbols=<cls>,<sep>,<pad>,<mask>,<eod>\
                                --user_defined_symbols=<eop>,.,(,),",-,–,£,€ \
                                --character_coverage=0.99995 \
                                --shuffle_input_sentence \
                                --input_sentence_size=10000000')

## Pre-process data for pretraining an XLNet model

In [242]:
# df_temp = df.groupby(['call_id'])[['time_end']].agg(np.max).reset_index().rename(columns={'time_end':'max_timeend'})
# df_processed = df.join(df_temp.set_index('call_id'), on='call_id')
# df_processed.loc[(df_processed['time_end'] == df_processed['max_timeend']), ['final_utterance']] = df_processed.final_utterance + " \n"

call_ids=df.call_id
unequal = call_ids.index[call_ids.shift(-1) != call_ids ]
df['final_utterance'].loc[unequal] += ' \n'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
#np.savetxt('lm_pretrain.txt', df.final_utterance.values, fmt='%s')

In [None]:
#!python xlnet/data_utils.py --use_tpu=False --bsz_per_host=4 --num_core_per_host=1 --uncased=True --seq_len=512 --reuse_len=256 --input_glob="lm_pretrain.txt" --save_dir='./dialpad_xlnet_4/' --sp_path='./dialpad_xlnet_4/spiece.model' --mask_alpha=6 --mask_beta=1 --num_predict=85 --bi_data=True

In [18]:
#bi_data = True
!python xlnet/data_utils.py --use_tpu=False --vocab_size=16000 --bsz_per_host=4 --num_core_per_host=1 --uncased=True --seq_len=512 --reuse_len=256 --input_glob="lm_pretrain.txt" --save_dir='./dialpad_xlnet_16000/' --sp_path='./dialpad_xlnet_16000/spiece.model' --mask_alpha=6 --mask_beta=1 --num_predict=85 --bi_data=True


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
INFO:tensorflow:Use glob: lm_pretrain.txt
INFO:tensorflow:Find 1 files: ['./lm_pretrain.txt']
INFO:tensorflow:Task 0 process 1 files: ['./lm_pretrain.txt']
INFO:tensorflow:Processing ./lm_pretrain.txt
INFO:tensorflow:Loading line 0
INFO:tensorflow:Loading line 100000
INFO:tensorflow:Loading line 200000
INFO:tensorflow:Loading line 300000
INFO:tensorflow:Loading line 400000
INFO:tensorflow:Loading line 500000
INFO:tensorflow:Loading line 600000
INFO:tensorflow:Loading line 700000
INFO:tensorflow:Loading line 800000
INFO:tensorflow:Loading line 900000
INFO:tensorflow:Loading line 1000000
INFO:tensorflow:Loading line 1100000
INFO:tensorflow:Finish with line 1186477
INFO:tensorflow:[Tas

## XLNet Model Pretrain

In [None]:
!python xlnet/train_gpu.py --corpus_info_path="./dialpad_xlnet_16000/corpus_info.json" --model_dir="./dialpad_xlnet_16000/model/" --record_info_dir="./dialpad_xlnet_16000/tfrecords/" --use_tpu=False --train_batch_size=4 --num_core_per_host=1 --vocab_size=16000 --seq_len=512 --reuse_len=256 --mem_len=384 --perm_size=256 --untie_r=True --mask_alpha=6 --mask_beta=1 --num_predict=85 --train_steps=200000 --iterations=1000 --save_steps=10000 --uncased=True --bi_data=True --d_head=64 --d_inner=3072 --d_model=768 --ff_activation=gelu --n_head=12 --n_layer=12 --learning_rate=3e-4


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
INFO:tensorflow:n_token 16000
INFO:tensorflow:Use the following tfrecord dirs: ['./dialpad_xlnet_bi4/tfrecords/']
INFO:tensorflow:[0] Record glob: ./dialpad_xlnet_bi4/tfrecords/record_info-train-*.bsz-4.vocabsz-16000.seqlen-512.reuse-256.uncased.bi.alpha-6.beta-1.fnp-85.json
INFO:tensorflow:[0] Num of record info path: 1
INFO:tensorflow:[Dir 0] Number of chosen batches: 44133
INFO:tensorflow:[Dir 0] Number of chosen files: 1
INFO:tensorflow:['./dialpad_xlnet_bi4/tfrecords/train-0-0.bsz-4.vocabsz-16000.seqlen-512.reuse-256.uncased.bi.alpha-6.beta-1.fnp-85.tfrecords']
INFO:tensorflow:Total number of batches: 44133
INFO:tensorflow:Total number of files: 1
INFO:tensorflow:['./dialpad_xl