In [6]:
# Show the GPU model, driver/CUDA version and current GPU memory usage of this machine.
!nvidia-smi

Wed Nov 25 03:18:14 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    39W / 300W |    309MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Sanity check: make sure TensorFlow can see a GPU before any training is started.
import tensorflow as tf

# Ask TensorFlow once for the default GPU device; an empty string means none was found.
gpu_name = tf.test.gpu_device_name()
status = 'Default GPU Device: {}'.format(gpu_name) if gpu_name else "Cannot recognize GPU"
print(status)

Default GPU Device: /device:GPU:0


In [9]:
from google.cloud import bigquery
import os
import pandas as pd
import numpy as np
import sentencepiece as spm
%pwd

'/home/jupyter'

In [10]:
def regex_filter(df):
    """Normalise the ``final_utterance`` column of a transcript DataFrame.

    The text is lower-cased, then a fixed sequence of regex substitutions is
    applied one after another (each on the result of the previous one), and
    finally leading/trailing whitespace is stripped.

    The substitutions are applied with explicit assignment back to the column
    instead of ``df[col].replace(..., inplace=True)``: that chained in-place
    form does not update ``df`` when pandas Copy-on-Write is active.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``final_utterance``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``final_utterance``
        overwritten by the cleaned text.

    Raises
    ------
    KeyError
        If ``df`` has no ``final_utterance`` column.
    """
    import re  # local import: keeps this cell runnable without touching the import cell

    # Ordered (pattern, replacement) pairs. Order matters: phone numbers must be
    # removed before the generic digit rule, and whitespace is collapsed last so
    # that gaps left by earlier removals are cleaned up too.
    substitutions = [
        # replace "uh.," "um.," "ah.," "mm.,", "oh.,!?" with blank but leave uh-hum/mm-hmm in there
        (r'(\buh(\,|\.|\s|\?))', ' '),
        (r'(\bum(\,|\.|\s|\?))', ' '),
        (r'(\bah\W)', ' '),
        (r'(\bmm(\,|\.|\s|\!|\?))', ' '),
        (r'(\boh(\,|\.|\s|\!|\?))', ' '),
        # remove 10 digit and 7 digit numbers e.g., 456-123-0000 and 123-0000
        (r'(\s*(?:\+?(\d{1,3}))?[-. (]*(\d{3})?[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*)', ' '),
        # remove any number that has length 3 or more
        (r'(\d{3,})', ''),
        # replace duplicate words with a single instance of that word:
        # "yup, yup thank thank you. bye-bye" -> "yup thank you. bye"
        (r'\b(\w+)((?:\W+?)(\s+)?\1\b)+', r'\1'),
        # mask e-mail addresses
        (r'([a-zA-Z0-9._-]+(\s+)?@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '[email]'),
        # replace "ok" with "okay"
        (r'(\bok\b)', 'okay'),
        # replace ma'am with madam
        (r'(ma\'am)', 'madam'),
        # replace 2 or more whitespaces with a single one
        (r'(\s{2,})', ' '),
    ]

    cleaned = df["final_utterance"].str.lower()
    for pattern, replacement in substitutions:
        # A compiled pattern guarantees Python `re` semantics (backreferences,
        # \b, \w) regardless of the string storage backend of the column.
        cleaned = cleaned.str.replace(re.compile(pattern), replacement, regex=True)

    df["final_utterance"] = cleaned.str.strip()
    return df


In [11]:
project_id = 'talkiq-data'
client = bigquery.Client(project=project_id)

query = """
SELECT *
FROM `talkiq-data.ai_research.dialpad_support_call_transcripts`
"""

# Download the whole transcript table into a DataFrame, then normalise the utterances.
df = client.query(query).to_dataframe()
df = regex_filter(df)

# Remove all punctuation.
# regex=True must be explicit: since pandas 2.0 Series.str.replace defaults to
# regex=False, which would treat the pattern as a literal string and strip nothing.
df['text'] = df['final_utterance'].str.replace(r'[^\w\s]+', ' ', regex=True)
# Number of whitespace-separated tokens in each punctuation-free transcript.
df['text_length'] = df['text'].str.split().str.len()

df.head()

Unnamed: 0,group_id,call_id,duration,final_utterance,PARTITIONTIME,text,text_length
0,CallCenter-6652811376590848,4777995676483584,293.887,hi thanks for calling dialpad this try my spea...,2020-02-21 00:00:00+00:00,hi thanks for calling dialpad this try my spea...,389
1,CallCenter-6652811376590848,4798450927992832,77.179,"thank, waiting.,yeah.,okay, a fifty dollars fo...",2020-02-21 00:00:00+00:00,thank waiting yeah okay a fifty dollars for ...,160
2,CallCenter-5296936154038272,4897400062279680,286.53,"hi, thanks for calling delta disagree. let's s...",2020-02-21 00:00:00+00:00,hi thanks for calling delta disagree let s s...,657
3,CallCenter-6652811376590848,4516407094607872,47.94,"yes.,what?,yeah., sure.",2020-02-24 00:00:00+00:00,yes what yeah sure,4
4,CallCenter-6652811376590848,4561888533348352,279.99,"thanks for calling dialpad, how can i help you...",2020-02-24 00:00:00+00:00,thanks for calling dialpad how can i help you...,580


In [None]:
# Only run this if you are trying to create another dataset for sentencepiece model training
#np.savetxt('dialpad_cc_calls.txt', df.final_utterance.values, fmt='%s')

## Train SentencePiece (tokenizer) model

In [5]:
# %mkdir test_folder
# Switch into dialpad_xlnet so that relative paths used by the next cell
# (e.g. model_prefix=spiece) resolve inside that directory.
%cd dialpad_xlnet

/home/jupyter/dialpad_xlnet


In [6]:
# Train a unigram SentencePiece tokenizer with a 16000-piece vocabulary on the
# exported call transcripts. The flags are passed as one command-line style string;
# the trailing backslashes only continue the Python string literal across lines.
# model_prefix is relative, so the output files are written to the current
# working directory.
# NOTE(review): the tfrecord cell below reads './dialpad_xlnet_4/spiece.model',
# while this cell runs after `%cd dialpad_xlnet` — confirm the model is
# copied/moved to the directory that cell expects.
# NOTE(review): the input path is absolute (/home/jupyter/...); adjust when
# running on another machine.
spm.SentencePieceTrainer.train('--input=/home/jupyter/dialpad_cc_calls.txt \
                                --model_prefix=spiece \
                                --vocab_size=16000 \
                                --model_type=unigram \
                                --control_symbols=<cls>,<sep>,<pad>,<mask>,<eod>\
                                --user_defined_symbols=<eop>,.,(,),",-,–,£,€ \
                                --shuffle_input_sentence \
                                --input_sentence_size=10000000')

In [6]:
# Go back to the parent directory so the relative paths in the following cells
# (xlnet/..., dialpad_cc_calls.txt, ./dialpad_xlnet_4/) resolve from there.
%cd ..

/home/jupyter


## Create tfrecords using txt file and trained SentencePiece model

In [5]:
# Convert dialpad_cc_calls.txt into XLNet pretraining tfrecords using the trained
# SentencePiece model (--sp_path); output goes under --save_dir.
# bsz_per_host, seq_len, reuse_len, uncased, bi_data, mask_alpha, mask_beta and
# num_predict are encoded in the generated tfrecord file name (see the log below),
# so the pretraining command must be given the same values to find these records.
!python xlnet/data_utils.py --use_tpu=False --bsz_per_host=4 --num_core_per_host=1 --uncased=True --seq_len=512 --reuse_len=256 --input_glob="dialpad_cc_calls.txt" --save_dir='./dialpad_xlnet_4/' --sp_path='./dialpad_xlnet_4/spiece.model' --mask_alpha=6 --mask_beta=1 --num_predict=85 --bi_data=False


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
INFO:tensorflow:Use glob: dialpad_cc_calls.txt
INFO:tensorflow:Find 1 files: ['./dialpad_cc_calls.txt']
INFO:tensorflow:Task 0 process 1 files: ['./dialpad_cc_calls.txt']
INFO:tensorflow:Processing ./dialpad_cc_calls.txt
INFO:tensorflow:Loading line 0
INFO:tensorflow:Finish with line 22852
INFO:tensorflow:[Task 0] Total number line: 22852
INFO:tensorflow:Using perm indices [0] for pass 0
INFO:tensorflow:Raw data shape (4, 5683065).
INFO:tensorflow:Start writing ./dialpad_xlnet_4/tfrecords/train-0-0.bsz-4.seqlen-512.reuse-256.uncased.uni.alpha-6.beta-1.fnp-85.tfrecords.
INFO:tensorflow:Processing batch 0
INFO:tensorflow:Processing batch 500
INFO:tensorflow:Processing batch 1000
INFO:

## Pretrain an XLNet!

In [8]:
# Pretrain XLNet on one GPU from the tfrecords in ./dialpad_xlnet_4/tfrecords/,
# writing checkpoints to ./dialpad_xlnet_4/model/ (10000 steps, saved every 5000).
# train_batch_size, seq_len, reuse_len, uncased, bi_data, mask_alpha, mask_beta and
# num_predict must equal the values used when the tfrecords were created: the
# record-info glob printed in the log below is built from them.
# n_token=16000 matches the SentencePiece vocab_size used for the tokenizer.
# Model size flags: n_layer=12, d_model=768, n_head=12, d_head=64, d_inner=3072.
!python xlnet/train_gpu.py --corpus_info_path="./dialpad_xlnet_4/corpus_info.json" --model_dir="./dialpad_xlnet_4/model/" --record_info_dir="./dialpad_xlnet_4/tfrecords/" --use_tpu=False --train_batch_size=4 --num_core_per_host=1 --seq_len=512 --reuse_len=256 --mem_len=384 --perm_size=256 --untie_r=True --mask_alpha=6 --mask_beta=1 --num_predict=85 --train_steps=10000 --iterations=2 --save_steps=5000 --uncased=True --bi_data=False --d_head=64 --d_inner=3072 --d_model=768 --ff_activation=gelu --n_head=12 --n_layer=12 --n_token=16000



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
INFO:tensorflow:n_token 16000
INFO:tensorflow:Use the following tfrecord dirs: ['./dialpad_xlnet_4/tfrecords/']
INFO:tensorflow:[0] Record glob: ./dialpad_xlnet_4/tfrecords/record_info-train-*.bsz-4.seqlen-512.reuse-256.uncased.uni.alpha-6.beta-1.fnp-85.json
INFO:tensorflow:[0] Num of record info path: 1
INFO:tensorflow:[Dir 0] Number of chosen batches: 22198
INFO:tensorflow:[Dir 0] Number of chosen files: 1
INFO:tensorflow:['./dialpad_xlnet_4/tfrecords/train-0-0.bsz-4.seqlen-512.reuse-256.uncased.uni.alpha-6.beta-1.fnp-85.tfrecords']
INFO:tensorflow:Total number of batches: 22198
INFO:tensorflow:Total number of files: 1
INFO:tensorflow:['./dialpad_xlnet_4/tfrecords/train-0-0.bsz-4.