This notebook prepares tensorflow datasets for training as well as the outputs from data pre-processing and saves in the `Data` directory

In [1]:
#Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import os
from collections import Counter
import json
from pathlib import Path
from datetime import datetime
import pickle

#Tensorflow impocarts
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import CSVLogger

Define functions that prepare the data:

In [2]:
def filter_num_ticks(df_input, hyper_params, cuDNN=False):
    
    #This function filters dataframe rows by the number of ticks
    
    #Length params from hyper_params
    max_sen_len = hyper_params['max_sen_len']
    ntick_cutoff = hyper_params['ntick_cutoff']
    
    #Get rid of rows where count(,) < max_sen_len-1
    df_input = df_input[df_input.route_id.str.count(',') >= max_sen_len-1]

    #trruncation function
    def trunc(x, max_sen_len): #func to truncate
        return ' '.join(x.split(',')[-max_sen_len::])

    #Truncate all string to be same length
    for col in df_input.columns:
        if col != 'user_id':
            df_input[col] = df_input[col].apply(lambda x: trunc(x, max_sen_len))

    #Reset the dataframe index
    ouput_index = df_input.index
    df_input.reset_index(inplace=True)
    df_input.drop('index',axis=1,inplace=True)
    
    return df_input, ouput_index

def fit_tokenizer(text_to_fit, num_vals):
    
    #Parameters for tokenization
    oov_token = '<UNK>' #this is the token for out of vocab

    #Tokenize dataset
    tokenizer_input = Tokenizer(num_words=num_vals, oov_token=oov_token)
    tokenizer_input.fit_on_texts(text_to_fit)

    #Get the word index dictionary
    word_index = tokenizer_input.word_index
    vocab_size = len(word_index)+1

    
    return tokenizer_input, word_index, vocab_size

def generate_train_val_data_select(df_input, hyper_params, vocab_size, word_index, tokenizer, features, cuDNN=True):   
    
    
    #Parameters from hyper_params
    ntick_cutoff = hyper_params['ntick_cutoff']
    batch_size = hyper_params['batch_size']
    max_sen_len = hyper_params['max_sen_len']
    # trainsize = hyper_params['test_cutoff']
    nyears = hyper_params['nyears'] 
    binsize = hyper_params['binsize']


    #Filter by number of ticks
    df_input, ouput_index = filter_num_ticks(df_input, hyper_params, cuDNN=cuDNN)


    #Now generate "text files" from dataframe...
    text_route = [] #each of these is just a list of strings, where L = users
    text_ndays = []
    text_rating_yds = []
    text_rating_vscale = []
    text_rating_misc = []
    text_routetype = []

    for i in range(len(df_input.index)):
        text_route.append(df_input.route_id[i])
        text_ndays.append(df_input.ndays[i])
        text_rating_yds.append(df_input.rating_yds[i])
        text_rating_vscale.append(df_input.rating_vscale[i])
        text_rating_misc.append(df_input.rating_misc[i])
        text_routetype.append(df_input.routetype[i])


    #Get total number of records 
    n_records = len(text_route)

    #Tokenize on rating_yds
    tokenizer_rating_yds, word_index_rating_yds, vocab_size_rating_yds = fit_tokenizer(text_rating_yds, 20)

    #Tokenize on rating_vscale
    tokenizer_rating_vscale, word_index_rating_vscale, vocab_size_rating_vscale = fit_tokenizer(text_rating_vscale, 20)

    #Tokenize on rating_misc
    tokenizer_rating_misc, word_index_rating_misc, vocab_size_rating_misc = fit_tokenizer(text_rating_misc, 800)

    #Tokenize on routetype
    tokenizer_routetype, word_index_routetype, vocab_size_routetype = fit_tokenizer(text_routetype, 120)

    #Generate feature and target sentences (shifted)
    feature_text_route = [" ".join(sen.split(' ')[0:-1]) for sen in text_route]
    feature_text_ndays = [" ".join(sen.split(' ')[0:-1]) for sen in text_ndays]
    feature_text_rating_yds = [" ".join(sen.split(' ')[0:-1]) for sen in text_rating_yds]
    feature_text_rating_vscale = [" ".join(sen.split(' ')[0:-1]) for sen in text_rating_vscale]
    feature_text_rating_misc = [" ".join(sen.split(' ')[0:-1]) for sen in text_rating_misc]
    feature_text_routetype = [" ".join(sen.split(' ')[0:-1]) for sen in text_routetype]
    target_text = [" ".join(sen.split(' ')[1::]) for sen in text_route]

    #Produce bins
    bins = np.arange(0, nyears*365 + binsize, binsize, dtype=int) #produce bins
    vocab_size_bins = len(bins) + 1

    #Bin feature_text_ndays in sequences directly
    feature_seq_ndays = [[np.digitize(y, bins, right=False) for y in [int(x) for x in z.split(' ')]] for z in feature_text_ndays]    

    #Directly convert to matricies
    feature_seq_route = np.array(tokenizer.texts_to_sequences(feature_text_route)) #array remove need to use padding
    feature_seq_ndays = np.array(feature_seq_ndays)
    feature_seq_rating_yds = np.array(tokenizer_rating_yds.texts_to_sequences(feature_text_rating_yds)) #array remove need to use padding
    feature_seq_rating_vscale = np.array(tokenizer_rating_vscale.texts_to_sequences(feature_text_rating_vscale)) #array remove need to use padding
#     feature_seq_rating_misc = np.array(tokenizer_rating_misc.texts_to_sequences(feature_text_rating_misc)) #array remove need to use padding
    feature_seq_routetype = np.array(tokenizer_routetype.texts_to_sequences(feature_text_routetype)) #array remove need to use padding
    target_seq = np.array(tokenizer.texts_to_sequences(target_text))

    #Make feature dictionary
    if features == 'route_id':
        feature_dict = feature_seq_route
    
    if features == 'ndays':
        feature_dict = {'route_id': feature_seq_route,
                        'ndays': feature_seq_ndays}
        
    if features == 'all':
        feature_dict = {'route_id': feature_seq_route,
                        'ndays': feature_seq_ndays,
                        'rating_yds': feature_seq_rating_yds,
                        'rating_vscale': feature_seq_rating_vscale, #'rating_misc': feature_seq_rating_misc,
                        'routetype': feature_seq_routetype}


    # Make a dataset tensor for training and one for validation
    dataset_size = feature_seq_route.shape[0] #number of records
    train_size = dataset_size * 90 // 100
    dataset = tf.data.Dataset.from_tensor_slices((feature_dict, target_seq)) #all data, shuffled perfectly

    # #Split into train and val
    # train_set_output = dataset.take(train_size) 
    # val_set_output = dataset.skip(train_size)
    
    
    vocab = {
        'vocab_size_route': vocab_size,
        'vocab_size_bins': vocab_size_bins,
        'vocab_size_yds': 20,
        'vocab_size_vscale': 20,
        'vocab_size_misc': 800,
        'vocab_size_type': 120
    }
    
    tokenizer_dict = {
        'tokenizer_route': tokenizer,
        'tokenizer_rating_yds': tokenizer_rating_yds,
        'tokenizer_rating_vscale': tokenizer_rating_vscale,
        'tokenizer_rating_misc': tokenizer_rating_misc,
        'tokenizer_routetype': tokenizer_routetype
        
    }

    return dataset, vocab, word_index, tokenizer_dict, n_records, ouput_index

Read raw data, load previously prepared tokenizer and hyper_params

In [3]:
#Read data
df = pd.read_json('tickdata_format_allfeatures.json')

#Load tokenizer, word_index, vocab_size:
import pickle 
fp_tokenizer = 'tokenizer_cuDNN_230203-023518.pickle'
fp_wordindex = 'wordindex_cuDNN_230203-023518.json'
with open(fp_tokenizer, "rb") as fp:
    tokenizer = pickle.load(fp)
with open(fp_wordindex, "rb") as fp:
    word_index = json.load(fp)
vocab_size = len(word_index) + 1

#Define hyper_params:
hyper_params= {
    'ntick_cutoff': 50,        #min number of ticks to use a profile (only for non-cuDNN)
    'batch_size': 32,          #obvious
    'max_sen_len': 50,         #max length of sentence (number of ticks to use for cuDNN)
    'binsize': 30,             #size of bins for binning ndays in days
    'nyears': 20,              #max number of years to look-back
}

In [4]:
#Generate datasets
dataset_routes, vocab, word_index, tokenizer_dict, n_records, ouput_index = generate_train_val_data_select(df,
                                                                                     hyper_params,
                                                                                     vocab_size,
                                                                                     word_index,
                                                                                     tokenizer,
                                                                                     features='route_id',
                                                                                     cuDNN=True)
dataset_ndays, vocab, word_index, tokenizer_dict, n_records, ouput_index = generate_train_val_data_select(df,
                                                                                     hyper_params,
                                                                                     vocab_size,
                                                                                     word_index,
                                                                                     tokenizer,
                                                                                     features='ndays',
                                                                                     cuDNN=True)
dataset_all, vocab, word_index, tokenizer_dict, n_records, ouput_index = generate_train_val_data_select(df,
                                                                                     hyper_params,
                                                                                     vocab_size,
                                                                                     word_index,
                                                                                     tokenizer,
                                                                                     features='all',
                                                                                     cuDNN=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input[col] = df_input[col].apply(lambda x: trunc(x, max_sen_len))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input.drop('index',axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_input[col] = df_input[col].apply(lambda x: trunc(x, max_sen_len))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveat

In [14]:
#Split into train/val and batch/prefetch
val_set_route = dataset_routes.take(n_records//10).batch(hyper_params['batch_size']).prefetch(1)
val_set_nday = dataset_ndays.take(n_records//10).batch(hyper_params['batch_size']).prefetch(1)
val_set_all = dataset_all.take(n_records//10).batch(hyper_params['batch_size']).prefetch(1)

train_set_route = dataset_routes.skip(n_records//10).batch(hyper_params['batch_size']).prefetch(1)
train_set_nday = dataset_ndays.skip(n_records//10).batch(hyper_params['batch_size']).prefetch(1)
train_set_all = dataset_all.skip(n_records//10).batch(hyper_params['batch_size']).prefetch(1)



Save the tensor data and other outputs to files in the Data directory:

In [15]:
#Save training/test sets and output index 
train_set_route.save('Data/train_set_route')
val_set_route.save('Data/val_set_route')
train_set_nday.save('Data/train_set_nday')
val_set_nday.save('Data/val_set_nday')
train_set_all.save('Data/train_set_all')
val_set_all.save('Data/val_set_all')


    
#Save hyper_params as a separate file as well, and vocab, tokenizer_dict
with open('Data/hyper_params.json', 'w') as fp:
    json.dump(hyper_params, fp)
    
with open('Data/output_index.pickle', 'wb') as fp:
    pickle.dump(ouput_index, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('Data/tokenizer_route.pickle', "wb") as fp:
    pickle.dump(tokenizer, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('Data/word_index.json', "w") as fp:
    json.dump(word_index, fp)
        
with open('Data/vocab.json', "w") as fp:
    json.dump(vocab, fp)
    
with open('Data/tokenizer_dict.pickle', "wb") as fp:
    pickle.dump(tokenizer_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
    