In [2]:
import pandas as pd
import numpy as np
import json

import tensorflow as tf
from tensorflow.keras.models import Sequential


In [3]:
# IPython shell capture: each line runs `cd <dir>; ls` through the shell and
# binds the command's output lines to the name on the left.
train_files = !cd shakespeare_data/train; ls
test_files = !cd shakespeare_data/test; ls

In [4]:
test_files

['all_data_niid_0_keep_0_test_9.json']

In [5]:
# Parse the train split into a dict.
with open('shakespeare_data/train/all_data_niid_0_keep_0_train_9.json') as f:
    train_data = json.loads(f.read())

# Parse the test split into a dict.
with open('shakespeare_data/test/all_data_niid_0_keep_0_test_9.json') as f:
    test_data = json.loads(f.read())

In [6]:
# Pull the four top-level fields out of each split. The test split's 'users'
# and 'hierarchies' entries are read but discarded.
users, num_samples_train, user_data_train, hierarchies = [
    train_data[key]
    for key in ('users', 'num_samples', 'user_data', 'hierarchies')
]
_, num_samples_test, user_data_test, _ = [
    test_data[key]
    for key in ('users', 'num_samples', 'user_data', 'hierarchies')
]

In [7]:
import re

# One row per user: the play the user belongs to, the per-split sample counts,
# and integer codes for user and play (pandas category codes). The frame is
# indexed by the user name.
user_info = (
    pd.DataFrame({
        'play': hierarchies,
        'user': users,
        'num_samples_train': num_samples_train,
        'num_samples_test': num_samples_test,
    })
    .assign(
        user_id=lambda frame: frame['user'].astype('category').cat.codes,
        play_id=lambda frame: frame['play'].astype('category').cat.codes,
    )
    .set_index('user')
)

In [8]:
user_info.sort_index().head(10)

Unnamed: 0_level_0,play,num_samples_train,num_samples_test,user_id,play_id
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALLS_WELL_THAT_ENDS_WELL_BERTRAM,ALLS WELL THAT ENDS WELL,9484,1054,0,1
ALLS_WELL_THAT_ENDS_WELL_CLOWN,ALLS WELL THAT ENDS WELL,7551,840,1,1
ALLS_WELL_THAT_ENDS_WELL_COUNTESS,ALLS WELL THAT ENDS WELL,10348,1150,2,1
ALLS_WELL_THAT_ENDS_WELL_DIANA,ALLS WELL THAT ENDS WELL,4616,513,3,1
ALLS_WELL_THAT_ENDS_WELL_DUKE,ALLS WELL THAT ENDS WELL,560,63,4,1
ALLS_WELL_THAT_ENDS_WELL_FIRST_GENTLEMAN,ALLS WELL THAT ENDS WELL,309,35,5,1
ALLS_WELL_THAT_ENDS_WELL_FIRST_LORD,ALLS WELL THAT ENDS WELL,3357,373,6,1
ALLS_WELL_THAT_ENDS_WELL_FIRST_SOLDIER,ALLS WELL THAT ENDS WELL,2902,323,7,1
ALLS_WELL_THAT_ENDS_WELL_GENTLEMAN,ALLS WELL THAT ENDS WELL,559,63,8,1
ALLS_WELL_THAT_ENDS_WELL_HELENA,ALLS WELL THAT ENDS WELL,16825,1870,9,1


In [9]:
# 36 different plays
# roughly 1000 different speakers
# NOTE(review): user_id codes as high as 1124 appear in outputs further down,
# which suggests more than 1000 distinct users - confirm the exact count.
user_info['play'].value_counts()

KING RICHARD III                            52
THE SECOND PART OF KING HENRY THE SIXTH     52
THE TRAGEDY OF ANTONY AND CLEOPATRA         50
THE LIFE OF TIMON OF ATHENS                 48
THE TRAGEDY OF CORIOLANUS                   47
THE FIRST PART OF HENRY THE SIXTH           45
SECOND PART OF KING HENRY IV                44
THE LIFE OF KING HENRY THE FIFTH            42
KING HENRY THE EIGHTH                       42
THE THIRD PART OF KING HENRY THE SIXTH      40
THE TRAGEDY OF JULIUS CAESAR                40
THE TRAGEDY OF MACBETH                      38
CYMBELINE                                   36
KING RICHARD THE SECOND                     34
THE TRAGEDY OF ROMEO AND JULIET             32
THE WINTER'S TALE                           32
THE FIRST PART OF KING HENRY THE FOURTH     30
THE TRAGEDY OF HAMLET, PRINCE OF DENMARK    29
A MIDSUMMER NIGHT'S DREAM                   28
THE TAMING OF THE SHREW                     28
THE HISTORY OF TROILUS AND CRESSIDA         26
THE TRAGEDY O

In [10]:
def batch_data(data, batch_size):
    '''Cuts the samples in data into consecutive mini-batches.

    Args:
        data: dict := {'x': [list], 'y': [list]}
        batch_size: number of samples per batch; the last batch may be shorter

    Return:
        batched_x, batched_y: lists of slices of data['x'] and data['y'],
        both cut at the same offsets (offsets are derived from len(data['x']))
    '''
    xs, ys = data['x'], data['y']
    offsets = range(0, len(xs), batch_size)
    batched_x = [xs[start:start + batch_size] for start in offsets]
    batched_y = [ys[start:start + batch_size] for start in offsets]
    return batched_x, batched_y


def _load_json_dir(data_dir):
    '''Returns the parsed content of every .json file in data_dir, in sorted filename order.'''
    parsed = []
    # os.listdir returns entries in arbitrary order; sorting makes the merge
    # order (and therefore the client order) reproducible across runs.
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith('.json'):
            continue
        with open(os.path.join(data_dir, name), 'r') as inf:
            parsed.append(json.load(inf))
    return parsed


def read_data(train_data_dir, test_data_dir):
    '''parses data in given train and test data directories

    assumes:
    - the data in the input directories are .json files with
        key 'user_data' (and optionally 'hierarchies' in the train files)
    - the set of train set users is the same as the set of test set users

    Files are processed in sorted filename order. If the same user appears in
    several files, the entry from the later file replaces the earlier one.

    Return:
        clients: list of client ids (the keys of train_data, in insertion order)
        groups: list of group ids collected from the 'hierarchies' entries of
            the train files; empty list if none found
        train_data: dictionary of train data
        test_data: dictionary of test data
    '''
    groups = []
    train_data = {}
    test_data = {}

    for cdata in _load_json_dir(train_data_dir):
        if 'hierarchies' in cdata:
            groups.extend(cdata['hierarchies'])
        train_data.update(cdata['user_data'])

    for cdata in _load_json_dir(test_data_dir):
        test_data.update(cdata['user_data'])

    # The client list is defined by the merged train data.
    clients = list(train_data.keys())

    return clients, groups, train_data, test_data

In [11]:
import numpy as np
import os
import sys
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding, Dropout, Flatten, Conv1D, MaxPooling1D

#from tensorflow.contrib import rnn

#from model import Model
#from utils.language_utils import letter_to_vec, word_to_indices

"""Utils for language models."""

import json
import numpy as np
import re


# ------------------------
# utils for shakespeare dataset

ALL_LETTERS = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
NUM_LETTERS = len(ALL_LETTERS)


def _one_hot(index, size):
    """Returns one-hot vector with given size and value 1 at given index."""
    vec = [0 for _ in range(size)]
    vec[int(index)] = 1
    return vec


def letter_to_vec(letter):
    """Encodes a letter as a one-hot list of length NUM_LETTERS.

    Letters that are not in ALL_LETTERS are mapped to position 0, which is
    also the position of ' '.
    """
    position = ALL_LETTERS.find(letter)
    if position < 0:
        # find() reports a miss as -1; fold it onto the ' ' slot
        position = 0
    return _one_hot(position, NUM_LETTERS)

def letter_to_index(letter):
    """Returns the integer position of given letter in ALL_LETTERS.

    Letters that are not in ALL_LETTERS map to 0, the same index as ' '
    (i.e. ' ' doubles as the unknown character).
    """
    # find() returns -1 on a miss; clamp it to the ' ' slot
    return max(0, ALL_LETTERS.find(letter))


def word_to_indices(word):
    '''maps every character of a string to its position in ALL_LETTERS

    Characters that are not in ALL_LETTERS are mapped to 0.

    Args:
        word: string

    Return:
        indices: int list with length len(word)
    '''
    # max(0, ...) turns the -1 that find() returns on a miss into 0
    return [max(0, ALL_LETTERS.find(ch)) for ch in word]


# ------------------------
# utils for sent140 dataset


def split_line(line):
    """Tokenizes a phrase into words and punctuation marks.

    A token is either a run of word characters/apostrophes, or a single one
    of the characters . , ! ? ;

    Args:
        line: string representing phrase to be split

    Return:
        list of strings, one per token, in order of appearance
    """
    token_pattern = re.compile(r"[\w']+|[.,!?;]")
    return token_pattern.findall(line)


def _word_to_index(word, indd):
    """Returns index of given word based on given lookup dictionary

    returns the length of the lookup dictionary if word not found

    Args:
        word: string
        indd: dictionary with string words as keys and int indices as values
    """
    if word in indd:
        return indd[word]
    else:
        return len(indd)


def line_to_indices(line, indd, max_words=25):
    """Turns a phrase into a list of word indices, truncated or padded

    The phrase is tokenized with split_line and each token is looked up with
    _word_to_index. Collection stops once max_words indices have been
    gathered. If fewer than max_words were found, the list is padded with
    len(indd) (the index used for unknown words) up to max_words entries.

    Args:
        line: string representing phrase/sequence of words
        indd: dictionary with string words as keys and int indices as values
        max_words: maximum number of word indices in returned list

    Return:
        indl: list of word indices, one index for each word in phrase
    """
    pad_index = len(indd)
    indl = []
    for token in split_line(line):
        indl.append(_word_to_index(token, indd))
        if len(indl) == max_words:
            break
    # a non-positive repeat count yields an empty list, so nothing is added
    # when the list is already long enough
    indl.extend([pad_index] * (max_words - len(indl)))
    return indl


def bag_of_words(line, vocab):
    """Counts how often each vocabulary word occurs in the given phrase.

    Args:
        line: string representing phrase to be parsed
        vocab: dictionary with words as keys and indices as values

    Return:
        integer list of length len(vocab); entry vocab[w] holds the number of
        occurrences of w. Tokens that are not in vocab are ignored.
    """
    counts = [0] * len(vocab)
    known_tokens = (tok for tok in split_line(line) if tok in vocab)
    for tok in known_tokens:
        counts[vocab[tok]] += 1
    return counts


def get_word_emb_arr(path):
    """Loads word embeddings and their vocabulary from a JSON file.

    Args:
        path: path of a JSON file with the keys 'vocab' (list of words) and
            'emba' (embedding values)

    Return:
        word_emb_arr: np.array built from the 'emba' entry
        indd: dictionary mapping each word to its position in 'vocab'
        vocab: a second, separate dictionary with the same mapping
    """
    with open(path, 'r') as inf:
        embs = json.load(inf)
    words = embs['vocab']
    word_emb_arr = np.array(embs['emba'])
    indd = dict(zip(words, range(len(words))))
    vocab = dict(indd)
    return word_emb_arr, indd, vocab


def val_to_vec(size, val):
    """Converts target into one-hot.

    Args:
        size: Size of vector.
        val: Value in the half-open range [0, size); it is converted with
            int() before being used as the position of the 1.
    Returns:
         vec: one-hot vector with a 1 in the val element.
    Raises:
        AssertionError: if val is outside [0, size).
    """
    assert 0 <= val < size
    vec = [0] * size
    vec[int(val)] = 1
    return vec

In [12]:
class ClientModel():
    """Character-level next-letter model written against TensorFlow 1.x graph APIs.

    NOTE(review): create_model references `rnn` and `self.optimizer`. In the
    visible notebook the `rnn` import is commented out and __init__ never sets
    `optimizer`, so create_model presumably fails as written - confirm before
    using it.
    """

    def __init__(self, lr, seq_len, num_classes, n_hidden):
        """Stores the hyper-parameters; no graph is built here.

        Args:
            lr: stored as self.lr (not read anywhere else in this class).
            seq_len: second dimension of the features placeholder.
            num_classes: number of embedding rows, width of the labels
                placeholder and number of output units.
            n_hidden: size passed to each BasicLSTMCell.
        """
        self.lr = lr
        self.seq_len = seq_len
        self.num_classes = num_classes
        self.n_hidden = n_hidden
        

    def create_model(self):
        """Builds the graph: embedding -> 2 stacked LSTM cells -> dense logits.

        Returns:
            features: int32 placeholder of shape [None, seq_len].
            labels: int32 placeholder of shape [None, num_classes].
            train_op: result of self.optimizer.minimize on the mean softmax
                cross-entropy loss.
            eval_metric_ops: count of rows where the argmax of the logits
                equals the argmax of the labels.
        """
        features = tf.placeholder(tf.int32, [None, self.seq_len])
        # embedding table with one 8-dimensional row per class
        embedding = tf.get_variable("embedding", [self.num_classes, 8])
        x = tf.nn.embedding_lookup(embedding, features)
        labels = tf.placeholder(tf.int32, [None, self.num_classes])
        
        stacked_lstm = rnn.MultiRNNCell(
            [rnn.BasicLSTMCell(self.n_hidden) for _ in range(2)])
        outputs, _ = tf.nn.dynamic_rnn(stacked_lstm, x, dtype=tf.float32)
        # only the output at the last position of axis 1 feeds the dense layer
        pred = tf.layers.dense(inputs=outputs[:,-1,:], units=self.num_classes)
        
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=labels))
        train_op = self.optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())

        correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(labels, 1))
        eval_metric_ops = tf.count_nonzero(correct_pred)

        return features, labels, train_op, eval_metric_ops

    def process_x(self, raw_x_batch):
        """Converts each string of the batch with word_to_indices and returns the result as an np.array."""
        x_batch = [word_to_indices(word) for word in raw_x_batch]
        x_batch = np.array(x_batch)
        return x_batch

    def process_y(self, raw_y_batch):
        """Converts each element of the batch with letter_to_vec and returns the results as a list."""
        y_batch = [letter_to_vec(c) for c in raw_y_batch]
        return y_batch

In [None]:
#model = ClientModel(0.001, 80,)

In [13]:
# `data` is the 4-tuple returned by read_data:
# (clients, groups, train_data, test_data)
data = read_data(
    train_data_dir='shakespeare_data/train',
    test_data_dir='shakespeare_data/test',
)

In [1]:
len(data[2]['KING_RICHARD_III_SON']['x'])

NameError: name 'data' is not defined

In [67]:
data[2]['TWELFTH_NIGHT__OR__WHAT_YOU_WILL_SECOND_OFFICER']['x']

['Antonio, I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, ',
 'ntonio, I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I',
 'tonio, I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I ',
 'onio, I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I p',
 'nio, I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I pr',
 'io, I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I pra',
 'o, I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I pray',
 ', I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I pray ',
 ' I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I pray y',
 'I arrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I pray yo',
 'rrest thee at the suit Of Count Orsino. Come, sir, away. Come, sir, I pray you g',
 'rest thee at the suit Of Count Orsino. Come, sir, away. Come, s

In [270]:
data[2]['TWELFTH_NIGHT__OR__WHAT_YOU_WILL_SECOND_OFFICER']['y']

['I', ' ', 'p', 'r', 'a', 'y', ' ', 'y', 'o', 'u', 'o', '.', ' ']

In [94]:
len(user_data_train['THE_TRAGEDY_OF_CORIOLANUS_FIRST_LORD']['x'])

334

In [14]:
def data_to_vec(data):
    """Applies letter_to_vec to every character of every string in data.

    Each string becomes its own np.array; the per-string arrays are then
    wrapped in an outer np.array.
    """
    return np.array([
        np.array([letter_to_vec(ch) for ch in sent])
        for sent in data
    ])

def data_to_index(data):
    """Applies letter_to_index to every character of every string in data.

    Each string becomes its own np.array of indices; the per-string arrays
    are then wrapped in an outer np.array.
    """
    return np.array([
        np.array([letter_to_index(ch) for ch in sent])
        for sent in data
    ])

In [239]:
user_info.head()

Unnamed: 0_level_0,play,num_samples_train,num_samples_test,user_id,play_id
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TWELFTH_NIGHT__OR__WHAT_YOU_WILL_SECOND_OFFICER,"TWELFTH NIGHT; OR, WHAT YOU WILL",13,2,1124,35
THE_TRAGEDY_OF_CORIOLANUS_FIRST_LORD,THE TRAGEDY OF CORIOLANUS,334,38,819,25
THE_FIRST_PART_OF_HENRY_THE_SIXTH_COUNTESS,THE FIRST PART OF HENRY THE SIXTH,1736,193,393,13
THE_TRAGEDY_OF_JULIUS_CAESAR_ARTEMIDORUS,THE TRAGEDY OF JULIUS CAESAR,639,72,884,27
THE_SECOND_PART_OF_KING_HENRY_THE_SIXTH_WARWICK,THE SECOND PART OF KING HENRY THE SIXTH,5184,576,664,20


In [151]:
user_info.loc['TWELFTH_NIGHT__OR__WHAT_YOU_WILL_SECOND_OFFICER']

play                 TWELFTH NIGHT; OR, WHAT YOU WILL
num_samples_train                                  13
num_samples_test                                    2
user_id                                          1124
play_id                                            35
Name: TWELFTH_NIGHT__OR__WHAT_YOU_WILL_SECOND_OFFICER, dtype: object

In [1]:
user_info.shape

NameError: name 'user_info' is not defined

In [307]:
# Number of clients, taken in the order returned by read_data, that are
# folded into the small dataset.
N_SMALL_DATASET_USERS = 100

users_train, users_test = [], []
plays_train, plays_test = [], []
x_data_train, x_data_test = [], []
y_data_train, y_data_test = [], []

for i in range(N_SMALL_DATASET_USERS):
    user = data[0][i]
    # Per-user names are deliberately different from the notebook-level
    # `train_data`, `test_data`, `num_samples_train` and `num_samples_test`,
    # so running this cell does not overwrite the values that the earlier
    # cells (e.g. the user_info construction) depend on.
    user_samples_train = data[2][user]
    user_samples_test = data[3][user]

    # Look the ids up by column name rather than by column position.
    user_row = user_info.loc[user]
    user_id = user_row['user_id']
    play_id = user_row['play_id']

    x_train_user = data_to_index(user_samples_train['x'])
    x_test_user = data_to_index(user_samples_test['x'])

    # One id per encoded sample; taking the count from the encoded inputs
    # keeps the id arrays aligned with x_data_* row for row.
    users_train.append(np.array([user_id] * len(x_train_user)))
    users_test.append(np.array([user_id] * len(x_test_user)))
    plays_train.append(np.array([play_id] * len(x_train_user)))
    plays_test.append(np.array([play_id] * len(x_test_user)))
    x_data_train.append(x_train_user)
    x_data_test.append(x_test_user)
    y_data_train.append(data_to_index(user_samples_train['y']))
    y_data_test.append(data_to_index(user_samples_test['y']))

# Stack the per-user pieces into one array per quantity.
users_train = np.concatenate(users_train)
users_test = np.concatenate(users_test)
plays_train = np.concatenate(plays_train)
plays_test = np.concatenate(plays_test)
x_data_train = np.concatenate(x_data_train)
x_data_test = np.concatenate(x_data_test)
y_data_train = np.concatenate(y_data_train)
y_data_test = np.concatenate(y_data_test)


In [308]:
# Persist the arrays of the small dataset. File-name convention used here:
#   x_*.npy / y_*.npy  <- x_data_* / y_data_*
#   gid_*.npy          <- users_*  (per-sample user ids)
#   gid2_*.npy         <- plays_*  (per-sample play ids)
np.save('shakespeare_data/small_dataset/x_train.npy', x_data_train)
np.save('shakespeare_data/small_dataset/y_train.npy', y_data_train)
np.save('shakespeare_data/small_dataset/x_test.npy', x_data_test)
np.save('shakespeare_data/small_dataset/y_test.npy', y_data_test)
np.save('shakespeare_data/small_dataset/gid_train.npy', users_train)
np.save('shakespeare_data/small_dataset/gid_test.npy', users_test )
np.save('shakespeare_data/small_dataset/gid2_train.npy', plays_train)
np.save('shakespeare_data/small_dataset/gid2_test.npy', plays_test )

In [309]:
# Store the user/play lookup table next to the dataset folders.
user_info.to_pickle('shakespeare_data/user_info.pickle')

In [15]:
# Reload the small dataset. Each name is bound to the file it was saved to
# by the np.save cell:
#   x_data_* / y_data_*  <- x_*.npy / y_*.npy
#   users_*              <- gid_*.npy
#   plays_*              <- gid2_*.npy
x_data_train = np.load('shakespeare_data/small_dataset/x_train.npy')
y_data_train = np.load('shakespeare_data/small_dataset/y_train.npy')
x_data_test = np.load('shakespeare_data/small_dataset/x_test.npy')
y_data_test = np.load('shakespeare_data/small_dataset/y_test.npy')
users_train = np.load('shakespeare_data/small_dataset/gid_train.npy')
users_test = np.load('shakespeare_data/small_dataset/gid_test.npy')
plays_train = np.load('shakespeare_data/small_dataset/gid2_train.npy')
plays_test = np.load('shakespeare_data/small_dataset/gid2_test.npy')

In [198]:
#y_data = np.squeeze(data_to_vec(user_data_train['THE_TRAGEDY_OF_CORIOLANUS_FIRST_LORD']['y']))
#x_data = data_to_vec(user_data_train['THE_TRAGEDY_OF_CORIOLANUS_FIRST_LORD']['x'])

In [284]:
x_data_test.shape

(41097, 80)

In [109]:
x_data.shape, y_data.shape

((334, 80, 53), (334, 53))

In [187]:
from scipy.sparse import csr_matrix

In [186]:
x_data_train

dtype('int64')

In [25]:
# Recurrent variant: embedding -> two stacked LSTMs -> dense head that ends
# in a softmax over the 53 character classes.
model = Sequential()
model.add(Embedding(53, 128, input_length=80))
model.add(LSTM(256, return_sequences=True))  # hands the full sequence to the next LSTM
model.add(LSTM(256))
model.add(Dense(256, activation='relu'))
model.add(Dense(53, activation='softmax'))

In [600]:
# Convolutional variant: embedding -> three Conv1D/MaxPooling1D stages ->
# dense head over the 53 character classes.
model = Sequential([
    Embedding(53, 256, input_length=80),
    Conv1D(256, 5, activation='relu'),
    MaxPooling1D(2),
    Conv1D(512, 5, activation='relu'),
    MaxPooling1D(2),
    Conv1D(1024, 3, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    # The compile cell uses SparseCategoricalCrossentropy() with its default
    # from_logits=False, i.e. the loss expects probabilities. The output layer
    # therefore applies a softmax, like the LSTM variant does.
    Dense(53, activation='softmax'),
])

In [26]:
# Targets are integer class ids, hence the sparse categorical cross-entropy;
# accuracy is tracked under the name 'acc'.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=4e-4),
    metrics=['acc'])

In [33]:
[10]

[10]

In [27]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 80, 128)           6784      
_________________________________________________________________
lstm_10 (LSTM)               (None, 80, 256)           394240    
_________________________________________________________________
lstm_11 (LSTM)               (None, 256)               525312    
_________________________________________________________________
dense_10 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_11 (Dense)             (None, 53)                13621     
Total params: 1,005,749
Trainable params: 1,005,749
Non-trainable params: 0
_________________________________________________________________


In [598]:
# A single pass over the training arrays in shuffled batches of 500.
model.fit(
    x_data_train,
    y_data_train,
    batch_size=500,
    epochs=1,
    shuffle=True,
)

Train on 369430 samples

KeyboardInterrupt: 

In [None]:
# Loss and metrics of the current model on the held-out arrays.
model.evaluate(x=x_data_test, y=y_data_test)

In [316]:
user_info[user_info['user_id']==0]

Unnamed: 0_level_0,play,num_samples_train,num_samples_test,user_id,play_id
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALLS_WELL_THAT_ENDS_WELL_BERTRAM,ALLS WELL THAT ENDS WELL,9484,1054,0,1


In [333]:
data[2]['ALLS_WELL_THAT_ENDS_WELL_BERTRAM']['x'][20]

" weep o'er my father's death anew; but I must attend his Majesty's command, to w"

In [341]:
data[3]['ALLS_WELL_THAT_ENDS_WELL_BERTRAM']['x'][0]

", madam, weep o'er my father's death anew; but I must attend his Majesty's comma"

In [324]:
''.join(map(get_letter, x_data_train[0]))

'Antonio  I arrest thee at the suit Of Count Orsino  Come  sir  away  Come  sir  '

In [321]:
def get_letter(x):
    """Returns the character stored at position x of ALL_LETTERS.

    This reverses letter_to_index for characters that are part of
    ALL_LETTERS. Characters outside the alphabet were encoded as 0 and
    therefore come back as ' '.
    """
    return ALL_LETTERS[x]

In [342]:
len(data)

4

In [None]:
# NOTE(review): read_data is defined in this notebook with two required
# parameters (train_data_dir, test_data_dir); called without arguments, as
# here, it raises a TypeError. Presumably a leftover scratch cell - confirm.
read_data()