In [1]:
import os
import pandas as pd
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,auc,roc_curve
import pydot # it needs Graphviz to be installed as well
# clean


In [2]:
if 'COLAB_TPU_ADDR' in os.environ:
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.TPUStrategy(cluster_resolver)
  print('Using TPU')
elif tf.config.list_physical_devices('GPU'):
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  raise ValueError('Running on CPU is not recommended.')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Using GPU


In [5]:
def importPlaysinCorpus(CorpusPath,Author):
    corpus=pd.DataFrame()
    for play in os.scandir(CorpusPath):
        if play.is_file():
            print((" importing  ") + (play.path) + (" to corpus"))
            newplay=pd.read_csv(play.path, delimiter='\r', header=None, names=['sentence_source', 'author', 'play'])
            newplay[['author']]=Author
            if 'Cleaned' in play.path:
                newplay[['play']]=os.path.basename(play.path).removesuffix('Cleaned.txt')
            else:
                newplay[['play']]=os.path.basename(play.path).removesuffix('.txt')
            corpus=pd.concat([corpus, newplay], axis = 0,join='outer')
    return corpus

In [6]:
projectCorpus=pd.DataFrame()
MarlowePath='./Corpus/Marlowe/'
ShakespearePath='./Corpus/Shakespeare/CleanedPlays/'

projectCorpus=importPlaysinCorpus(MarlowePath,'Marlowe')
projectCorpus=projectCorpus.append(importPlaysinCorpus(ShakespearePath,'Shakespeare'))
projectCorpus['sentence_source']=projectCorpus['sentence_source'].str.lower()

 importing  ./Corpus/Marlowe/Dido.txt to corpus
 importing  ./Corpus/Marlowe/DrFaustus.txt to corpus
 importing  ./Corpus/Marlowe/EdwardII.txt to corpus
 importing  ./Corpus/Marlowe/JewOfMalta.txt to corpus
 importing  ./Corpus/Marlowe/Tamburlaine1.txt to corpus
 importing  ./Corpus/Marlowe/Tamburlaine2.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/AnthonyCleopatraCleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/HenryVIIICleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/HenryVCleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/MacbethCleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/RichardIIICleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/HamletCleaned.txt to corpus


  projectCorpus=projectCorpus.append(importPlaysinCorpus(ShakespearePath,'Shakespeare'))


In [7]:
NewShakesepareCorpus=pd.DataFrame()
MarloweMean=projectCorpus.loc[projectCorpus['author'] == 'Marlowe'].play.value_counts().mean()
for play in projectCorpus.loc[projectCorpus['author'] == 'Shakespeare'].play.unique():
    selectedPlay=projectCorpus.loc[projectCorpus['play'] == play]
    if selectedPlay.value_counts().sum() > MarloweMean:
        playSample = selectedPlay.sample(int(MarloweMean))
        NewShakesepareCorpus=pd.concat([NewShakesepareCorpus, playSample], axis = 0,join='outer')

In [8]:
NewProjectCorpus=pd.DataFrame()
NewProjectCorpus=projectCorpus.loc[projectCorpus['author'] == 'Marlowe']
NewProjectCorpus=pd.concat([NewProjectCorpus, NewShakesepareCorpus], axis = 0,join='outer')
NewProjectCorpus.sample(20)

Unnamed: 0,sentence_source,author,play
2007,grant that these signs of victory we yield,Marlowe,Tamburlaine1
2087,"as wrathful planets, death, or destiny.",Marlowe,Tamburlaine1
306,"is stoutly to abjure the trinity,",Marlowe,DrFaustus
2449,"and when my soul hath virtue of your sight,",Marlowe,Tamburlaine2
3851,electronic work is discovered and reported to ...,Shakespeare,RichardIII
1914,shall mount the milk white way and meet him th...,Marlowe,Tamburlaine2
1861,like caesar's sister. the wife of antony,Shakespeare,AnthonyCleopatra
399,"but this is not the best. look, prithee, charm...",Shakespeare,AnthonyCleopatra
1378,"ill news, by’r lady; seldom comes the better.",Shakespeare,RichardIII
1704,"and ebb again, as thou depart st from me.",Marlowe,Tamburlaine2


In [9]:
#replacein the author with 0 for Marlowe and 1 for Shakespeare
NewProjectCorpus['author'].replace(['Marlowe','Shakespeare'],[0,1],inplace=True)
NewProjectCorpus.sample(20)

Unnamed: 0,sentence_source,author,play
397,"now, by my sword-",1,AnthonyCleopatra
377,"something thou hast deserved, away i say, depa...",0,Dido
2203,"they pass not for thy frowns as late they did,",0,EdwardII
1664,to make these captives rein their lavish tongu...,0,Tamburlaine1
1477,"and see where god stretcheth out his arm,",0,DrFaustus
742,"into our presence, where this heaven of beauty",1,HenryVIII
1134,"i come to join with you, and leave the king,",0,EdwardII
1866,embrace it.,1,HenryV
119,we'll know all our fortunes.,1,AnthonyCleopatra
1119,captain what require you my masters?,0,Tamburlaine2


Preprocessing

In [10]:
preProcessURL='https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoderURL='https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_preprocess_model = hub.KerasLayer(preProcessURL)

In [11]:
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
BUFFER_SIZE = 10000
BATCH_SIZE_PER_REPLICA = 64
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

Number of devices: 1


In [10]:
projectCorpus['sentence_source'].value_counts

<bound method IndexOpsMixin.value_counts of 0                               the tragedy of dido queen
1                                            of carthage.
2       here the curtains draw, there is discovered ju...
3                     ganymede upon his knee, and mercury
4                                           lying asleep.
                              ...                        
3871             take up the bodies. such a sight as this
3872        becomes the field, but here shows much amiss.
3873                          go, bid the soldiers shoot.
3874    [_exeunt, bearing off the bodies, after which ...
3875                                          shot off._]
Name: sentence_source, Length: 33721, dtype: object>

In [11]:
text_test = projectCorpus['sentence_source'].values
with strategy.scope():
    text_preprocessed = bert_preprocess_model(text_test)


In [None]:
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')


In [None]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(preProcessURL, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(encoderURL, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)


In [None]:
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))


In [None]:
tf.keras.utils.plot_model(classifier_model)


In [None]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()


In [None]:
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')
