In [1]:
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,auc,roc_curve
import clean

  from .autonotebook import tqdm as notebook_tqdm


Function for cleaning Shakespeare plays, using the custom clean module 

In [2]:
def cleanShakespeareText(filePath):
    with open(filePath) as file:
        lines = file.readlines()
        file.close()
    text=[]

    for line in lines:
        if line =='\n':
            continue
        
        line=clean.removeSpeakerName(line) #remove speaker's name from  each line
        line=clean.removeLinesBasedOnWords(['ACT', 'SCENE'],line) # removing lines with the words ACTS and Scenes
        line=clean.removeWords(['1','2','3','4','5','6','7','8','9'],line)
        line=clean.removeLinesBasedOnWords(['Exit'],line)
        line=clean.removeLinesBasedOnWords(['Enter'],line)
        line=clean.remove_words_in_brackets(line)
        line=clean.removeWhitespace(line)
        text.append(line)
    return text    

Function for the creation of the clean plays

In [3]:
def cleanFilesinPath(playPath,cleanFilesPath):
    for play in os.scandir(playPath):
        if play.is_file():
            print((" cleaning  ") + (play.name))
            cleanedPlay=cleanShakespeareText(play.path)
            
            with open(cleanFilesPath+play.name.removesuffix('.txt')+'Cleaned'+'.txt', 'w') as cleanedFile:
                
                cleanedFile.write("\n".join(str(item) for item in cleanedPlay))
                

In [None]:
OriginalShakespearePath='./Corpus/Shakespeare/'
CleanShakespearePath='./Corpus/Shakespeare/CleanedPlays/'
cleanFilesinPath(OriginalShakespearePath,CleanShakespearePath)

Function for the import of the selected plays from each author

In [4]:

def importPlaysinCorpus(CorpusPath,Author):
    corpus=pd.DataFrame()
    for play in os.scandir(CorpusPath):
        if play.is_file():
            print((" importing  ") + (play.path) + (" to corpus"))
            newplay=pd.read_csv(play.path, delimiter='\r', header=None, names=['sentence_source', 'author', 'play'])
            newplay[['author']]=Author
            if 'Cleaned' in play.path:
                newplay[['play']]=os.path.basename(play.path).removesuffix('Cleaned.txt')
            else:
                newplay[['play']]=os.path.basename(play.path).removesuffix('.txt')
            corpus=pd.concat([corpus, newplay], axis = 0,join='outer')
    return corpus

In [7]:
projectCorpus=pd.DataFrame()
MarlowePath='./Corpus/Marlowe/'
ShakespearePath='./Corpus/Shakespeare/CleanedPlays/'

projectCorpus=importPlaysinCorpus(MarlowePath,'Marlowe')
projectCorpus=projectCorpus.append(importPlaysinCorpus(ShakespearePath,'Shakespeare'))
projectCorpus['sentence_source']=projectCorpus['sentence_source'].str.lower()

 importing  ./Corpus/Marlowe/Dido.txt to corpus
 importing  ./Corpus/Marlowe/DrFaustus.txt to corpus
 importing  ./Corpus/Marlowe/EdwardII.txt to corpus
 importing  ./Corpus/Marlowe/JewOfMalta.txt to corpus
 importing  ./Corpus/Marlowe/Tamburlaine1.txt to corpus
 importing  ./Corpus/Marlowe/Tamburlaine2.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/AnthonyCleopatraCleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/HenryVIIICleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/HenryVCleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/MacbethCleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/RichardIIICleaned.txt to corpus
 importing  ./Corpus/Shakespeare/CleanedPlays/HamletCleaned.txt to corpus


  projectCorpus=projectCorpus.append(importPlaysinCorpus(ShakespearePath,'Shakespeare'))


In [6]:
print("Marlowe median rows ",  projectCorpus.loc[projectCorpus['author'] == 'Marlowe'].play.value_counts().median())
print("Shakespeare median rows ",  projectCorpus.loc[projectCorpus['author'] == 'Shakespeare'].play.value_counts().median())

Marlowe median rows  2495.5
Shakespeare median rows  3578.5


Taking a sample of each play based on Marlowe's plays line median

In [8]:
NewShakesepareCorpus=pd.DataFrame()
MarloweMean=projectCorpus.loc[projectCorpus['author'] == 'Marlowe'].play.value_counts().mean()
for play in projectCorpus.loc[projectCorpus['author'] == 'Shakespeare'].play.unique():
    selectedPlay=projectCorpus.loc[projectCorpus['play'] == play]
    if selectedPlay.value_counts().sum() > MarloweMean:
        playSample = selectedPlay.sample(int(MarloweMean))
        NewShakesepareCorpus=pd.concat([NewShakesepareCorpus, playSample], axis = 0,join='outer')

In [9]:
NewProjectCorpus=pd.DataFrame()
NewProjectCorpus=projectCorpus.loc[projectCorpus['author'] == 'Marlowe']
NewProjectCorpus=pd.concat([NewProjectCorpus, NewShakesepareCorpus], axis = 0,join='outer')
NewProjectCorpus.sample(20)

Unnamed: 0,sentence_source,author,play
581,unsifted in such perilous circumstance.,Shakespeare,Hamlet
1233,let’s away to him.exeunt.,Marlowe,DrFaustus
2612,"did you attempt his rescue, edmund",Marlowe,EdwardII
1790,whereon his brains still beating puts him thus,Shakespeare,Hamlet
1656,than live in infamy under such a king.,Marlowe,EdwardII
853,"ay, by heaven, my lord.",Shakespeare,Hamlet
1147,his horses go about.,Shakespeare,Macbeth
3404,"thought thy bride-bed to have deck’d, sweet maid,",Shakespeare,Hamlet
3438,"which we have not done neither; that, fear,",Shakespeare,HenryVIII
2429,"have knit again, and fleet, threat'ning most s...",Shakespeare,AnthonyCleopatra


Function to count words in each play

In [5]:
def wordsPerPlay(corpus):
    columns=['Play', 'Words Sum', 'Author']
    PlayList=[]
    for Play in corpus['play'].unique():
            PlayInLoop=(corpus.loc[corpus['play'] == Play])
            PlaySum = sum(PlayInLoop['sentence_source'].str.count('\w+'))
            PlayName = Play
            PlayAuthor = PlayInLoop['author'].unique()[0]
            PlayList.append([PlayName,PlaySum,PlayAuthor])
            
    WordsPerPlay=pd.DataFrame(PlayList, columns=columns)
    return WordsPerPlay

In [11]:
corpusStats=wordsPerPlay(projectCorpus)
newCorpusStats=wordsPerPlay(NewProjectCorpus)
print("Old Corpus:\n",corpusStats,"\n","New Corpus:\n",newCorpusStats)
print("Old Corpus Mean:\n",corpusStats['Words Sum'].mean(),"\n","New Corpus Mean:\n",newCorpusStats['Words Sum'].mean())

Old Corpus:
                 Play  Words Sum       Author
0               Dido      14003      Marlowe
1          DrFaustus      12040      Marlowe
2           EdwardII      21164      Marlowe
3         JewOfMalta      19328      Marlowe
4       Tamburlaine1      17666      Marlowe
5       Tamburlaine2      17921      Marlowe
6   AnthonyCleopatra      24163  Shakespeare
7          HenryVIII      24230  Shakespeare
8             HenryV      28687  Shakespeare
9            Macbeth      16859  Shakespeare
10        RichardIII      30886  Shakespeare
11            Hamlet      29958  Shakespeare 
 New Corpus:
                 Play  Words Sum       Author
0               Dido      14003      Marlowe
1          DrFaustus      12040      Marlowe
2           EdwardII      21164      Marlowe
3         JewOfMalta      19328      Marlowe
4       Tamburlaine1      17666      Marlowe
5       Tamburlaine2      17921      Marlowe
6   AnthonyCleopatra      14000  Shakespeare
7          HenryVIII      1

In [12]:
print("Shakespeare corpus mean before sampling",corpusStats.loc[corpusStats['Author'] == 'Shakespeare']['Words Sum'].mean(),"Shakespeare corpus mean after sampling",newCorpusStats.loc[corpusStats['Author'] == 'Shakespeare']['Words Sum'].mean())

Shakespeare corpus mean before sampling 25797.166666666668 Shakespeare corpus mean after sampling 16128.833333333334


<p1> Preparing the data for tokenization, splitting them and  data </p1>

In [10]:

# Loading the pre-trained BERT model and tokenizer
model = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model)
MarShpeare = BertForSequenceClassification.from_pretrained(model, num_labels=1)

input_ids = [tokenizer.encode(sent, add_special_tokens=True,max_length=100,pad_to_max_length=True) for sent in NewProjectCorpus]
# Preparing the data

X = [str(i) for i in NewProjectCorpus['sentence_source'].values]       # comma separate the sentences in the pandas column 'sentence_source'

NewProjectCorpus['author'].replace(['Marlowe','Shakespeare'],[0,1],inplace=True)       # replace authors with numbers in the pandas column 'author' as tensors accept numerical values only

y = [float(i) for i in NewProjectCorpus['author'].values]                # comma separate the integers representing the authors, the backward function expects a float

# Tokenize and format the input sentences
X_tokenized = tokenizer(X, padding=True, truncation=True, max_length=100, return_tensors='pt')

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tokenized['input_ids'], y, test_size=0.2, random_state=42)

# Convert the data to PyTorch tensors
X_train = torch.tensor(X_train)
X_test = torch.tensor(X_test)
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)

# Create the DataLoader for training and testing
BatchSize = 16
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, shuffle=True,batch_size=BatchSize)
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, shuffle=True,batch_size=BatchSize)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [11]:
# Set up the optimizer and loss function
optimizer = torch.optim.AdamW(MarShpeare.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

In [12]:
# Training loop
epochs = 7 #5 epochs seem to work best
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use nvidia cuda 12.1 -> 12.2 is not supported yet!
MarShpeare.to(device)
MarShpeare.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)
        optimizer.zero_grad()
        outputs = MarShpeare(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_train_loss}")


Epoch 1/7, Average Training Loss: 0.32499487718694675
Epoch 2/7, Average Training Loss: 0.3074251179118783
Epoch 3/7, Average Training Loss: 0.2864092650617846
Epoch 4/7, Average Training Loss: 0.2979120805939741
Epoch 5/7, Average Training Loss: 0.2911657956210491
Epoch 6/7, Average Training Loss: 0.2927288159922762
Epoch 7/7, Average Training Loss: 0.2962071697795327


In [13]:
MarShpeare.eval()
preds = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch[0].to(device)
        labels = batch[1].to(device)

        outputs = MarShpeare(input_ids)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        
        preds.extend(predicted_labels.tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, preds)
print(f"Test Accuracy: {accuracy}")
f1 = f1_score(true_labels, preds, average='macro') 
print(f"F1 score: {f1}")
 

Test Accuracy: 0.5088393543428132
F1 score: 0.337238920020377


In [None]:
fpr, tpr, thresholds = roc_curve(true_labels, preds, pos_label=2)
AUC = auc(fpr, tpr)
AUC

Exporting the model

In [None]:
SavePath='c:/temp/MarSpheare'
MarShpeare.save_pretrained('c:/temp/MarSpheare')
tokenizer.save_pretrained('c:/temp/MarSpheare')

<p2>Evaluating the model</p2>

In [14]:
TestPath='./Corpus/Unseen/'
CleanTestPath='./Corpus/Unseen/Cleaned/'


#cleanFilesinPath(TestPath,CleanTestPath)

In [19]:
testCorpus=pd.DataFrame()

testCorpus=importPlaysinCorpus(CleanTestPath,'Shakespeare')
testCorpus['sentence_source']=testCorpus['sentence_source'].str.lower()

 importing  ./Corpus/Unseen/Cleaned/HenrVIPart1Cleaned.txt to corpus
 importing  ./Corpus/Unseen/Cleaned/HenrVIPart3Cleaned.txt to corpus
 importing  ./Corpus/Unseen/Cleaned/HenrVIPart2Cleaned.txt to corpus


In [20]:
testCorpus.sample(20)

Unnamed: 0,sentence_source,author,play
23,"but is your grace dead, my lord of somerset?",Shakespeare,HenrVIPart3
2560,you should leave me at the white hart in south...,Shakespeare,HenrVIPart2
395,"begin your suits anew, and sue to him.",Shakespeare,HenrVIPart2
2332,tends to god's glory and my country's weal.,Shakespeare,HenrVIPart1
394,"under the wings of our protector’s grace,",Shakespeare,HenrVIPart2
2878,"clarence, excuse me to the king my brother.",Shakespeare,HenrVIPart3
2716,and we are grac'd with wreaths of victory.,Shakespeare,HenrVIPart3
2846,marriage is a matter of more worth,Shakespeare,HenrVIPart1
1988,and am louted by a traitor villain,Shakespeare,HenrVIPart1
1015,him so much that he is drunk; and he enters wi...,Shakespeare,HenrVIPart2


In [23]:

X = [str(i) for i in testCorpus['sentence_source'].values]       # comma separate the sentences in the pandas column 'sentence_source'
model = torch.load(SavePath)
tokenizer = BertTokenizer.from_pretrained(model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use nvidia cuda 12.1 -> 12.2 is not supported yet!

tokenized_text = tokenizer(X, padding=True, truncation=True, max_length=100, return_tensors='pt')
batch_size = 16
unseen_data_loader = torch.utils.data.DataLoader(tokenized_text['input_ids'], batch_size=batch_size)



PermissionError: [Errno 13] Permission denied: 'c:/temp/MarSpheare'

In [None]:

model.eval()
predicted_authors = []
with torch.no_grad():
    for batch in unseen_data_loader:
        batch = batch.to('cuda' if torch.cuda.is_available() else 'cpu')
        outputs = model(batch)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        predicted_authors.extend(predicted_labels.tolist())
