Natalie LaLuzerne

Imports

In [1]:
import pandas as pd
import string
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

Constants

In [2]:
data_directory = r'..\Data\Shakespeare_data.csv'

Read in data set

In [3]:
data = pd.read_csv( data_directory )

Remove the Dataline column from the data set

In [4]:
data = data.drop( columns = [ 'Dataline' ] )

Remove lines of the data set where PlayerLine, ActSceneLine, and Player are NaN

In [5]:
data = data.dropna()

Split the ActSceneLine column into three separate columns: Act, Scene, Line

In [6]:
ASL = data[ 'ActSceneLine' ].str.split( '.', expand = True )
data[ 'Act' ] = ASL[ 0 ]
data[ 'Scene' ] = ASL[ 1 ]
data[ 'Line' ] = ASL[ 2 ]
data = data.drop( columns = [ 'ActSceneLine' ] )

Convert the names of the Plays to numbers

In [7]:
plays = pd.Series( data[ 'Play' ], dtype = 'category' )
playNums = plays.cat.codes
data = data.drop( columns = [ 'Play' ] )
data.insert( len( data.columns ), 'Play', playNums, True )

Convert the names of the Players to numbers

In [8]:
players = pd.Series( data[ 'Player' ], dtype = 'category' )
playerNums = players.cat.codes
data = data.drop( columns = [ 'Player' ] )
data.insert( len( data.columns ), 'Player', playerNums, True )

Remove all punctuation from the PlayerLine Column

In [9]:
pfree_lines = []
[ pfree_lines.append( line.translate( str.maketrans( '','', string.punctuation ) ) ) for line in data[ 'PlayerLine' ] ]
data = data.drop( columns = [ 'PlayerLine' ] )
data.insert( len( data.columns ), 'PlayerLine', pfree_lines, True )

Select a random sample from the dataset to use in the classifier

In [10]:
data = data.sample( frac = 0.45, replace = True )

Transform the strings of player lines to floats for classification

In [11]:
labelEncoder = preprocessing.LabelEncoder()
data[ 'PlayerLine' ] = labelEncoder.fit_transform( data[ 'PlayerLine' ] )

Create features and labels

In [12]:
x = data[ [ 'PlayerLinenumber', 'Act', 'Scene', 'Line', 'Play', 'PlayerLine' ] ]
y = data[ 'Player' ]

Split the data set into training and test sets

In [13]:
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size = 0.2 )

Create a Random Forest Classifier

In [14]:
rfc = RandomForestClassifier( n_estimators = 100 )

Train the Random Forest Classifier and compute results

In [15]:
rfc.fit( x_train, y_train )
y_predict = rfc.predict( x_test )
accuracy = metrics.accuracy_score( y_test, y_predict ) * 100
print( 'Testing Accuracy: {0:.1f}%'.format( accuracy ) )

Testing Accuracy: 76.9%
