In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

In [2]:
shakespeare_data_url = "https://raw.githubusercontent.com/nishilp/Datascience02/master/data/Shakespeare_data.csv"

In [3]:
# Loading the data into pandas dataframe

full_data = pd.read_csv(shakespeare_data_url)

In [4]:
# Verifying the pandas dataframe

full_data.head(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [5]:
# Checking for duplicate rows

sum(full_data.duplicated())

# No duplicate found

0

In [6]:

# Check total number of rows and columns. This frame has 5320 rows and 14 columns

full_data.shape

# This dataset has 111396 rows and 6 columns


(111396, 6)

In [7]:
# Check rows with missing attribute values

full_data.isnull().sum(axis=0)

# Interesting revelation here is that attribute "ActSceneLine" has 6243 missing values

Dataline               0
Play                   0
PlayerLinenumber       3
ActSceneLine        6243
Player                 7
PlayerLine             0
dtype: int64

In [8]:
# Almost all "PlayerLine" values are unique, so its better to drop it from our dataframe to simplify classification

full_data = full_data.drop(columns=['PlayerLine'])
full_data = full_data.drop(columns=['Dataline'])
# full_data = full_data.drop(columns=['ActSceneLine'])

In [9]:
# Dropping rows with null values 

full_data = full_data.dropna()

In [11]:
# Performing feature engineering on "ActSceneLine", taking the only "Act" & "Scene", ignoring "Line"

full_data.ActSceneLine = full_data.ActSceneLine.str.slice(0, 3)

In [12]:
full_data.head(5)

Unnamed: 0,Play,PlayerLinenumber,ActSceneLine,Player
3,Henry IV,1.0,1.1,KING HENRY IV
4,Henry IV,1.0,1.1,KING HENRY IV
5,Henry IV,1.0,1.1,KING HENRY IV
6,Henry IV,1.0,1.1,KING HENRY IV
7,Henry IV,1.0,1.1,KING HENRY IV


In [14]:
# Converting "ActSceneLine" from string to "float"

full_data['ActSceneLine'] = full_data['ActSceneLine'].convert_objects(convert_numeric=True)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


In [17]:
# We need to convert categorical features to numerical ones
# Using LabelEncoder on our target feature

from sklearn.preprocessing import LabelEncoder

number = LabelEncoder()
full_data['Player'] = number.fit_transform(full_data['Player'].astype('str'))
# full_data['PlayerLine'] = number.fit_transform(full_data['PlayerLine'].astype('str'))

In [18]:
# Label encoded target "Player"

full_data.head(5)

Unnamed: 0,Play,PlayerLinenumber,ActSceneLine,Player
3,Henry IV,1.0,1.1,457
4,Henry IV,1.0,1.1,457
5,Henry IV,1.0,1.1,457
6,Henry IV,1.0,1.1,457
7,Henry IV,1.0,1.1,457


In [20]:
# Onehot encoding on feature "Play"

full_data = pd.get_dummies(full_data, drop_first=True)

In [21]:
# Verifying the data after onehot encoding of categorical features (in this case "Play")

full_data.head(5)

Unnamed: 0,PlayerLinenumber,ActSceneLine,Player,Play_A Midsummer nights dream,Play_A Winters Tale,Play_Alls well that ends well,Play_Antony and Cleopatra,Play_As you like it,Play_Coriolanus,Play_Cymbeline,...,Play_Richard III,Play_Romeo and Juliet,Play_Taming of the Shrew,Play_The Tempest,Play_Timon of Athens,Play_Titus Andronicus,Play_Troilus and Cressida,Play_Twelfth Night,Play_Two Gentlemen of Verona,Play_macbeth
3,1.0,1.1,457,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,1.1,457,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1.0,1.1,457,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1.0,1.1,457,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,1.1,457,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Splitting the data into training and testing sets using sklearn module "train_test_split"

X = full_data.drop(columns=['Player'])
y = full_data['Player']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [24]:
# Classification Model : Logistic Regression

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(logreg.score(X_test, y_test)))

Accuracy of Logistic regression classifier on training set: 0.24
Accuracy of Logistic regression classifier on test set: 0.25


In [23]:
# Classification Model : Decision Trees

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.84


In [25]:
# Classification Model : K-nearest neighbors

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))



Accuracy of K-NN classifier on training set: 0.79
Accuracy of K-NN classifier on test set: 0.68


In [26]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(X_train, y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
     .format(lda.score(X_test, y_test)))

Accuracy of LDA classifier on training set: 0.22
Accuracy of LDA classifier on test set: 0.22


In [27]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(X_test, y_test)))

Accuracy of GNB classifier on training set: 0.24
Accuracy of GNB classifier on test set: 0.23
