## EECS 731 Project 2: Classification
### by Matthew Taylor

### Import required modules

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

### Import Shakespeare dataset

In [2]:
df = pd.read_csv('data/Shakespeare_data.csv')

In [3]:
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


### Data Cleaning

In [4]:
df = df.drop(columns="Dataline")
df = df[df.Player.notnull()]
df = df[df.ActSceneLine.notnull()]
df = df.reset_index(drop=True)

### Feature Engineering

In [5]:
# Categorize Plays, give them integer IDs
unique_plays = list(set(df.Play.values))
play_id_column = [unique_plays.index(x) for x in df.Play.values]

# Categorize Players, give them integer IDs
unique_players = list(set(df.Player.values))
player_id_column = [unique_players.index(x) for x in df.Player.values]

# Parse Act, Scene, and Line numbers
act_scene_line_column = list(df.ActSceneLine)
act_column = [x.split('.')[0] for x in act_scene_line_column]
scene_column = [x.split('.')[1] for x in act_scene_line_column]
line_column = [x.split('.')[2] for x in act_scene_line_column]

# Add new columns to dataframe
df.insert(1, "Play_ID", play_id_column, True)
df.insert(4, "Act", act_column, True)
df.insert(5, "Scene", scene_column, True)
df.insert(6, "Line", line_column, True)
df.insert(8, "Player_ID", player_id_column, True)

# Ensure expected values were added to the dataframe'
df.head()

Unnamed: 0,Play,Play_ID,PlayerLinenumber,ActSceneLine,Act,Scene,Line,Player,Player_ID,PlayerLine
0,Henry IV,26,1.0,1.1.1,1,1,1,KING HENRY IV,681,"So shaken as we are, so wan with care,"
1,Henry IV,26,1.0,1.1.2,1,1,2,KING HENRY IV,681,"Find we a time for frighted peace to pant,"
2,Henry IV,26,1.0,1.1.3,1,1,3,KING HENRY IV,681,And breathe short-winded accents of new broils
3,Henry IV,26,1.0,1.1.4,1,1,4,KING HENRY IV,681,To be commenced in strands afar remote.
4,Henry IV,26,1.0,1.1.5,1,1,5,KING HENRY IV,681,No more the thirsty entrance of this soil


### Relevant Statistics

In [6]:
print('Number of Players:', len(unique_players))
print('Expected accuracy if guessing:', 1/len(unique_players))

Number of Players: 934
Expected accuracy if guessing: 0.0010706638115631692


### Prepare Data for Training and Testing

In [7]:
# Use the name of the play, act, scene, and line as inputs
# Use player as output

# It's worth noting the line spoken is not being used as an input.
# This would require some sort of encoding, like vectorization.
# This process would dramatically increase the complexity of these models
# and as we will see, without including the line spoken, we're still able
# to produce rather impressive results.

inputs = (df[['Play_ID', 'Act', 'Scene', 'Line']]).to_numpy()
outputs = (df[['Player_ID']]).to_numpy()

# Split data into 80% training and 20% testing

x_train, x_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.2, random_state=0)
y_train = y_train.ravel()

### Create Model

In [8]:
rfc = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=1)

### Train and Test

In [None]:
rfc.fit(x_train, y_train)
rfc_score = rfc.score(x_test, y_test)
print('Random forest accuracy:', rfc_score)