# Harry Potter sentence classifier

## Data preparation

### Load books from files

In [None]:
# Book titles in series order; the 1-based position is the book number.
_titles = (
    "The Philosopher's Stone",
    'The Chamber of Secrets',
    'The Prisoner of Azkaban',
    'The Goblet of Fire',
    'The Order of the Phoenix',
    'The Half Blood Prince',
    'The Deathly Hallows',
)

# Map book number -> name of the text file holding that book.
filenames = {
    number: f'Book {number} - {title}.txt'
    for number, title in enumerate(_titles, start=1)
}

In [None]:
# Read every book into memory as one string, keyed by book number.
books = {}
# The loop variable is `book_id` rather than `id` so the builtin `id()` is not
# shadowed for the rest of the notebook session.
for book_id, filename in filenames.items():
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the text files are UTF-8 — confirm.
    with open('books/' + filename, 'r', encoding='utf-8') as file:
        # Line breaks are dropped without inserting a space.
        # NOTE(review): words at line ends stay separated only if the lines
        # end in whitespace — confirm against the files.
        book = file.read().replace('\n', '')
    books[book_id] = book

### Remove footers from each page

In [None]:
import re
# Regular expression matching the page footer of each book, keyed by book number.
# `\d+` matches page numbers of any length (a bare `\d` only matched pages 0-9,
# leaving the footers of every later page in the text), and the dots in
# "J.K." are escaped so they match literal periods only.
footers = {
    1: r'Page \| \d+ Harry Potter and the Philosophers Stone - J\.K\. Rowling',
    2: r'Page \| \d+ Harry Potter and the Chamber of Secrets - J\.K\. Rowling',
    3: r'Page \| \d+ Harry Potter and the Prisoner of Azkaban - J\.K\. Rowling',
    4: r'Page \| \d+ Harry Potter and the Goblet of Fire - J\.K\. Rowling',
    5: r'Page \| \d+ Harry Potter and the Order of the Phoenix - J\.K\. Rowling',
    6: r'Page \| \d+ Harry Potter and the Half Blood Prince - J\.K\. Rowling',
    7: r'Page \| \d+ Harry Potter and the Deathly Hallows - J\.K\. Rowling',
}

In [None]:
# Delete every occurrence of a book's footer pattern from that book's text.
# Only existing keys are reassigned, so the dict can be updated while looping.
for book_id in books:
    footer_pattern = footers[book_id]
    books[book_id] = re.sub(footer_pattern, '', books[book_id])

### Convert books to sentences

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# Split each book's text into a list of sentences, keyed by book number.
sentences = {
    book_id: sent_tokenize(text)
    for book_id, text in books.items()
}

### Count sentences

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart: one bar per book, bar height = number of tokenized sentences.
sentence_counts = list(map(len, sentences.values()))
plt.bar(sentences.keys(), sentence_counts)

### Prepare dataset

In [None]:
import mlflow

In [None]:
# Select the MLflow experiment used for this notebook's run.
mlflow.set_experiment('harry-potter-sentence-classifier')

In [None]:
# Start an MLflow run; it is closed by the mlflow.end_run() call in the last cell.
mlflow.start_run()

In [None]:
# Seed shared by the per-book shuffle, the train/test split and the MLP,
# so that results are reproducible.
SEED = 42

In [None]:
# Maximum number of sentences taken from each book when building the dataset.
sample_size = 5000
# Record the value on the MLflow run.
mlflow.log_param("sample_size", sample_size)

In [None]:
import random

# Build the dataset: X holds the sampled sentences, y the number of the book
# each sentence came from. At most `sample_size` sentences are taken per book.
X, y = [], []
for book, book_sentences in sentences.items():
    # Shuffle a copy so `sentences` itself is left untouched. Shuffling in
    # place made re-running this cell reshuffle already-shuffled lists and
    # silently select a different sample each time.
    shuffled = list(book_sentences)
    # A fresh Random(SEED) per book keeps the permutation reproducible.
    random.Random(SEED).shuffle(shuffled)
    selected = shuffled[:sample_size]
    X += selected
    y += [book] * len(selected)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the sampled sentences for testing; the split is seeded.
split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)
X_train, X_test, y_train, y_test = split

## Training

In [None]:
# Settings passed to the TF-IDF vectorizer.
max_features = 300
strip_accents = 'unicode'

# Record both settings on the MLflow run.
mlflow.log_param("max_features", max_features)
mlflow.log_param("strip_accents", strip_accents)

In [None]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; passed to the TF-IDF vectorizer.
stop_words = stopwords.words('english')
# Only a flag is logged: the run records that stop words were used, not the list.
mlflow.log_param("stop_words", True)

In [None]:
# Settings passed to the MLP classifier.
max_iter = 1000
hidden_layer_sizes = (100,)
early_stopping = True

# Record the classifier settings on the MLflow run.
mlflow.log_param("max_iter", max_iter)
mlflow.log_param("hidden_layer_sizes", hidden_layer_sizes)
mlflow.log_param("early_stopping", early_stopping)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

# Step 1: turn each sentence into a TF-IDF feature vector.
tfidf_step = TfidfVectorizer(
    max_features=max_features,
    stop_words=stop_words,
    strip_accents=strip_accents,
)

# Step 2: multi-layer perceptron predicting the book number.
mlp_step = MLPClassifier(
    random_state=SEED,
    max_iter=max_iter,
    hidden_layer_sizes=hidden_layer_sizes,
    early_stopping=early_stopping,
)

# Chain both steps so raw sentences can be passed straight to fit/predict.
clf = Pipeline([
    ('tfidf', tfidf_step),
    ('mlp', mlp_step),
])

In [None]:
# Fit the TF-IDF + MLP pipeline on the training sentences and their book labels.
clf.fit(X_train, y_train)

In [None]:
# Predict on the training set itself to measure training accuracy.
# `y_pred` is overwritten with the test predictions in the Testing section.
y_pred = clf.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the predictions on the training set.
train_acc = accuracy_score(y_train, y_pred)
# Bare expression so the notebook displays the value.
train_acc

In [None]:
# Record the training accuracy on the MLflow run.
mlflow.log_metric("train_acc", train_acc)

### Testing

In [None]:
# Predict book labels for the held-out test sentences (replaces the training predictions).
y_pred = clf.predict(X_test)

In [None]:
# Accuracy of the predictions on the held-out test set.
test_acc = accuracy_score(y_test, y_pred)
# Bare expression so the notebook displays the value.
test_acc

In [None]:
# Record the test accuracy on the MLflow run.
mlflow.log_metric("test_acc", test_acc)

In [None]:
# Close the MLflow run started earlier in the notebook.
mlflow.end_run()