In [None]:
import re
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import sklearn
# Submodules are imported explicitly: a bare `import sklearn` does not
# guarantee that `sklearn.svm`, `sklearn.metrics`, etc. are loaded.
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.model_selection
import sklearn.svm
from bs4 import BeautifulSoup

In [None]:
# Hardcoded variables (tune these in one place)

# Root folder whose immediate sub-directories each hold the .txt notes of one speciality
DATA_DIR = Path('dataset/train-data')
# is_dir() rather than exists(): the cells below list this path's children,
# so a regular file with the same name must be rejected too.
assert DATA_DIR.is_dir(), 'Training data directory not found: {}'.format(DATA_DIR.resolve())

# Fraction of the labelled files held out for validation
VAL_SPLIT = 0.25

In [None]:
# First off we just locate all the notes files and organise them by speciality.
# Each immediate sub-directory of DATA_DIR is treated as one speciality (class).

# Directory listing order is filesystem dependent, so sort to keep the
# speciality -> label mapping identical across machines and runs. Non-directory
# entries are skipped so stray files do not turn into spurious classes.
speciality_dirs = sorted(p for p in DATA_DIR.iterdir() if p.is_dir())
speciality_names = [x.name for x in speciality_dirs]
speciality_to_label = {x: i for i, x in enumerate(speciality_names)}
label_to_speciality = {v: k for k, v in speciality_to_label.items()}

# Compile list of [path, label] pairs
dataset = []
for sp_name, sp_dir in zip(speciality_names, speciality_dirs):
    # Sorted so that a seeded train/validation split sees the files in a stable order
    for filepath in sorted(sp_dir.glob('*.txt')):
        dataset.append([filepath, speciality_to_label[sp_name]])

# Fail here with a clear message instead of an obscure unpacking error downstream
assert dataset, 'No .txt files found under {}'.format(DATA_DIR)

pprint(dataset[:5])

In [None]:
# Hold out a stratified validation set from the labelled file list

# Transpose the [path, label] pairs into two parallel tuples
filepaths, labels = tuple(zip(*dataset))

filepaths_train, filepaths_val, labels_train, labels_val = sklearn.model_selection.train_test_split(
    filepaths,
    labels,
    test_size=VAL_SPLIT,
    stratify=labels,  # keep the class proportions the same in both subsets
    random_state=42,
)

# Sanity check: one label per training file
assert len(labels_train) == len(filepaths_train)
print(len(filepaths_train))
pprint(filepaths_train[:5])

In [None]:
# Define text preprocessing function here

def preprocess(text):
    """Normalise a raw note into lower-case, space-separated word tokens.

    Steps: strip HTML markup, replace every non-word character with a space,
    collapse runs of whitespace, trim, and lower-case.

    Args:
        text: Raw document contents (may contain HTML tags).

    Returns:
        The cleaned string, with tokens separated by single spaces and no
        leading or trailing whitespace.
    """
    # Remove html tags. The separator keeps text from adjacent elements apart
    # (e.g. 'a<br>b' yields 'a b' rather than 'ab').
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text(separator=' ')

    # Replace non-word chars with a space instead of deleting them, so that
    # 'chest/abdomen' or 'pain,fever' stay two tokens rather than fusing.
    text = re.sub(r'[^\w\s]', ' ', text)

    # Collapse whitespace last so the substitution above cannot leave double
    # or trailing spaces behind.
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()
    

In [None]:
# Load the contents of every file and assemble the train / validation frames

def create_df(filepaths, labels):
    """Read each file and return a DataFrame describing the documents.

    Args:
        filepaths: Sequence of paths to the text files to read.
        labels: Sequence of labels, parallel to ``filepaths``.

    Returns:
        A DataFrame with columns ``filepath``, ``label``, ``raw_text`` (file
        contents as read) and ``processed_text`` (``preprocess`` applied to
        ``raw_text``).
    """
    def read_note(path):
        # errors='ignore' skips bytes that are not valid UTF-8; needed to cope
        # with non-ascii characters in the notes
        with open(path, 'r', encoding="utf8", errors='ignore') as handle:
            return handle.read()

    raw_texts = [read_note(path) for path in filepaths]
    processed_texts = list(map(preprocess, raw_texts))

    columns = {
        'filepath': filepaths,
        'label': labels,
        'raw_text': raw_texts,
        'processed_text': processed_texts,
    }
    return pd.DataFrame(columns)

train_df = create_df(filepaths_train, labels_train)
val_df = create_df(filepaths_val, labels_val)

In [None]:
# Preview the first five rows of the training frame
train_df.head(n=5)

In [None]:
# Preview the first five rows of the validation frame
val_df.head(n=5)

In [None]:
# Create BOW feature vector for each doc
# (the vocabulary is fitted on the training set only, then applied to validation)
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(stop_words='english')
X_train = count_vectorizer.fit_transform(train_df['processed_text'])
X_val = count_vectorizer.transform(val_df['processed_text'])

# Train linear SVM
# random_state pins the solver's internal data shuffling so the reported
# accuracy is reproducible from run to run.
model = sklearn.svm.LinearSVC(max_iter=1000, C=0.001, random_state=42)
model.fit(X_train, train_df['label'])

# Eval predictions on val set
y_pred = model.predict(X_val)
acc = sklearn.metrics.accuracy_score(val_df['label'], y_pred)

print('Accuracy = {:f}'.format(acc))

In [None]:
# Try again, but with TF-IDF features
# NOTE: this cell rebinds count_vectorizer, X_train, X_val and model, so any
# later cell that reads them sees the TF-IDF versions once this cell has run.

# Despite its name, count_vectorizer holds a TfidfVectorizer from here on
# (name kept so existing references keep working).
count_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english')
X_train = count_vectorizer.fit_transform(train_df['processed_text'])
X_val = count_vectorizer.transform(val_df['processed_text'])

# random_state pins the solver's internal data shuffling so the reported
# accuracy is reproducible from run to run.
model = sklearn.svm.LinearSVC(max_iter=1000, C=1, random_state=42)
model.fit(X_train, train_df['label'])

y_pred = model.predict(X_val)
acc = sklearn.metrics.accuracy_score(val_df['label'], y_pred)

print('Accuracy = {:f}'.format(acc))

In [None]:
# Confusion matrix of `model` on the validation set, normalised over the true
# labels (each row sums to 1), with speciality names as tick labels.
_cm_kwargs = dict(normalize='true', display_labels=speciality_names, xticks_rotation='vertical')
if hasattr(sklearn.metrics.ConfusionMatrixDisplay, 'from_estimator'):
    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
    # 1.2; from_estimator is its replacement and takes the same arguments.
    cm = sklearn.metrics.ConfusionMatrixDisplay.from_estimator(model, X_val, val_df['label'], **_cm_kwargs)
else:
    # scikit-learn < 1.0 only provides the legacy helper
    cm = sklearn.metrics.plot_confusion_matrix(model, X_val, val_df['label'], **_cm_kwargs)