# Aviva Natural Language Processing SGH Talk

In [None]:
from collections import Counter

from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from pprint import pprint
from time import time


# Enlarge matplotlib's default figure size and the tick-label font size
plt.rcParams.update({
    'figure.figsize': [35, 25],
    'xtick.labelsize': 30,
    'ytick.labelsize': 30,
})

Load the data first. The 20 Newsgroups dataset is conveniently available through the scikit-learn library.

In [None]:
# Fetch the predefined train and test subsets of the 20 Newsgroups dataset
news_train = fetch_20newsgroups(subset='train')
news_test = fetch_20newsgroups(subset='test')

Define helper functions to 

1. Get top tokens from a Multinomial NB model.

2. Compute macro-averaged metrics.

In [None]:
def get_top_features(vect, clf, n=10, k=0):
    """
    Lists top 'n' discriminant words in class.

    :param vect: Fitted instance of a vectoriser used in feature extraction,
        e.g. CountVectorizer, TfidfVectorizer, etc. It must expose either
        ``get_feature_names_out()`` or the older ``get_feature_names()``.
    :param clf: Fitted instance of a linear classifier, e.g. MultinomialNB.
        Per-class feature weights are read from ``coef_`` when the classifier
        has it, otherwise from ``feature_log_prob_``.
    :param n: How many top features are to be returned.
    :param k: Class index.

    :return: A list of at most n words, highest weight first.
    """
    # Recent scikit-learn releases only provide get_feature_names_out();
    # older ones only provide get_feature_names().
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    # Naive Bayes models in recent scikit-learn releases no longer expose
    # coef_; their per-class log-probabilities play the same role here.
    weights = getattr(clf, 'coef_', None)
    if weights is None:
        weights = clf.feature_log_prob_

    # Reverse first and truncate afterwards: slicing with [-n:] would return
    # every feature when n == 0.
    top_n = np.argsort(weights[k])[::-1][:n]

    return [str(feature_names[i]) for i in top_n]

def compute_metrics(actual_y, pred_y):
    """
    Scores predictions against the true labels.

    :param actual_y: True class labels.
    :param pred_y: Predicted class labels.

    :return: Tuple of (macro-averaged precision, accuracy,
        macro-averaged F1-score).
    """
    macro = {'average': 'macro'}

    return (
        metrics.precision_score(actual_y, pred_y, **macro),
        metrics.accuracy_score(actual_y, pred_y),
        metrics.f1_score(actual_y, pred_y, **macro),
    )

What does our data look like?

In [None]:
# Show the label and the raw text of the training document at index 3
sample_label = news_train.target_names[news_train.target[3]]
print(f"Label: {sample_label}\n\n")
print(news_train.data[3])

Are the classes balanced?

In [None]:
# Count training documents per class and order the classes by ascending count
train_counts = dict(
    sorted(Counter(news_train.target).items(), key=lambda item: item[1])
)

# Newsgroup names in the same order as the sorted counts
labels = [news_train.target_names[key] for key in train_counts]

# One horizontal bar per class
fig, ax = plt.subplots()
ax.barh(range(len(train_counts)), list(train_counts.values()), 0.75, align='center')
ax.set_yticks(np.arange(len(train_counts)))
ax.set_yticklabels(labels)
plt.show()

## Model 1 - CountVectorizer

In [None]:
# Define the Vectorizer: plain token counts
vect = CountVectorizer()

# Learn the vocabulary on the training data, then reuse it for the test data
train_x = vect.fit_transform(news_train.data)
train_y = news_train.target

test_x = vect.transform(news_test.data)
test_y = news_test.target

print("CountVectorizer")
print("="*80)
print("Summary:")
print(f"Training set: {len(news_train.target)}")
print(f" Testing set: {len(news_test.target)}")
# vocabulary_ holds one entry per extracted feature and is available in all
# scikit-learn versions, unlike get_feature_names()
print(f"Extracted {len(vect.vocabulary_)} features.")

# Define a classifier
nb = MultinomialNB(alpha=0.1)

# Fit the classifier
nb.fit(train_x, train_y)

# Make predictions on the test set
pred_y = nb.predict(test_x)

# Summary of the scores; kept as *_1 so the next model can compare against them
print("="*80)
print("Scores:")
prec_1, acc_1, f1_1 = compute_metrics(test_y, pred_y)
print(f"Precision: {prec_1:.3f}")
print(f" Accuracy: {acc_1:.3f}")
print(f" F1-Score: {f1_1:.3f}")

## Model 2 - TfidfVectorizer

In [None]:
# Define the Vectorizer: TF-IDF weighted token counts
vect = TfidfVectorizer()

# Learn the vocabulary on the training data, then reuse it for the test data
train_x = vect.fit_transform(news_train.data)
train_y = news_train.target

test_x = vect.transform(news_test.data)
test_y = news_test.target

# The banner names the vectoriser actually used by this model
print("TfidfVectorizer")
print("="*80)
print("Summary:")
print(f"Training set: {len(news_train.target)}")
print(f" Testing set: {len(news_test.target)}")
# vocabulary_ holds one entry per extracted feature and is available in all
# scikit-learn versions, unlike get_feature_names()
print(f"Extracted {len(vect.vocabulary_)} features.")

# Define a classifier
nb = MultinomialNB(alpha=0.1)

# Fit the classifier
nb.fit(train_x, train_y)

# Make predictions on the test set
pred_y = nb.predict(test_x)

# Summary of the scores, with Model 1's scores in parentheses for comparison
print("="*80)
print("Scores:")
prec_2, acc_2, f1_2 = compute_metrics(test_y, pred_y)
print(f"Precision: {prec_2:.3f} ({prec_1:.3f})")
print(f" Accuracy: {acc_2:.3f} ({acc_1:.3f})")
print(f" F1-Score: {f1_2:.3f} ({f1_1:.3f})")

In [None]:
# For every newsgroup, list the ten highest-weighted tokens of the current model
print("Top words per class")
for k, cat in enumerate(news_train.target_names):
    top_words = get_top_features(vect, nb, n=10, k=k)
    print(f"{cat:<25}", top_words)

## Model 3 - TfidfVectorizer + stopwords

The words above are not very informative -- in NLP they're called *stopwords*. The nltk (Natural Language Toolkit) library has a list of English stopwords. Removing them from the BoW document representation can improve accuracy and provide more informative "top words".

In [None]:
# Peek at the first ten entries of NLTK's English stopword list
english_stopwords = stopwords.words('english')
print(english_stopwords[:10])

In [None]:
# Define the Vectorizer -- this time with stopwords
vect = TfidfVectorizer(
    stop_words=stopwords.words('english')
)

# Learn the vocabulary on the training data, then reuse it for the test data
train_x = vect.fit_transform(news_train.data)
train_y = news_train.target

test_x = vect.transform(news_test.data)
test_y = news_test.target

# The banner names the feature extraction actually used by this model
print("TfidfVectorizer + stopwords")
print("="*80)
print("Summary:")
print(f"Training set: {len(news_train.target)}")
print(f" Testing set: {len(news_test.target)}")
# vocabulary_ holds one entry per extracted feature and is available in all
# scikit-learn versions, unlike get_feature_names()
print(f"Extracted {len(vect.vocabulary_)} features.")

# Define a classifier
nb = MultinomialNB(alpha=0.1)

# Fit the classifier
nb.fit(train_x, train_y)

# Make predictions on the test set
pred_y = nb.predict(test_x)

# Summary of the scores, with Model 2's scores in parentheses for comparison
print("="*80)
print("Scores:")
prec_3, acc_3, f1_3 = compute_metrics(test_y, pred_y)
print(f"Precision: {prec_3:.3f} ({prec_2:.3f})")
print(f" Accuracy: {acc_3:.3f} ({acc_2:.3f})")
print(f" F1-Score: {f1_3:.3f} ({f1_2:.3f})")

In [None]:
# Repeat the per-class top-token listing for the stopword-filtered model
print("Top words per class")
for k, cat in enumerate(news_train.target_names):
    top_words = get_top_features(vect, nb, n=10, k=k)
    print(f"{cat:<25}", top_words)

There's still a lot of noise: .edu, .com are of course parts of domains. Keith, Leveish are names. The model is actually overfitting to posters' email addresses!

So -- back to the drawing board...

## Model 4 - TfidfVectorizer + stopwords + cleaner data

In [None]:
# Print the training document at index 3 again, together with its label
sample_label = news_train.target_names[news_train.target[3]]
print(f"Label: {sample_label}\n\n")
print(news_train.data[3])

Fortunately, the cleaning has already been done for us...

In [None]:
# Reload both subsets, asking the loader to remove headers and footers
news_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers'))
news_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers'))

In [None]:
# Print the document at index 3 of the reloaded training data, with its label
sample_label = news_train.target_names[news_train.target[3]]
print(f"Label: {sample_label}\n\n")
print(news_train.data[3])

In [None]:
# Define the Vectorizer: TF-IDF with stopwords, as in Model 3
vect = TfidfVectorizer(
    stop_words=stopwords.words('english'),
)

# Learn the vocabulary on the training data, then reuse it for the test data
train_x = vect.fit_transform(news_train.data)
train_y = news_train.target

test_x = vect.transform(news_test.data)
test_y = news_test.target

# The banner names the feature extraction actually used by this model
print("TfidfVectorizer + stopwords + cleaner data")
print("="*80)
print("Summary:")
print(f"Training set: {len(news_train.target)}")
print(f" Testing set: {len(news_test.target)}")
# vocabulary_ holds one entry per extracted feature and is available in all
# scikit-learn versions, unlike get_feature_names()
print(f"Extracted {len(vect.vocabulary_)} features.")

# Define a classifier
nb = MultinomialNB(alpha=0.1)

# Fit the classifier
nb.fit(train_x, train_y)

# Make predictions on the test set
pred_y = nb.predict(test_x)

# Summary of the scores, with Model 3's scores in parentheses for comparison
print("="*80)
print("Scores:")
prec_4, acc_4, f1_4 = compute_metrics(test_y, pred_y)
print(f"Precision: {prec_4:.3f} ({prec_3:.3f})")
print(f" Accuracy: {acc_4:.3f} ({acc_3:.3f})")
print(f" F1-Score: {f1_4:.3f} ({f1_3:.3f})")

In [None]:
# Repeat the per-class top-token listing for the model trained on cleaned data
print("Top words per class")
for k, cat in enumerate(news_train.target_names):
    top_words = get_top_features(vect, nb, n=10, k=k)
    print(f"{cat:<25}", top_words)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def pprint_confusion_matrix(conf_matrix, labels):
    """
    Plots a confusion matrix as a colour-coded grid with a colour bar.

    :param conf_matrix: 2-D matrix to display, e.g. the output of
        ``metrics.confusion_matrix``. Rows are labelled "True" and columns
        "Predicted".
    :param labels: Class names used as tick labels on both axes.

    :return: None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots()
    cax = ax.matshow(conf_matrix)
    fig.colorbar(cax)

    # One tick per class on each axis, labelled with the class name
    ticks = range(len(labels))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_yticklabels(labels)

    # The x-axis title goes on top, next to the tick labels drawn by matshow
    ax.set_xlabel("Predicted", fontsize=30)
    ax.xaxis.set_label_position('top')
    ax.set_ylabel("True", fontsize=30)

    plt.show()
    
    
# Build the confusion matrix from the current test_y / pred_y and plot it
conf_matrix = metrics.confusion_matrix(test_y, pred_y)
pprint_confusion_matrix(conf_matrix, news_train.target_names)