Nick von Bulow
1/14/2022

# Data Acquisition

In [55]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
# text_normalizer.py at https://github.com/dipanjanS/text-analytics-with-python/tree/master/New-Second-Edition/Ch05%20-%20Text%20Classification
import text_normalizer as tn
import matplotlib.pyplot as plt
import pandas as pd
import spacy

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [4]:
# download every newsgroup post via sklearn's helper, with headers, footers
# and quoted replies stripped from each article
data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)

# lookup table: integer class id -> newsgroup name
data_labels_map = {label_id: name for label_id, name in enumerate(data.target_names)}

In [11]:
# show the numeric label -> newsgroup name lookup built in the previous cell
data_labels_map

{0: 'alt.atheism',
 1: 'comp.graphics',
 2: 'comp.os.ms-windows.misc',
 3: 'comp.sys.ibm.pc.hardware',
 4: 'comp.sys.mac.hardware',
 5: 'comp.windows.x',
 6: 'misc.forsale',
 7: 'rec.autos',
 8: 'rec.motorcycles',
 9: 'rec.sport.baseball',
 10: 'rec.sport.hockey',
 11: 'sci.crypt',
 12: 'sci.electronics',
 13: 'sci.med',
 14: 'sci.space',
 15: 'soc.religion.christian',
 16: 'talk.politics.guns',
 17: 'talk.politics.mideast',
 18: 'talk.politics.misc',
 19: 'talk.religion.misc'}

In [23]:
# gather the raw articles together with both label representations
corpus = data.data
target_labels = data.target
target_names = [data_labels_map[label_id] for label_id in target_labels]

# one row per article: text, numeric label, human-readable label
frame_columns = {
    'Article': corpus,
    'Target Label': target_labels,
    'Target Name': target_names,
}
df = pd.DataFrame(frame_columns)

print(df.shape)
df.head(10)

(18846, 3)


Unnamed: 0,Article,Target Label,Target Name
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware
5,\n\nBack in high school I worked as a lab assi...,12,sci.electronics
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,4,comp.sys.mac.hardware
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey
9,\nIf a Christian means someone who believes in...,19,talk.religion.misc


# Data Preprocessing and Normalization

In [24]:
# count the articles that contain nothing but whitespace
is_blank = df['Article'].str.strip() == ''
total_nulls = int(is_blank.sum())
total_nulls

515

In [25]:
# keep only the articles that still have some non-whitespace content
df = df[df['Article'].str.strip() != '']
df.shape

(18331, 3)

In [27]:
# clean up text

import nltk
from tqdm.notebook import tqdm

# start from NLTK's English stopword list but keep the negations "no" and
# "not", so they are not stripped out during normalization
stopword_list = [word for word in nltk.corpus.stopwords.words('english')
                 if word not in ('no', 'not')]

# normalize every article; tqdm wraps the column to show a progress bar
norm_corpus = tn.normalize_corpus(corpus=tqdm(df['Article']), html_stripping=True,
                                  contraction_expansion=True, accented_char_removal=True,
                                  text_lower_case=True, text_lemmatization=True,
                                  text_stemming=False, special_char_removal=True,
                                  remove_digits=True, stopword_removal=True,
                                  stopwords=stopword_list)

# `df` is the result of a boolean filter at this point, so attach the new
# column with assign() (which returns a new frame) rather than writing into
# it in place, which pandas flags with SettingWithCopyWarning
df = df.assign(**{'Clean Article': norm_corpus})

df.head()

  0%|          | 0/18331 [00:00<?, ?it/s]



Unnamed: 0,Article,Target Label,Target Name,Clean Article
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,sure basher pens fan pretty confused lack kind...
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware,brother market high performance video card sup...
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast,finally say dream mediterranean new area great...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware,think scsi card dma transfer not disk scsi car...
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware,old jasmine drive not use new system understan...


In [29]:
# turn cells that are empty or whitespace-only into NaN so they can be dropped
blank_pattern = r'^(\s?)+$'
df = df.replace(to_replace=blank_pattern, value=np.nan, regex=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18331 entries, 0 to 18845
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Article        18331 non-null  object
 1   Target Label   18331 non-null  int32 
 2   Target Name    18331 non-null  object
 3   Clean Article  18301 non-null  object
dtypes: int32(1), object(3)
memory usage: 644.4+ KB


In [31]:
# discard every row that has a missing value, then renumber the index from 0
df = df.dropna()
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18301 entries, 0 to 18300
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Article        18301 non-null  object
 1   Target Label   18301 non-null  int32 
 2   Target Name    18301 non-null  object
 3   Clean Article  18301 non-null  object
dtypes: int32(1), object(3)
memory usage: 500.5+ KB


In [33]:
# Checkpoint the cleaned dataframe on disk as CSV (the row index is not written)
df.to_csv(path_or_buf='clean_newsgroups.csv', index=False)

# Checkpoint
Run from here to skip the preprocessing steps above; the next cell reloads the cleaned data from `clean_newsgroups.csv`.


In [45]:
# Reload the cleaned dataset from the CSV checkpoint.
# keep_default_na=False stops read_csv from converting text that happens to
# equal one of pandas' default NA tokens (e.g. "nan", "null", "NA", "None")
# into float NaN; every article therefore comes back as a string, which is
# what the vectorizers further down expect.
df = pd.read_csv('clean_newsgroups.csv', keep_default_na=False)

# Training

## Building Train and Test Datasets

In [46]:
# We are splitting our dataset into 67% train and 33% test

from sklearn.model_selection import train_test_split

# split the text and both label encodings in a single call so that the rows
# of all three arrays stay aligned
split_arrays = train_test_split(
    df['Clean Article'].to_numpy(),
    df['Target Label'].to_numpy(),
    df['Target Name'].to_numpy(),
    test_size=0.33,
    random_state=42,
)
(train_corpus, test_corpus,
 train_label_nums, test_label_nums,
 train_label_names, test_label_names) = split_arrays
train_corpus.shape, test_corpus.shape

((12261,), (6040,))

In [47]:
# let's take a look at how each category is distributed across the two splits

from collections import Counter

# Counter returns 0 for a key it has not seen, so a category that is missing
# from one split gets a zero count instead of raising KeyError
trd = Counter(train_label_names)
tsd = Counter(test_label_names)

# categories seen in train (in first-seen order), followed by any that only
# occur in the test split
categories = list(trd) + [name for name in tsd if name not in trd]

dataset_categories = (pd.DataFrame([[key, trd[key], tsd[key]] for key in categories],
                                   columns=['Target label', 'Train Count', 'Test Count'])
                        .sort_values(by=['Train Count', 'Test Count'], ascending=False))

# share of each category's documents that landed in the training split
dataset_categories['Train Ratio'] = dataset_categories['Train Count'] / (dataset_categories['Train Count'] + dataset_categories['Test Count'])

dataset_categories

Unnamed: 0,Target label,Train Count,Test Count,Train Ratio
6,sci.crypt,675,287,0.701663
8,comp.windows.x,669,311,0.682653
1,comp.graphics,661,292,0.693599
5,rec.motorcycles,658,311,0.679051
2,soc.religion.christian,656,318,0.673511
0,rec.sport.hockey,655,318,0.673176
14,sci.electronics,645,311,0.674686
15,comp.os.ms-windows.misc,642,304,0.678647
18,sci.med,641,319,0.667708
7,rec.sport.baseball,641,310,0.674027


## Build the text classifier

- Traditional feature (BOW, TF-IDF) representation and classification models
- Advanced feature (Word2Vec, GloVe, FastText) representation and classification models

### Bag of Words Features

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

# bag of words: fit the vectorizer on the training articles, then encode the
# test articles with that same fitted vectorizer
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)
cv_test_features = cv.transform(test_corpus)

print(
    'BOW model:> Train features shape:', cv_train_features.shape,
    ' Test features shape:', cv_test_features.shape,
)


BOW model:> Train features shape: (12261, 74133)  Test features shape: (6040, 74133)


In [57]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the bag-of-words features
mnb = MultinomialNB(alpha=1)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `mnb` being fitted)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = mnb_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)

# fit on the whole training split and score the held-out test split
mnb.fit(cv_train_features, train_label_names)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)

CV Accuracy (5-fold): [0.67835304 0.64437194 0.67862969 0.67373573 0.66680261]
Mean CV Accuracy: 0.668378600894201
Test Accuracy: 0.6809602649006623


In [58]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression on the bag-of-words features
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `lr` being fitted)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = lr_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)

# fit on the whole training split and score the held-out test split
lr.fit(cv_train_features, train_label_names)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

CV Accuracy (5-fold): [0.67509172 0.67414356 0.68311582 0.69127243 0.6721044 ]
Mean CV Accuracy: 0.6791455879506999
Test Accuracy: 0.6943708609271523


In [59]:
from sklearn.svm import LinearSVC

# linear support vector classifier on the bag-of-words features
svm = LinearSVC(penalty='l2', C=1, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `svm` being fitted)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = svm_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)

# fit on the whole training split and score the held-out test split
svm.fit(cv_train_features, train_label_names)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.63350999 0.63539967 0.65171289 0.64437194 0.64151713]
Mean CV Accuracy: 0.6413023238182896
Test Accuracy: 0.6567880794701987


In [60]:
from sklearn.linear_model import SGDClassifier

# linear SVM (hinge loss) trained with stochastic gradient descent on the
# bag-of-words features; max_iter=5 caps training at five passes over the data
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `svm_sgd` being fitted)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = svmsgd_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)

# fit on the whole training split and score the held-out test split
svm_sgd.fit(cv_train_features, train_label_names)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)

CV Accuracy (5-fold): [0.65593151 0.61745514 0.61867863 0.63662316 0.62846656]
Mean CV Accuracy: 0.6314310006922974
Test Accuracy: 0.6605960264900662


In [61]:
from sklearn.ensemble import RandomForestClassifier

# random forest with 10 trees on the bag-of-words features
rfc = RandomForestClassifier(n_estimators=10, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `rfc` being fitted)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = rfc_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)

# fit on the whole training split and score the held-out test split
rfc.fit(cv_train_features, train_label_names)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

CV Accuracy (5-fold): [0.52751733 0.50448613 0.5362969  0.51957586 0.52406199]
Mean CV Accuracy: 0.5223876413274289
Test Accuracy: 0.545364238410596


In [62]:
from sklearn.ensemble import GradientBoostingClassifier

# gradient boosting with 10 boosting stages on the bag-of-words features
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `gbc` being fitted)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = gbc_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)

# fit on the whole training split and score the held-out test split
gbc.fit(cv_train_features, train_label_names)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

CV Accuracy (5-fold): [0.54830819 0.52079935 0.55791191 0.56525285 0.55750408]
Mean CV Accuracy: 0.5499552766562766
Test Accuracy: 0.5514900662251656


### TF-IDF Features

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: fit the vectorizer on the training articles, then encode the test
# articles with that same fitted vectorizer
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)

print(
    'TFIDF model:> Train features shape:', tv_train_features.shape,
    ' Test features shape:', tv_test_features.shape,
)

TFIDF model:> Train features shape: (12261, 74133)  Test features shape: (6040, 74133)


In [65]:
# Multinomial Naive Bayes on the TF-IDF features
mnb = MultinomialNB(alpha=1)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `mnb` being fitted)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = mnb_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)

# fit on the whole training split and score the held-out test split
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

CV Accuracy (5-fold): [0.71463514 0.68719413 0.71615008 0.71288744 0.70880914]
Mean CV Accuracy: 0.7079351847356734
Test Accuracy: 0.7109271523178808


In [66]:
# L2-regularised logistic regression on the TF-IDF features
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `lr` being fitted)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = lr_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)

# fit on the whole training split and score the held-out test split
lr.fit(tv_train_features, train_label_names)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)

CV Accuracy (5-fold): [0.74317163 0.7177814  0.74877651 0.74673736 0.73572594]
Mean CV Accuracy: 0.7384385667515025
Test Accuracy: 0.7538079470198675


In [67]:
# linear support vector classifier on the TF-IDF features
svm = LinearSVC(penalty='l2', C=1, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `svm` being fitted)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = svm_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)

# fit on the whole training split and score the held-out test split
svm.fit(tv_train_features, train_label_names)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

CV Accuracy (5-fold): [0.75295556 0.7365416  0.75978793 0.7675367  0.74877651]
Mean CV Accuracy: 0.753119661046932
Test Accuracy: 0.7690397350993378


In [68]:
# linear SVM (hinge loss) trained with stochastic gradient descent on the
# TF-IDF features; max_iter=5 caps training at five passes over the data
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `svm_sgd` being fitted)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = svmsgd_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)

# fit on the whole training split and score the held-out test split
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)

CV Accuracy (5-fold): [0.7582552  0.73776509 0.76223491 0.7728385  0.75489396]
Mean CV Accuracy: 0.75719753220247
Test Accuracy: 0.7658940397350993


In [69]:
# random forest with 10 trees on the TF-IDF features
rfc = RandomForestClassifier(n_estimators=10, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `rfc` being fitted)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = rfc_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)

# fit on the whole training split and score the held-out test split
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

CV Accuracy (5-fold): [0.51447208 0.51386623 0.50040783 0.5403752  0.52243067]
Mean CV Accuracy: 0.5183104019514674
Test Accuracy: 0.5448675496688742


In [70]:
# gradient boosting with 10 boosting stages on the TF-IDF features
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits its own
# clones of the estimator, so it does not depend on `gbc` being fitted)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = gbc_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)

# fit on the whole training split and score the held-out test split
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

CV Accuracy (5-fold): [0.55156951 0.52650897 0.55995106 0.57544861 0.56035889]
Mean CV Accuracy: 0.5547674086862376
Test Accuracy: 0.552317880794702


In [71]:
# one row per model: mean CV accuracy and test accuracy, first for the
# bag-of-words (term frequency) features and then for the TF-IDF features
comparison_rows = [
    ('Naive Bayes',
     mnb_bow_cv_mean_score, mnb_bow_test_score,
     mnb_tfidf_cv_mean_score, mnb_tfidf_test_score),
    ('Logistic Regression',
     lr_bow_cv_mean_score, lr_bow_test_score,
     lr_tfidf_cv_mean_score, lr_tfidf_test_score),
    ('Linear SVM',
     svm_bow_cv_mean_score, svm_bow_test_score,
     svm_tfidf_cv_mean_score, svm_tfidf_test_score),
    ('Linear SVM (SGD)',
     svmsgd_bow_cv_mean_score, svmsgd_bow_test_score,
     svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score),
    ('Random Forest',
     rfc_bow_cv_mean_score, rfc_bow_test_score,
     rfc_tfidf_cv_mean_score, rfc_tfidf_test_score),
    ('Gradient Boosted Machines',
     gbc_bow_cv_mean_score, gbc_bow_test_score,
     gbc_tfidf_cv_mean_score, gbc_tfidf_test_score),
]
comparison_columns = ['Model', 'CV Score (TF)', 'Test Score (TF)',
                      'CV Score (TF-IDF)', 'Test Score (TF-IDF)']

# transpose so that each model becomes a column of the displayed table
pd.DataFrame(comparison_rows, columns=comparison_columns).T

Unnamed: 0,0,1,2,3,4,5
Model,Naive Bayes,Logistic Regression,Linear SVM,Linear SVM (SGD),Random Forest,Gradient Boosted Machines
CV Score (TF),0.668379,0.679146,0.641302,0.631431,0.522388,0.549955
Test Score (TF),0.68096,0.694371,0.656788,0.660596,0.545364,0.55149
CV Score (TF-IDF),0.707935,0.738439,0.75312,0.757198,0.51831,0.554767
Test Score (TF-IDF),0.710927,0.753808,0.76904,0.765894,0.544868,0.552318
