In [94]:
# preamble to be able to run notebooks in Jupyter and Colab
#
# Sets two path prefixes:
#   notes_home -- location of the notes folder
#   user_home  -- location of the user's own folder
try:
    # Only the import decides whether we are on Colab.  Keeping the rest of
    # the Colab setup out of the try block means an unrelated
    # ModuleNotFoundError raised while mounting is not mistaken for
    # "running under native Jupyter".
    from google.colab import drive
except ModuleNotFoundError:
    notes_home = "" # running native Jupyter environment -- notes home is the same as the notebook
    user_home = ""  # under Jupyter we assume the user directory is the same as the notebook
else:
    import sys

    drive.mount('/content/drive')
    notes_home = "/content/drive/Shared drives/CSC310/ds/notes/"
    user_home = "/content/drive/My Drive/"

    sys.path.insert(1,notes_home) # let the notebook access the notes folder

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# NLP & ML

We saw that we can convert text documents into a ‘vector model’ (bag-of-words).

The vector model allows us to perform mathematical analysis on documents - “which documents are similar to each other?”

> Next question: can we construct machine learning models on document collections using the vector model?

**Yes!** We can construct classifiers.


Consider again our news article data set.

We would like to construct a classifier that can correctly classify political and science documents.

We will begin with our KNN algorithm (k nearest neighbors). Since documents are considered points in an n-dimensional space, KNN seems well suited for this problem.

## Data

In [95]:
# setup
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from assets.confint import classification_confint
from assets.treeviz import tree_print

In [96]:
print("******** data **********")

# get the newsgroup database
# The path is built from notes_home (set in the preamble cell) rather than a
# hard-coded Colab path: on Colab this resolves to the same shared-drive file,
# and under native Jupyter (notes_home == "") it is read relative to the notebook.
newsgroups = pd.read_csv(notes_home + "assets/newsgroups-noheaders.csv")
newsgroups.head(n=10)

******** data **********


Unnamed: 0,text,label
0,\nIn billions of dollars (%GNP):\nyear GNP ...,space
1,ajteel@dendrite.cs.Colorado.EDU (A.J. Teel) w...,space
2,\nMy opinion is this: In a society whose econ...,space
3,"Ahhh, remember the days of Yesterday? When we...",space
4,"\n""...a la Chrysler""?? Okay kids, to the near...",space
5,"\n As for advertising -- sure, why not? A N...",politics
6,"\n What, pray tell, does this mean? Just who ...",space
7,\nWhere does the shadow come from? There's no...,politics
8,^^^^^^^^^...,politics
9,"#Yet, when a law was proposed for Virginia tha...",space


In [97]:
print("******** docarray **********")

# Porter stemmer instance, used by the stemming analyzer
stemmer = PorterStemmer()

# Configure a word-level CountVectorizer that only accepts alphabetic
# tokens ([a-zA-Z]+) and drops English stop words, then extract its
# analyzer callable so it can be reused as a plain tokenizer.
word_vectorizer = CountVectorizer(
    analyzer="word",
    stop_words='english',
    token_pattern="[a-zA-Z]+",
)
analyzer = word_vectorizer.build_analyzer()

# stemming analyzer: tokenize with the default analyzer, then stem each token
def stemmed_words(doc):
    """Return an iterator over the stems of the tokens `analyzer` extracts from `doc`."""
    tokens = analyzer(doc)
    return map(stemmer.stem, tokens)

# build docarray
vectorizer = CountVectorizer(analyzer=stemmed_words,
                             #analyzer=analyzer,
                             binary=True,
                             min_df=2) # keep only terms that occur in at least two documents
docarray = vectorizer.fit_transform(newsgroups['text']).toarray()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method so the cell still runs on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# one row per document, one 0/1 column per stemmed term
doc_df = pd.DataFrame(docarray, columns=feature_names)
doc_df.head()

******** docarray **********


Unnamed: 0,aa,abandon,abbey,abc,abil,abl,aboard,abolish,abort,abroad,absenc,absolut,absorb,absorpt,abstract,absurd,abund,abus,abyss,ac,acad,acadamia,academ,academi,academia,acceler,accept,access,accid,accident,accommod,accomod,accompani,accomplish,accord,account,accredit,accur,accuraci,accus,...,wwii,x,xavier,y,ya,yah,yale,yamada,yard,ye,yea,yeah,year,yearli,yee,yell,yellow,yeltsin,yer,yesterday,yield,yo,york,yoshiro,young,youngster,youth,ysc,yscvax,ytou,yugoslavia,yup,z,zealand,zenit,zero,zeta,zip,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


## Decision Tree

In [98]:
print("******** model **********")


# Decision Tree
# A fixed random_state makes the fitted trees reproducible: scikit-learn
# permutes the features at every split, so equally good splits (common with
# binary word features) would otherwise be chosen differently from run to run.
model = DecisionTreeClassifier(random_state=0)

# grid search over tree depth (1..29) and both split criteria
param_grid = {'max_depth': list(range(1,30)), 'criterion':['gini','entropy']}
grid = GridSearchCV(model, param_grid, cv=2, verbose=10, n_jobs=-1)
grid.fit(docarray, newsgroups['label'])
print("Grid Search: best parameters: {}".format(grid.best_params_))
# print the best tree, using doc_df to label the split features
tree_print(grid.best_estimator_,doc_df)

******** model **********
Fitting 2 folds for each of 58 candidates, totalling 116 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1519s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 116 out of 116 | elapsed:   17.4s finished


Grid Search: best parameters: {'criterion': 'gini', 'max_depth': 23}
if space =< 0.5: 
  |then if orbit =< 0.5: 
  |  |then if peopl =< 0.5: 
  |  |  |then if clinton =< 0.5: 
  |  |  |  |then if homosexu =< 0.5: 
  |  |  |  |  |then if tax =< 0.5: 
  |  |  |  |  |  |then if parti =< 0.5: 
  |  |  |  |  |  |  |then if crime =< 0.5: 
  |  |  |  |  |  |  |  |then if trial =< 0.5: 
  |  |  |  |  |  |  |  |  |then if libertarian =< 0.5: 
  |  |  |  |  |  |  |  |  |  |then if statement =< 0.5: 
  |  |  |  |  |  |  |  |  |  |  |then if liber =< 0.5: 
  |  |  |  |  |  |  |  |  |  |  |  |then if argument =< 0.5: 
  |  |  |  |  |  |  |  |  |  |  |  |  |then if u =< 0.5: 
  |  |  |  |  |  |  |  |  |  |  |  |  |  |then if presid =< 0.5: 
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |then if drug =< 0.5: 
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |then if yeah =< 0.5: 
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |then if hous =< 0.5: 
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  

In [99]:
print("******** Accuracy **********")

# Score the best estimator found by the grid search on the same documents
# it was fitted on, and report the accuracy with its confidence interval.
best_model = grid.best_estimator_
predict_y = best_model.predict(docarray)
acc = accuracy_score(y_true=newsgroups['label'], y_pred=predict_y)
n_docs = docarray.shape[0]
lb, ub = classification_confint(acc, n_docs)
accuracy_report = "Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc, lb, ub)
print(accuracy_report)

******** Accuracy **********
Accuracy: 0.92 (0.90,0.94)


In [100]:
print("******** confusion matrix **********")

# Tabulate true labels against predicted labels, with both axes ordered
# as listed in `cats`, and wrap the counts in a labelled DataFrame.
cats = ['politics','space']
cm = confusion_matrix(y_true=newsgroups['label'], y_pred=predict_y, labels=cats)
cm_df = pd.DataFrame(data=cm, index=cats, columns=cats)
matrix_report = "Confusion Matrix:\n{}".format(cm_df)
print(matrix_report)

******** confusion matrix **********
Confusion Matrix:
          politics  space
politics       578      2
space           82    376


## KNN

In [101]:
print("******** model **********")


# k-nearest-neighbors classifier with default settings
model = KNeighborsClassifier()

# coarse grid search: try k = 1, 4, 7, 10, 13
k_candidates = list(range(1,15,3))
param_grid = {'n_neighbors': k_candidates}
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    cv=2,
                    verbose=10,
                    n_jobs=-1)
grid.fit(docarray, newsgroups['label'])
print("Grid Search: best parameters: {}".format(grid.best_params_))

******** model **********
Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   29.0s finished


Grid Search: best parameters: {'n_neighbors': 4}


In [102]:
print("******** model fine tuning **********")


# KNN
model = KNeighborsClassifier()

# grid search: refine around the best k stored in the current `grid`
# NOTE: this cell overwrites `grid`, so re-running it re-centers the search
# on its own previous result.
cpoint = grid.best_params_['n_neighbors']
# n_neighbors must be >= 1; without the clamp a center of k=1 or k=2 would
# put 0 (or -1) into the grid.
lpoint = max(1, cpoint - 2)
hpoint = cpoint + 3  # exclusive upper bound, i.e. search up to cpoint+2
param_grid = {'n_neighbors': list(range(lpoint,hpoint))}
grid = GridSearchCV(model, param_grid, cv=2, verbose=10, n_jobs=-1)
grid.fit(docarray, newsgroups['label'])
print("Grid Search: best parameters: {}".format(grid.best_params_))

******** model fine tuning **********
Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   29.0s finished


Grid Search: best parameters: {'n_neighbors': 2}


In [103]:
print("******** Accuracy **********")

# Take the best estimator from the most recent grid search, predict the
# documents it was fitted on, and report accuracy with a confidence interval.
best_model = grid.best_estimator_
predict_y = best_model.predict(docarray)
acc = accuracy_score(y_true=newsgroups['label'], y_pred=predict_y)
n_docs = docarray.shape[0]
lb, ub = classification_confint(acc, n_docs)
accuracy_report = "Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc, lb, ub)
print(accuracy_report)

******** Accuracy **********
Accuracy: 0.94 (0.92,0.95)


In [104]:
print("******** confusion matrix **********")

# Count true-vs-predicted label pairs, with both axes ordered as in `cats`,
# and label the rows/columns for printing.
cats = ['politics','space']
cm = confusion_matrix(y_true=newsgroups['label'], y_pred=predict_y, labels=cats)
cm_df = pd.DataFrame(data=cm, index=cats, columns=cats)
matrix_report = "Confusion Matrix:\n{}".format(cm_df)
print(matrix_report)

******** confusion matrix **********
Confusion Matrix:
          politics  space
politics       577      3
space           60    398


## Naive Bayes (NB)

* “Standard” model for text processing
* Fast to train, has no problems with very high dimensional data
* NB is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. 
* In simple terms, a NB classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature. 
* For example, a fruit may be considered to be an apple if it is red, round, and about 3 inches in diameter. Even if these features depend on each other or upon the existence of the other features, all of these properties independently contribute to the probability that this fruit is an apple and that is why it is known as ‘Naive’.


### The Mathematics

[Source](https://www.analyticsvidhya.com/blog/2017/09/naive-bayes-explained)

* Bayes theorem provides a way of calculating posterior probability $P(c|x)$ from $P(c)$, $P(x)$ and $P(x|c)$. Look at the equation below, where
  * $P(c|x)$ is the posterior probability of class (c, target) given predictor (x, attributes).
  * $P(c)$ is the prior probability of class.
  * $P(x|c)$ is the likelihood which is the probability of predictor given class.
  * $P(x)$ is the prior probability of predictor.

<img src="https://www.analyticsvidhya.com/wp-content/uploads/2015/09/Bayes_rule-300x172.png" width="400" height="400">

### Example

Let's assume we have a predictor `Weather` and a target `Play` that contains classes (left table below).  

<img src="https://www.analyticsvidhya.com/wp-content/uploads/2015/08/Bayes_41.png">

We want to compute if we play tennis when sunny.  That is we compute the two probabilities,
1. $P(Yes|Sunny)$
1. $P(No|Sunny)$
and then pick the statement with the higher probability.

Basically, NB just counts, let's look at $P(Yes|Sunny)$,

$P(Yes|Sunny) = \frac{P(Sunny|Yes)P(Yes)}{P(Sunny)} = \frac{3/9\times 9/14}{5/14} = \frac{.33 \times .64}{.36}=.60$

Now, let's look at $P(No|Sunny)$,

$P(No|Sunny) = \frac{P(Sunny|No)P(No)}{P(Sunny)} = \frac{2/5\times 5/14}{5/14} = \frac{.40 \times .36}{.36}=.40$

We are playing tennis when sunny because the posterior probability $P(Yes|Sunny)$ is higher.

Let’s take our text classification problem and use a Naive Bayes classifier on it.

The setup and data prep is the same as in the case of the KNN classifier.

In [105]:
## Naive Bayes

print("******** model **********")


# Naive Bayes
# (MultinomialNB comes from sklearn.naive_bayes and must be imported before this cell runs)
model = MultinomialNB()
# NOTE: MultinomialNB does expose a smoothing hyper-parameter (alpha); we keep
# the library default, so no search over a parameter space is performed here.
model.fit(docarray, newsgroups['label'])


print("******** Accuracy **********")

# accuracy of the fitted model on the same documents, with confidence interval
best_model = model
predict_y = best_model.predict(docarray)
acc = accuracy_score(newsgroups['label'], predict_y)
lb,ub = classification_confint(acc,docarray.shape[0])
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

print("******** confusion matrix **********")

# build the confusion matrix (true labels vs. predicted labels, ordered as in `cats`)
cats = ['politics','space']
cm = confusion_matrix(newsgroups['label'], predict_y, labels=cats)
cm_df = pd.DataFrame(cm, index=cats, columns=cats)
print("Confusion Matrix:\n{}".format(cm_df))

******** model **********
******** Accuracy **********
Accuracy: 0.96 (0.95,0.98)
******** confusion matrix **********
Confusion Matrix:
          politics  space
politics       556     24
space           13    445


NB trains very fast and has a higher accuracy than KNN, and the difference in accuracy is statistically significant!

> NB has no hyper-parameters that need tuning here (we keep the default smoothing setting) - little risk of overfitting - no searching over parameter space!

Hint: Try cross-validating the NB model - you will find that the fold accuracies and the mean accuracy will fall into the CI computed above.



Assignment -- See BrightSpace