# Simple NLP Classification w/ sklearn

- `sklearn.naive_bayes.MultinomialNB` for `CountVectorizer`, b/c works well w/ integers
- SVM or Linear models for `TfidfVectorizer` b/c dealing with floats.

I tried to use `flashtext` to replace punctuation w/ empty strings, but it wasn't working for some reason.

In [87]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import movie_reviews

labels = [re.match(r"\w{3}", l)[0] for l in movie_reviews.fileids()]

df = pd.DataFrame(
    {'text':movie_reviews.raw(fileids=[l]),'label': re.match(r"\w{3}", l)[0]} for l in movie_reviews.fileids()
)

In [4]:
df.head()

Unnamed: 0,label,text
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [9]:
punctuations = list(string.punctuation)
punctuations[:10]

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

In [22]:
# punc = ["!", ".", "x"]
# replacement = [""]*len(punc)

# for old, new in zip(punc, replacement):
#     print(old, new)

! 
. 
x 


In [76]:
# Remove punctuation from the text
df['text'] = df.text.apply(lambda x: " ".join(x for x in x.split() if x not in punctuations))

In [78]:
y = df.label

print("class balance:\n", df.label.value_counts())

from sklearn.model_selection import train_test_split
text_train, text_test, y_train, y_test = train_test_split(df['text'], y,
                                                    test_size = 0.33,
                                                    stratify = y,
                                                    random_state = 42)

class balance:
 pos    1000
neg    1000
Name: label, dtype: int64


## Naive Bayes

In [81]:
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorize
vect = CountVectorizer(stop_words = 'english')

X_train = vect.fit_transform(text_train.values)
X_test = vect.transform(text_test.values)

In [92]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [93]:
pred = nb_classifier.predict(X_test)

In [94]:
metrics.accuracy_score(y_test, pred)

0.8

In [95]:
metrics.confusion_matrix(y_test, pred)

array([[264,  66],
       [ 66, 264]])

## Naive Bayes w/ TF-IDF

In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

tfidf_train = tfidf.fit_transform(text_train)
tfidf_test = tfidf.transform(text_test)

In [97]:
# Hyperparameter tuning: alpha

alphas = np.arange(0, 1, 0.1)

# Create a function to train and predict for each alpha

def train_and_predict(alpha):
    # Instantitate classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to training data
    nb_classifier.fit(tfidf_train, y_train)
    # Make predictions
    pred = nb_classifier.predict(tfidf_test)
    # Calculate accuracy score
    score = metrics.accuracy_score(y_test, pred)
    return score

In [98]:
for alpha in alphas:
    print('Alpha:\t', alpha)
    print('Score:\t', train_and_predict(alpha))
    print()

Alpha:	 0.0
Score:	 0.7106060606060606

Alpha:	 0.1
Score:	 0.793939393939394

Alpha:	 0.2
Score:	 0.8015151515151515

Alpha:	 0.30000000000000004
Score:	 0.8045454545454546

Alpha:	 0.4
Score:	 0.8045454545454546

Alpha:	 0.5
Score:	 0.803030303030303

Alpha:	 0.6000000000000001
Score:	 0.8090909090909091

Alpha:	 0.7000000000000001
Score:	 0.8106060606060606

Alpha:	 0.8
Score:	 0.8090909090909091

Alpha:	 0.9
Score:	 0.8090909090909091



  'setting alpha = %.1e' % _ALPHA_MIN)


## Does the classifier make sense?

In [99]:
# Class labels
class_labels = nb_classifier.classes_

# Feature names
feat_names = tfidf.get_feature_names()

In [104]:
# Zip them together, sorted by weights

feat_with_weights = sorted(zip(nb_classifier.coef_[0], feat_names))

# Print first class label and top 10 weighted feats
class_labels[0], feat_with_weights[:10]

('neg',
 [(-12.457508177588833, '00s'),
  (-12.457508177588833, '106'),
  (-12.457508177588833, '108'),
  (-12.457508177588833, '10b'),
  (-12.457508177588833, '10s'),
  (-12.457508177588833, '10th'),
  (-12.457508177588833, '113'),
  (-12.457508177588833, '115'),
  (-12.457508177588833, '11th'),
  (-12.457508177588833, '125')])

In [105]:
# Print second class label and bottom 10 weighted feats
class_labels[1], feat_with_weights[-10:]

('pos',
 [(-6.0102023150476205, 'characters'),
  (-5.933945871439321, 'character'),
  (-5.912158517254413, 'lifeboat'),
  (-5.737288022453538, 'storyboarded'),
  (-5.730076452737978, 'timekiller'),
  (-5.707576983800263, 'goodall'),
  (-5.683284291231219, 'justice'),
  (-5.346812054610006, 'likeability'),
  (-4.999322020248346, 'moviegoers'),
  (-4.320989925473542, 'filmcritic')])