In [7]:

# Dataset home page: http://qwone.com/~jason/20Newsgroups/
from sklearn.datasets import fetch_20newsgroups

# Load the shuffled training split of the 20 Newsgroups corpus.
twenty_train = fetch_20newsgroups(shuffle=True, subset='train')

In [8]:
# show the first five category (newsgroup) names
twenty_train.target_names[:5]


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware']

In [5]:
# inspect the first training document, split on newlines so each line of the
# post is shown as its own list entry
twenty_train.data[0].split('\n')

["From: lerxst@wam.umd.edu (where's my thing)",
 'Subject: WHAT car is this!?',
 'Nntp-Posting-Host: rac3.wam.umd.edu',
 'Organization: University of Maryland, College Park',
 'Lines: 15',
 '',
 ' I was wondering if anyone out there could enlighten me on this car I saw',
 'the other day. It was a 2-door sports car, looked to be from the late 60s/',
 'early 70s. It was called a Bricklin. The doors were really small. In addition,',
 'the front bumper was separate from the rest of the body. This is ',
 'all I know. If anyone can tellme a model name, engine specs, years',
 'of production, where this car is made, history, or whatever info you',
 'have on this funky looking car, please e-mail.',
 '',
 'Thanks,',
 '- IL',
 '   ---- brought to you by your neighborhood Lerxst ----',
 '',
 '',
 '',
 '',
 '']

In [None]:
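### Bag of words: the vocabulary is every distinct word found across all the documents.
### https://en.wikipedia.org/wiki/Bag-of-words_model
### https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
### The features are the distinct words found across all documents.
### TF (term frequency) is the count of each word in each document.
### The resulting matrix has shape (#docs, #words), i.e. (n_samples, n_features).
### TF-IDF (term frequency - inverse document frequency) down-weights words that occur in many documents.
### https://en.wikipedia.org/wiki/Tf%E2%80%93idf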

In [15]:
# imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk import word_tokenize
import numpy as np
import string
import nltk
# word_tokenize loads the 'punkt_tab' tables on recent NLTK releases and the
# older 'punkt' pickles on earlier ones, so request both (plus the stop-word
# lists) to keep the tokenizer usable whichever NLTK version is installed.
nltk.download(['stopwords', 'punkt', 'punkt_tab'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mmcda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mmcda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [7]:
# Baseline model: raw word counts -> TF-IDF weights -> multinomial naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Predict on the held-out test split; the cell's value is the accuracy, i.e.
# the fraction of test documents whose predicted label equals the true label.
twenty_test = fetch_20newsgroups(shuffle=True, subset='test')
predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

In [48]:
# make a classifier using a linear support vector machine (SVM) trained with SGD
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

# loss='hinge' with an L2 penalty makes SGDClassifier fit a linear SVM;
# random_state pins the shuffling so the score is repeatable.
#
# The former `text_clf_svm.n_iter = 5` assignment was removed: it only created
# an unused attribute on the Pipeline object and never reached the classifier,
# so training always ran with SGDClassifier's own iteration settings. To cap
# the number of epochs, pass `max_iter` (and `tol`) to SGDClassifier instead.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
])

text_clf_svm.fit(twenty_train.data, twenty_train.target)

# test, predict, display accuracy (fraction of correctly labelled test docs)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)


0.8240839086563994

In [49]:
# Grid search over the naive Bayes pipeline's hyper-parameters.
# Each key is '<pipeline step name>__<parameter of that step>'.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],  # unigrams up to trigrams
    tfidf__use_idf=(True, False),                # with / without IDF weighting
    clf__alpha=(1e-2, 1e-3),                     # naive Bayes smoothing strength
)

# n_jobs=-1 spreads the candidate fits over all available CPU cores.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)
gs_clf.fit(twenty_train.data, twenty_train.target)

# Best mean cross-validated score, then the parameter combination behind it.
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [53]:
# Grid search over the SVM pipeline's hyper-parameters.
# Each key is '<pipeline step name>__<parameter of that step>'.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],  # unigrams up to trigrams
    'tfidf__use_idf': (True, False),                # with / without IDF weighting
    'clf-svm__alpha': (1e-2, 1e-3),                 # regularisation strength
}

# n_jobs=-1 spreads the candidate fits over all available CPU cores.
gs_clf_svm = GridSearchCV(estimator=text_clf_svm, param_grid=parameters_svm, n_jobs=-1)
gs_clf_svm.fit(twenty_train.data, twenty_train.target)

# Best mean cross-validated score, then the parameter combination behind it.
print(gs_clf_svm.best_score_, gs_clf_svm.best_params_, sep='\n')

0.9051618841994754
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [11]:
from nltk.stem.snowball import SnowballStemmer

# Build the stemmer once and reuse it for every document. The tokenizer is
# called once per document by the vectorizer, and constructing a
# SnowballStemmer per call repeats the same setup work each time.
_STEMMER = SnowballStemmer("english", ignore_stopwords=True)


def stemming_tokenizer(text):
    """Split ``text`` into word tokens and return their Snowball stems.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        One stemmed token per token produced by ``word_tokenize``, in order.
        The stemmer is built with ``ignore_stopwords=True``, so English stop
        words are returned unstemmed.
    """
    return [_STEMMER.stem(token) for token in word_tokenize(text)]


In [16]:
# SVM pipeline on stemmed tokens: TfidfVectorizer does the counting and the
# TF-IDF weighting in a single step, using the custom stemming tokenizer.
#
# The former `text_clf_svm.n_iter = 5` assignment was removed: it only created
# an unused attribute on the Pipeline object and never reached the classifier.
# To cap the number of epochs, pass `max_iter` (and `tol`) to SGDClassifier.
#
# NOTE(review): stop_words is matched against the tokenizer's output, so
# entries the tokenizer splits differently (e.g. contractions) may never
# match - confirm whether the list should be run through the tokenizer too.
text_clf_svm = Pipeline([
    ('vect', TfidfVectorizer(
        tokenizer=stemming_tokenizer,
        # The default token regex is unused once a tokenizer is supplied;
        # None makes that explicit and avoids scikit-learn's
        # "token_pattern will not be used" warning on fit.
        token_pattern=None,
        stop_words=stopwords.words('english') + list(string.punctuation),
    )),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
])

text_clf_svm.fit(twenty_train.data, twenty_train.target)

# test, predict, display accuracy (fraction of correctly labelled test docs)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)



0.8215613382899628

In [19]:
# Naive Bayes pipeline on stemmed tokens: TfidfVectorizer does the counting
# and the TF-IDF weighting in a single step, using the stemming tokenizer.
text_clf = Pipeline([
    ('vect', TfidfVectorizer(
        tokenizer=stemming_tokenizer,
        # The default token regex is unused once a tokenizer is supplied;
        # None makes that explicit and avoids scikit-learn's
        # "token_pattern will not be used" warning on fit.
        token_pattern=None,
        stop_words=stopwords.words('english') + list(string.punctuation),
        # Drop terms that appear in fewer than 5 training documents.
        min_df=5,
    )),
    # alpha is the additive smoothing parameter of the naive Bayes model.
    ('clf', MultinomialNB(alpha=0.005)),
])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

# test, predict, display accuracy (fraction of correctly labelled test docs)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)



0.8324482209240573