# NLP & ML

## KNN

In [1]:
import pandas
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score
from re import sub
from nltk.stem import PorterStemmer


print("******** setup **********")
# Restrict the corpus to two newsgroups so the task is binary classification.
cats = ['talk.politics.misc', 'sci.space']
stemmer = PorterStemmer()
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
)

print("******** prepare data **********")
# Step 1: replace every character that is not an ASCII letter with a space.
new_data = [sub("[^a-zA-Z]", " ", doc) for doc in newsgroups_train.data]

# Step 2: fold everything to lowercase.
lowercase_data = [doc.lower() for doc in new_data]

# Step 3: split on whitespace, stem each token, and re-join with single
# spaces so each document is again one string.
stemmed_data = [
    " ".join(stemmer.stem(token) for token in doc.split())
    for doc in lowercase_data
]

print("******** setup vector model **********")
from sklearn.feature_extraction.text import CountVectorizer
# Binary (presence/absence) word features; terms appearing in fewer than
# two documents are dropped.
# NOTE(review): stop_words='english' is applied to text that has already
# been stemmed, so stop words whose stem differs from the listed form may
# slip through -- confirm this is intended.
vectorizer = CountVectorizer(
    analyzer="word",
    binary=True,
    min_df=2,
    stop_words='english',
)
# Densify the document-term matrix before handing it to the classifier.
docarray = vectorizer.fit_transform(stemmed_data).toarray()

print("******** model and XV **********")
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with k = 4.
model = KNeighborsClassifier(n_neighbors=4)

# Evaluate with 10-fold cross-validation, then report the per-fold scores
# and their mean expressed as a percentage.
scores = cross_val_score(
    model,
    docarray,
    newsgroups_train.target,
    cv=10,
)
print("Fold Accuracies: {}".format(scores))
print("XV Accuracy: {: 6.2f}%".format(scores.mean()*100))

******** setup **********
******** prepare data **********
******** setup vector model **********
******** model and XV **********
Fold Accuracies: [ 0.68224299  0.6635514   0.61682243  0.61320755  0.62264151  0.63809524
  0.59047619  0.66666667  0.62857143  0.6       ]
XV Accuracy:  63.22%


## Naive Bayes

In [2]:
import pandas
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score
from re import sub
from nltk.stem import PorterStemmer


print("******** setup **********")
# Same two-newsgroup subset as used for the KNN experiment.
stemmer = PorterStemmer()
cats = ['talk.politics.misc', 'sci.space']
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
)

print("******** prepare data **********")
# Non-letter characters -> spaces.
new_data = list(
    map(lambda text: sub("[^a-zA-Z]", " ", text), newsgroups_train.data)
)

# Lowercase every document.
lowercase_data = list(map(str.lower, new_data))

# Stem word by word; each document ends up as one space-joined string.
stemmed_data = []
for document in lowercase_data:
    stems = [stemmer.stem(word) for word in document.split()]
    stemmed_data.append(" ".join(stems))

print("******** setup vector model **********")
from sklearn.feature_extraction.text import CountVectorizer
# Binary (presence/absence) word features; terms appearing in fewer than
# two documents are dropped.
# NOTE(review): stop_words='english' is applied to text that has already
# been stemmed, so stop words whose stem differs from the listed form may
# slip through -- confirm this is intended.
vectorizer = CountVectorizer(analyzer="word", binary=True, min_df=2, stop_words='english')
docarray = vectorizer.fit_transform(stemmed_data).toarray()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement.  Prefer the new
# method when it exists and fall back to the old one so the cell runs on
# both old and current scikit-learn releases.  list() keeps `coords` a plain
# list, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    coords = list(vectorizer.get_feature_names_out())
else:
    coords = vectorizer.get_feature_names()

# Document-term table: one row per document, one column per vocabulary term.
docterm = pandas.DataFrame(data=docarray, columns=coords)

print("******** model and XV **********")
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()


# Evaluate with 10-fold cross-validation, then report the per-fold scores
# and their mean expressed as a percentage.
scores = cross_val_score(model, docarray, newsgroups_train.target, cv=10)
print("Fold Accuracies: {}".format(scores))
print("XV Accuracy: {: 6.2f}%".format(scores.mean()*100))

******** setup **********
******** prepare data **********
******** setup vector model **********
******** model and XV **********
Fold Accuracies: [ 0.93457944  0.95327103  0.93457944  0.95283019  0.93396226  0.96190476
  0.92380952  0.84761905  0.9047619   0.91428571]
XV Accuracy:  92.62%
