# NLP & ML

## KNN

In [1]:
import pandas
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score
from re import sub
from nltk.stem import PorterStemmer


print("******** setup **********")
stemmer = PorterStemmer()

# get the newsgroup database
cats = ['talk.politics.misc', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=cats)

# extract into dataframes
text = pandas.DataFrame(newsgroups_train.data, columns=['text'])
label = pandas.DataFrame(newsgroups_train.target, columns=['label'])['label'].apply(lambda x: cats[x])

******** setup **********


In [2]:
text.iloc[3,0]

'Ahhh, remember the days of Yesterday?  When we were only \n\tgoing to pay $17 / month?\n\n\tWhen only 1.2% of the population would pay extra taxes?\n\n\tRemember when a few of us predicted that it wasn\'t true?  :)\n\tRemember the Inaugural?   Dancing and Singing!  Liberation\n\tat last!  \n\n\tWell, figure *this* out:\n\n\t5% VAT, estimated to raise $60-100 Billion per year ( on CNN )\n\tWork it out, chum...\n\n\t     $60,000,000,000  /  125,000,000 taxpayers = $480 / year\n\n        But, you exclaim, " I\'ll get FREE HEALTH CARE! "\n\tBut, I exclaim, " No, you won\'t! "\n\n\tThis is only for that poor 37 million who have none.  Not for\n\tYOU, chum. :)  That comes LATER.\n\n\tAdd in the estimates of the energy tax costs - $300-500 / year\n\n\tPlus, all that extra "corporate and rich" taxes that will \n\ttrickle down, and what do you have?\n\n\t$1,000 / year, just like I said two months ago.\n\n\tAnd, the best part?   You don\'t GET ANYTHING for it.\n\n\tDeficit is STILL projected to

In [3]:
label.head()

0    sci.space
1    sci.space
2    sci.space
3    sci.space
4    sci.space
Name: label, dtype: object

In [4]:
print("******** prepare data **********")
new_data = []
for i in range(text.shape[0]):
    new_data.append(sub("[^a-zA-Z]", " ", text.iloc[i,0]))

lowercase_data = []
for i in range(len(new_data)):
    lowercase_data.append(new_data[i].lower())

stemmed_data = []
for i in range(len(lowercase_data)):
    words = lowercase_data[i].split()
    stemmed_words = []
    for w in words:
        stemmed_words.append(stemmer.stem(w))
    stemmed_data.append(" ".join(stemmed_words))

print("******** setup vector model **********")
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word", binary = True, min_df=2, stop_words='english')
docarray = vectorizer.fit_transform(stemmed_data).toarray()

print("******** model and XV **********")
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=4)

# do the 10-fold cross validation
scores = cross_val_score(model, docarray, label, cv=10)
print("Fold Accuracies: {}".format(scores))
print("XV Accuracy: {: 6.2f}%".format(scores.mean()*100))

******** prepare data **********
******** setup vector model **********
******** model and XV **********
Fold Accuracies: [ 0.53271028  0.5046729   0.5046729   0.56603774  0.51886792  0.48571429
  0.4952381   0.4952381   0.56190476  0.51428571]
XV Accuracy:  51.79%


## Naive Bayes

In [5]:
import pandas
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score
from re import sub
from nltk.stem import PorterStemmer


print("******** setup **********")
stemmer = PorterStemmer()

# get the newsgroup database
cats = ['talk.politics.misc', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=cats)

# extract into dataframes
text = pandas.DataFrame(newsgroups_train.data, columns=['text'])
label = pandas.DataFrame(newsgroups_train.target, columns=['label'])['label'].apply(lambda x: cats[x])

print("******** prepare data **********")
new_data = []
for i in range(text.shape[0]):
    new_data.append(sub("[^a-zA-Z]", " ", text.iloc[i,0]))

lowercase_data = []
for i in range(len(new_data)):
    lowercase_data.append(new_data[i].lower())

stemmed_data = []
for i in range(len(lowercase_data)):
    words = lowercase_data[i].split()
    stemmed_words = []
    for w in words:
        stemmed_words.append(stemmer.stem(w))
    stemmed_data.append(" ".join(stemmed_words))

print("******** setup vector model **********")
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word", binary = True, min_df=2, stop_words='english')
docarray = vectorizer.fit_transform(stemmed_data).toarray()
coords = vectorizer.get_feature_names()
docterm = pandas.DataFrame(data=docarray,columns=coords)

print("******** model and XV **********")
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()


# do the 10-fold cross validation
scores = cross_val_score(model, docarray, newsgroups_train.target, cv=10)
print("Fold Accuracies: {}".format(scores))
print("XV Accuracy: {: 6.2f}%".format(scores.mean()*100))

******** setup **********
******** prepare data **********
******** setup vector model **********
******** model and XV **********
Fold Accuracies: [ 0.93457944  0.95327103  0.93457944  0.95283019  0.93396226  0.96190476
  0.92380952  0.84761905  0.9047619   0.91428571]
XV Accuracy:  92.62%
