In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from os.path import expanduser
import re
from nltk.stem.porter import PorterStemmer

In [24]:
stop_words = [word.strip() for word in open('stop_words.txt').readlines()]

In [60]:
len(stop_words)


572

In [25]:
def stemming_tokenizer(str_input):
    porter_stemmer = PorterStemmer()
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [26]:
with open('dems.txt', 'r',encoding="utf-8") as file:
    dem_text = [line.strip('\n') for line in file]
with open('gop.txt', 'r',encoding="utf-8") as file:
    gop_text = [line.strip('\n') for line in file]

In [27]:
vectorizer = CountVectorizer(input=dem_text + gop_text,
                             stop_words=stop_words,
                             max_features=1200,tokenizer=stemming_tokenizer)

In [28]:
dem_bow = vectorizer.fit_transform(dem_text)
gop_bow = vectorizer.fit_transform(gop_text)

In [43]:
#%%
(dem_bow.shape, gop_bow.shape)
#%%

((19373, 1200), (18978, 1200))

In [50]:
vectorizer.get_feature_names()

['abil',
 'abl',
 'abort',
 'absolut',
 'abus',
 'academi',
 'accept',
 'access',
 'accomplish',
 'accord',
 'account',
 'accus',
 'achiev',
 'act',
 'action',
 'activ',
 'actual',
 'ad',
 'adam',
 'add',
 'addict',
 'addit',
 'address',
 'admin',
 'administr',
 'admit',
 'advanc',
 'advoc',
 'affect',
 'afford',
 'african',
 'afternoon',
 'ag',
 'agenc',
 'agenda',
 'agent',
 'aggress',
 'ago',
 'agre',
 'agreement',
 'agricultur',
 'ahead',
 'aid',
 'air',
 'al',
 'aliv',
 'alli',
 'allow',
 'am',
 'amaz',
 'ambassador',
 'amend',
 'america',
 'american',
 'amp',
 'anniversari',
 'announc',
 'annual',
 'answer',
 'anti-semit',
 'apollo',
 'appear',
 'appli',
 'applic',
 'appreci',
 'approach',
 'approv',
 'april',
 'aren',
 'arm',
 'armi',
 'arriv',
 'art',
 'articl',
 'asian',
 'assist',
 'associ',
 'astronaut',
 'atlant',
 'attack',
 'attempt',
 'attend',
 'attent',
 'author',
 'averag',
 'awar',
 'award',
 'awesom',
 'babi',
 'backfir',
 'bad',
 'ballot',
 'ban',
 'bank',
 'base',

In [51]:
x = sparse.vstack((dem_bow, gop_bow))
ones = np.ones(19373)
zeros = np.zeros(18978)
y = np.hstack((ones, zeros))

In [52]:
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)


In [53]:
from sklearn.naive_bayes import BernoulliNB
naive_bayes = BernoulliNB()
model = naive_bayes.fit(x_train, y_train)

In [54]:
y_predictions = model.predict(x_test)

In [55]:
y_predictions, y_test

(array([1., 0., 1., ..., 0., 1., 0.]), array([1., 1., 1., ..., 0., 1., 0.]))

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predictions)

0.975177304964539

In [57]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

cv = KFold(n_splits=10, shuffle=False)
nb = MultinomialNB()
for train_index, test_index in cv.split(x,y):
    x_train, x_test, y_train, y_test = x[train_index], x[test_index], y[train_index], y[test_index]
    model=nb.fit(x_train, y_train)
y_predictions = model.predict(x_test)


In [58]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(nb, x, y, cv=10)
scores


array([0.97288843, 0.98435463, 0.9767927 , 0.9767927 , 0.96792699,
       0.92881356, 0.9452412 , 0.92698827, 0.92881356, 0.95827901])

In [59]:
print("Accuracy: %0.2f " % (scores.mean()))

Accuracy: 0.96 
