In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from os.path import expanduser
import re
from nltk.stem.porter import PorterStemmer

In [3]:
stop_words = [word.strip() for word in open('stop_words.txt').readlines()]

In [4]:
def stemming_tokenizer(str_input):
    porter_stemmer = PorterStemmer()
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [5]:
len(stop_words)

491

In [6]:
with open('dems.txt', 'r') as file:
    dem_text = [line.strip('\n') for line in file]

In [7]:
with open('gop.txt', 'r') as file:
    gop_text = [line.strip('\n') for line in file]

In [8]:
len(dem_text)

19373

In [9]:
vectorizer = CountVectorizer(input=dem_text + gop_text,
                             stop_words=stop_words,
                             max_features=1200)

In [10]:
dem_bow = vectorizer.fit_transform(dem_text)
gop_bow = vectorizer.fit_transform(gop_text)

In [11]:
dem_bow

<19373x1200 sparse matrix of type '<class 'numpy.int64'>'
	with 181831 stored elements in Compressed Sparse Row format>

In [12]:
dem_bow.shape

(19373, 1200)

In [13]:
vectorizer.get_feature_names()

['ability',
 'able',
 'abortion',
 'abortions',
 'absolutely',
 'abuse',
 'academy',
 'accept',
 'access',
 'accountable',
 'achieve',
 'act',
 'action',
 'actions',
 'actual',
 'actually',
 'ad',
 'adam',
 'added',
 'additional',
 'address',
 'admin',
 'administration',
 'advance',
 'affected',
 'afford',
 'african',
 'afternoon',
 'ag',
 'agenda',
 'agents',
 'ago',
 'agree',
 'agreement',
 'agriculture',
 'ahead',
 'aid',
 'air',
 'al',
 'alive',
 'allow',
 'ally',
 'am',
 'amazing',
 'ambassador',
 'amendment',
 'america',
 'american',
 'americans',
 'amp',
 'anniversary',
 'announced',
 'annual',
 'answer',
 'anti',
 'apply',
 'appreciate',
 'approach',
 'approval',
 'april',
 'aren',
 'armed',
 'army',
 'art',
 'article',
 'articles',
 'asian',
 'assistance',
 'attack',
 'attacks',
 'attempt',
 'attend',
 'attention',
 'average',
 'award',
 'awesome',
 'babies',
 'bad',
 'ballot',
 'base',
 'baseball',
 'based',
 'baseless',
 'batchelorshow',
 'beat',
 'beautiful',
 'begin',
 'be

In [14]:
(dem_bow.shape, gop_bow.shape)

((19373, 1200), (18978, 1200))

In [15]:
x = sparse.vstack((dem_bow, gop_bow))
ones = np.ones(19373)
zeros = np.zeros(18978)
y = np.hstack((ones, zeros))

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [17]:
from sklearn.naive_bayes import BernoulliNB
naive_bayes = BernoulliNB()
model = naive_bayes.fit(x_train, y_train)


In [18]:
y_predictions = model.predict(x_test)

In [19]:
y_predictions, y_test

(array([1., 1., 1., ..., 0., 1., 0.]), array([1., 1., 1., ..., 0., 1., 0.]))

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predictions)

0.9693366708385481

In [26]:
dem_bow[0][0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]])