In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from os.path import expanduser
import re
from nltk.stem.porter import PorterStemmer

In [4]:
stop_words = [word.strip() for word in open('stop_words.txt').readlines()]

In [5]:
len(stop_words)


572

In [6]:
def stemming_tokenizer(str_input):
    porter_stemmer = PorterStemmer()
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [7]:
with open('dems.txt', 'r',encoding="utf-8") as file:
    dem_text = [line.strip('\n') for line in file]
with open('gop.txt', 'r',encoding="utf-8") as file:
    gop_text = [line.strip('\n') for line in file]
with open('NonPolitical.txt', 'r',encoding="utf-8") as file:
    nonp_text = [line.strip('\n') for line in file]

In [8]:
vectorizer = CountVectorizer(input=dem_text + gop_text+nonp_text,
                             stop_words=stop_words,
                             max_features=1200,tokenizer=stemming_tokenizer)

In [9]:
dem_bow = vectorizer.fit_transform(dem_text)
gop_bow = vectorizer.fit_transform(gop_text)
nonp_bow = vectorizer.fit_transform(nonp_text)

In [10]:
#%%
(dem_bow.shape, gop_bow.shape, nonp_bow.shape)
#%%

((19373, 1200), (18978, 1200), (12913, 1200))

In [11]:
x = sparse.vstack((dem_bow, gop_bow, nonp_bow))
ones = np.ones(19373)
zeros = np.zeros(18978)
twos = np.full(12913,2)
y = np.hstack((ones, zeros, twos))


In [43]:
class_names = ['Democrats','Republicans','Non-Political']

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


# Creating the Training and Test set from data


X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Feature Scaling. This is a very important step in machine learning. 
#It helps the algorithm quickly learn a better solution to the problem.

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [83]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=100,random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)


# accuracy on X_test 
from sklearn.metrics import accuracy_score
accuracy = regressor.score(X_test, y_test) 
print(accuracy)

0.72584
