In [1]:
# LIBRARIES
# TEXT PROCESSING
import nltk
from nltk.corpus import stopwords
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# DATA SCIENCE
import pandas as pd
import numpy as np

# VISUALIZATION
from wordcloud import WordCloud
from matplotlib import pyplot as plt
get_ipython().magic(u'matplotlib inline')
plt.style.use("ggplot")

# ELSE
from collections import Counter
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
# FUNCTION TO CLEAN UP BODY TEXT
def cleanUpText(text, additional_stopwords=[]):
    # REMOVE MARK UP
    new_text = text.replace("\r", "").replace("\n", "")
    
    # REMOVE URLS
    new_text = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", "", new_text).strip()
    
    # REMOVE PUNCTUATION
    new_text = new_text.translate(None, string.punctuation)
    
    # REMOVE NUMBERS
    new_text = re.sub(r"\d+", "", new_text)
    
    # LOWERCASE
    new_text = new_text.lower()
    
    #SPLIT
    new_text = new_text.split()
    
    # REMOVE STOPWORDS
    stops = stopwords.words("english") + additional_stopwords
    
    return [word for word in new_text if word not in stops]



# FUNCTION TO CLEAN UP BODY TEXT
def cleanUpTitle(text):
    # REMOVE MARK UP
    new_text = text.replace("\r", "").replace("\n", "")
    
    # REMOVE NUMBERS
    new_text = re.sub(r"\d+", "", new_text)
    
    # LOWERCASE
    new_text = new_text.lower()
    
    return(new_text)

In [3]:
# READ IN DATA TO TRAIN MODEL
petnlp = pd.DataFrame.from_csv("petnlp.csv")

cleaned_titles = [cleanUpTitle(x) for x in petnlp["title"]]


# DEFINE NEW STOP WORDS
new_stops = "President, president, people, without, needs, since, used, get, would, us, united, states, people, american, americans, national, government, petition, make, also, many, must, need, change, ask, use, every, trump, white, house, america, America, executive, Executive"
new_stops = new_stops.split(", ")


tokens = [cleanUpText(x, new_stops) for x in petnlp["body"]]

blobs = [unicode(" ".join(x), errors="replace") for x in tokens]
blobs = [x.encode("ascii", "replace") for x in blobs]
blobs_df = pd.DataFrame({"title":cleaned_titles, "blobs":blobs, "ideology":petnlp["ideology"]})


# REMOVE THE NULLS
index = blobs_df["ideology"].index[blobs_df["ideology"].apply(pd.isnull)]
blobs_df = blobs_df[~pd.isnull(blobs_df["ideology"])]
blobs_df = blobs_df.reset_index(drop=True)

# CHOOSE TRAINING DATA
train = [32, 262, 136, 240, 22, 24, 33, 66, 115, 84, 197, 246, 127, 88, 125, 224, 90, 23, 211, 168, 249, 156, 164, 239, 186, 155, 29, 185, 234, 232, 192, 218, 74, 113, 14, 172, 161, 47, 85, 244, 56, 207, 105, 46, 44, 142, 13, 199, 117, 7, 67, 18, 221, 145, 189, 71, 179, 242, 171, 52, 60, 152, 195, 162, 86, 147, 120, 245, 139, 256, 93, 220, 8, 62, 91, 82, 75, 96, 94, 223, 39, 151, 5, 250, 188, 3, 34, 35, 16, 259, 203, 124, 77, 238, 11, 101, 37, 194, 233, 76, 243, 149, 102, 236, 258, 260, 166, 158, 116, 144, 108, 205, 248, 57, 21, 235, 80, 255, 61, 150, 252, 38, 253, 17, 89, 148, 68, 123, 12, 58, 109, 170, 231, 191, 196, 119, 209, 237, 6, 122, 78, 103, 206, 28, 217, 20, 48, 54, 204, 193, 69, 43, 143, 112, 118, 1, 154, 219, 30, 190, 169, 0, 176, 198, 180, 19, 59, 137, 107, 135, 175, 131, 36, 27, 153, 140, 216, 2, 261, 126, 104, 110, 51, 87, 187, 121, 98, 132, 95, 79, 247, 50, 92, 133, 70, 157, 167, 214, 159, 163, 184, 160, 15, 97, 178, 225, 202, 173, 26, 63]

# VECTORIZING TRAINING DATA
vectorizer = CountVectorizer(analyzer= "word",
                            tokenizer = None,
                            preprocessor = None,
                            stop_words = None,
                            max_features = 5000)
train_data_features = vectorizer.fit_transform(blobs_df["blobs"][train])
train_data_features = train_data_features.toarray()


# GENERATE OUR FOREST
forest = RandomForestClassifier(n_estimators = 100)
our_forest = forest.fit(train_data_features, blobs_df["ideology"][train])



In [5]:
# GET INDICES OF TEST DATA
test = set(range(len(blobs_df))) - set(train)
test = list(test)

# VECTORIZING TEST DATA
test_data_features = vectorizer.transform(blobs_df["blobs"][test])
test_data_features = test_data_features.toarray()


# PREDICT
result = forest.predict(test_data_features)
pred_df = pd.DataFrame({"petition":blobs_df["title"][test], "true_ideol":blobs_df["ideology"][test], "pred_ideol":result})
pred_df = pred_df.reindex()

In [6]:
# CALCULATE ERROR
pred_df["correct"] = (pred_df["pred_ideol"] == pred_df["true_ideol"])

# ERROR RATE
len(pred_df[pred_df["correct"]==False])*(len(pred_df)**(-1))

0.3962264150943396