In [39]:
from model.models import Model
from preprocess.data_pipeline import  read_mongo
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
petitions_model = Model()

In [72]:
data = read_mongo("changeorg", "us_closed_petitions",
                      {"$and": [{"petition_id": {"$gt":6763319 , "$lt": 9000000}},
                                {"endorsements": { "$exists": True }}]})

In [76]:
data.head().T

Unnamed: 0,0,1,2,3,4
_id,578efecdd9ccb337a19f747f,578f00ced9ccb337a19f777f,578f015cd9ccb337a19f786a,578f05e5d9ccb337a19f7f74,578f0616d9ccb337a19f7fc0
ask,Funding Our Children's Future,Billions Taxable on Homeowner Association Fore...,Bear Branch Elementary School,Give furries the rights they need,End the Harassment of Democratic Superdelegate...
calculated_goal,100,100,100,100,100
category,,,,,
comments_likes,0,0,0,0,0
created_at,2016-04-11T17:44:09Z,2016-04-11T23:31:29Z,2016-04-12T01:52:33Z,2016-04-12T16:58:00Z,2016-04-12T17:21:20Z
creator_city,,,Ransomville,Arlington,
creator_country,US,US,US,US,US
creator_description,,,,,
creator_display_name,,,Nick Carberry,Cookie Penguin,N A


In [75]:
data[["targets"]]

Unnamed: 0,targets
0,"[{u'type': u'us_government', u'name': u'State ..."
1,"[{u'type': u'us_government', u'name': u'Sen. S..."
2,"[{u'type': u'us_government', u'name': u'Texas ..."
3,"[{u'type': u'us_government', u'name': u'U.S. H..."
4,"[{u'type': u'custom', u'name': u'DNC'}, {u'typ..."
5,"[{u'type': u'us_government', u'name': u'U.S. H..."
6,"[{u'type': u'us_government', u'name': u'Gov. G..."
7,"[{u'type': u'us_government', u'name': u'Louisi..."
8,"[{u'type': u'custom', u'name': u'Hon. Jeffrey ..."
9,"[{u'type': u'us_government', u'name': u'Sen. B..."


In [27]:
data_df = petitions_model.data_pipeline(data)

In [28]:
y = data_df.pop("status")

In [29]:
X = data_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [31]:
vectorizer = TfidfVectorizer(stop_words='english', use_idf=False, max_features=10)

In [34]:
vect = vectorizer.fit_transform(X_train["description"].values)

In [36]:
tf_col = ['tf_%s' % x for x in vectorizer.get_feature_names()]

In [37]:
tf_col

[u'tf_act',
 u'tf_health',
 u'tf_human',
 u'tf_massage',
 u'tf_max',
 u'tf_need',
 u'tf_new',
 u'tf_people',
 u'tf_state',
 u'tf_time']

In [41]:
tf_df = pd.DataFrame(vect.toarray(), columns=tf_col)

In [52]:
X_train.shape

(48, 30)

In [46]:
tf_df.shape

(48, 10)

In [51]:
X_train = X_train.drop("description", axis=1)

In [59]:
tf_df.head()

Unnamed: 0,tf_act,tf_health,tf_human,tf_massage,tf_max,tf_need,tf_new,tf_people,tf_state,tf_time
0,0.0,0.632456,0.0,0.0,0.0,0.0,0.0,0.632456,0.316228,0.316228
1,0.077968,0.0,0.155936,0.0,0.974601,0.0,0.077968,0.0,0.0,0.116952
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.894427,0.0,0.447214
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
X_train = X_train.reset_index()

In [65]:
X_train

Unnamed: 0,num_past_petitions,num_past_verified_victories,num_past_victories,num_comments,title_len,overview_len,news_coverages,letter_body_len,milestones,ask_len,...,creator_type_user,num_tweets,comments_likes,endorsements,signature_count,num_capitalized_words_description,num_bold_words_description,num_italic_words_description,num_links_description,has_hashtag_description
0,0,0,0,439,78,961,6,64,10,64,...,1.0,0,481,0,1020,0,0,0,0,0
1,0,0,0,0,60,5204,0,15,0,15,...,1.0,0,0,0,2,2,0,0,0,0
2,0,0,0,1,67,403,0,36,0,29,...,1.0,0,0,0,2,0,0,0,0,0
3,0,0,0,0,60,23,0,46,0,18,...,1.0,0,0,0,1,0,0,0,0,0
4,0,0,0,46,94,11,2,60,5,4,...,1.0,0,173,0,116,0,0,0,0,0
5,0,0,0,2,99,932,0,81,1,72,...,1.0,0,0,0,9,2,0,4,0,0
6,0,0,0,0,24,36,0,7,0,13,...,1.0,0,0,0,0,0,0,0,0,0
7,0,0,0,473,67,3133,1,3027,10,24,...,0.0,0,133,0,1488,5,0,0,0,0
8,0,0,0,671,31,1188,8,1554,14,18,...,1.0,0,180,0,3506,8,0,0,0,0
9,0,0,0,2,60,1373,0,43,0,43,...,1.0,0,2,0,3,3,12,0,0,0


In [63]:
pd.concat([X_train, tf_df], axis=1)

Unnamed: 0,index,num_past_petitions,num_past_verified_victories,num_past_victories,num_comments,title_len,overview_len,news_coverages,letter_body_len,milestones,...,tf_act,tf_health,tf_human,tf_massage,tf_max,tf_need,tf_new,tf_people,tf_state,tf_time
0,7,0,0,0,439,78,961,6,64,10,...,0.0,0.632456,0.0,0.0,0.0,0.0,0.0,0.632456,0.316228,0.316228
1,56,0,0,0,0,60,5204,0,15,0,...,0.077968,0.0,0.155936,0.0,0.974601,0.0,0.077968,0.0,0.0,0.116952
2,42,0,0,0,1,67,403,0,36,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.894427,0.0,0.447214
3,30,0,0,0,0,60,23,0,46,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47,0,0,0,46,94,11,2,60,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,49,0,0,0,2,99,932,0,81,1,...,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.57735
6,19,0,0,0,0,24,36,0,7,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,59,0,0,0,473,67,3133,1,3027,10,...,0.30317,0.060634,0.424437,0.848875,0.0,0.060634,0.0,0.0,0.0,0.0
8,25,0,0,0,671,31,1188,8,1554,14,...,0.872872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.436436,0.218218
9,40,0,0,0,2,60,1373,0,43,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
X_train = petitions_model.vectorize_text(X_train)


In [23]:
X_train.head()


Unnamed: 0,num_past_petitions,num_past_verified_victories,num_past_victories,num_comments,title_len,overview_len,news_coverages,letter_body_len,milestones,ask_len,...,num_tweets,comments_likes,endorsements,signature_count,num_capitalized_words_description,num_bold_words_description,num_italic_words_description,num_links_description,has_hashtag_description,description
7,0,0,0,439,78,961,6,64,10,64,...,0,481,0,1020,0,0,0,0,0,Please do not close W.O. Moss Memorial Medical...
56,0,0,0,0,60,5204,0,15,0,15,...,0,0,0,2,2,0,0,0,0,"Please help us get justice for Max, our belove..."
42,0,0,0,1,67,403,0,36,0,29,...,0,0,0,2,0,0,0,0,0,We have national recognition for many differen...
30,0,0,0,0,60,23,0,46,0,18,...,0,0,0,1,0,0,0,0,0,make him be nice
47,0,0,0,46,94,11,2,60,5,4,...,0,173,0,116,0,0,0,0,0,ohio


In [66]:
        vect = vectorizer.transform(X_test["description"])

In [70]:
        tf_df = pd.DataFrame(vect.toarray(), columns=tf_col)
    

In [71]:
tf_df

Unnamed: 0,tf_act,tf_health,tf_human,tf_massage,tf_max,tf_need,tf_new,tf_people,tf_state,tf_time
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.288675,0.0,0.0,0.0,0.0,0.0,0.0,0.288675,0.866025,0.288675
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.707107,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.970143,0.0,0.242536
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.894427,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.894427,0.447214
9,0.0,0.0,0.19245,0.0,0.0,0.19245,0.0,0.0,0.0,0.96225


In [None]:
        X_test = pd.concat([X_test, tf_df], axis=1)
        new_cols = set(self.columns).difference(set(X_test.columns))
        del_cols = set(X_test.columns).difference(set(self.columns))
        X_test = X_test.drop(list(del_cols), axis=1)
        for new_col in new_cols:
            X_test[new_col] = 0
        return self.model.predict(X_test.values)