In [3]:
from pymongo import MongoClient
import pandas as pd

In [5]:
# Get MongoDB
mc = MongoClient()
db = mc["changeorg"]
petitions_col = db["petitions_scraped"]

In [70]:
df = pd.DataFrame(list(petitions_col.find({})))

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10085 entries, 0 to 10084
Data columns (total 66 columns):
_id                                10085 non-null object
ask                                10085 non-null object
calculated_goal                    10085 non-null int64
calculated_goal_with_endorsers     9545 non-null float64
comments_last_page                 10085 non-null bool
comments_likes                     10085 non-null int64
created_at                         10085 non-null object
creator_name                       10085 non-null object
creator_photo                      6407 non-null object
description                        10085 non-null object
discoverable                       10085 non-null bool
display_title                      10085 non-null object
displayed_signature_count          10085 non-null int64
displayed_supporter_count          9545 non-null float64
document_id                        10085 non-null object
end_date                           10085 non

In [15]:
df.head().T

Unnamed: 0,0,1,2,3,4
_id,579d958f51f2b8a00721ed50,579d958f51f2b8a00721ed56,579d958f51f2b8a00721ed57,579d958f51f2b8a00721ed59,579d959051f2b8a00721ed5d
ask,CARTA A\nDiana María Guillén #Senasa\nVicepres...,S,that the blacklivematter group no longer be al...,Mi Apoyo que se celebre otro juicio,to shutdown cartoon network
calculated_goal,100,100,100,100,100
calculated_goal_with_endorsers,100,100,100,100,100
comments_last_page,True,True,True,True,True
comments_likes,0,0,0,0,0
created_at,2016-01-20T17:00:13Z,2016-01-21T07:15:46Z,2015-08-29T15:45:18Z,2016-02-03T08:00:27Z,2016-01-21T14:00:15Z
creator_name,Laura Ferreyra,Ms Me,michael lacey,silvia yolanda gomara corredera,louis alves
creator_photo,{u'url': u'photos/7/ym/fw/qTYMFWijKULoeFi-full...,{u'url': u'photos/2/cw/cl/nHCwCldKUmQRjVx-full...,{u'url': u'photos/8/tm/lf/XFTmlFTUGreDcLr-full...,{u'url': u'photos/7/jc/il/rgJcilbQywzZouF-full...,{u'url': u'photos/1/rp/bd/ijrpbdzvaakbPgk-full...
description,<p>CARTA A<br>Diana María Guillén #Senasa<br>V...,Tg,<p>.</p>,Yo me alegraría y toda España y sobretodo sus ...,<p>shutdown cartoon network complely</p>


## ask

Ask is a summary of what the petition is asking for. It is texts and has not Nans

## goal
Goal seems to be an attribute they used in the past, the most recent data has this field set to None. Calculated goal and calculated_goal_with_endorsers contains the same information, but the second one contains some NaNs.
We keep "calculated_goal" and drop the other 2

In [16]:
df[df["goal"].notnull()][["goal", "calculated_goal", "calculated_goal_with_endorsers"]].head()

Unnamed: 0,goal,calculated_goal,calculated_goal_with_endorsers
988,0.0,2500,2500.0
1015,0.0,7500,7500.0
1073,0.0,100,100.0
1075,0.0,100,100.0
1093,0.0,1000,1000.0


In [17]:
df.pop("goal")
df.pop("calculated_goal_with_endorsers")

0           100.0
1           100.0
2           100.0
3           100.0
4           100.0
5           100.0
6           100.0
7           100.0
8           100.0
9           100.0
10         2500.0
11          100.0
12         7500.0
13          100.0
14          100.0
15          100.0
16          100.0
17          100.0
18          200.0
19          500.0
20          100.0
21          100.0
22          100.0
23          100.0
24          100.0
25          100.0
26          100.0
27          100.0
28          100.0
29          100.0
           ...   
10055       100.0
10056     25000.0
10057       100.0
10058       200.0
10059       100.0
10060       100.0
10061     35000.0
10062       100.0
10063    200000.0
10064    200000.0
10065       100.0
10066       100.0
10067       100.0
10068     10000.0
10069       100.0
10070       100.0
10071       100.0
10072       100.0
10073       100.0
10074       100.0
10075       100.0
10076      1000.0
10077       100.0
10078      1000.0
10079     

## comments_last_page

comments last page is a boolean. This is a feature generated during the data scraping. Some petitions have over 20.000 pages of comments. Instead of counting all of the comments we put a maximum number of pages to scrape and stored a boolean indicating if it was the last page.

## comments_likes

comments_likes is the sum of the likes in the comments of the petition

## created_at and end_date

* created_at is the date when the petition was created. We need to set it to a datetime type instead of an object
* end_date is the date when the petition campaign ends

In [18]:
df["end_date"] = pd.to_datetime(df["end_date"])

In [19]:
df["created_at"] = pd.to_datetime(df["created_at"])

## creator_name

Name of the petition's creator. We drop it since we don't consider the information relevant

## creator_photo media photo video

creator_photo, media and photo is a dictionary with more information. There are some Nans. We keep it like that for now and will do some feature engineering creating a column that indicates if the creator has a photo

## description

Contains de description of the petition. Interesting to do some NLP after the data cleaning

## discoverable

Not sure what this column means, we keep it for now and see if it is significant.

In [20]:
df[df["discoverable"] == False].head().T

Unnamed: 0,1439,1666,1749,2262,2294
_id,579d989051f2b8a007220471,579d98fd51f2b8a0072207e2,579d992d51f2b8a0072208f6,579d99df51f2b8a007220e6a,579d99ea51f2b8a007220eb7
ask,Protect North Carolina residents from dangerou...,Tell the General Assembly you support high qua...,It’s Time for Commonsense Gun Safety Laws in O...,Demand a Vote on Life-Saving Background Checks,Repeal the law requiring Florida gambling faci...
calculated_goal,200,5000,15000,7500,100
comments_last_page,True,True,True,True,True
comments_likes,0,0,0,0,0
created_at,2015-04-20 18:00:28,2015-03-06 20:02:35,2015-03-07 00:10:33,2015-07-29 21:43:02,2015-04-02 19:23:01
creator_name,NORTH CAROLINA CONSERVATION NETWORK,Maryland Maternity Access Coalition,Americans for Responsible Solutions,Everytown for Gun Safety,Marilyn Varnberg
creator_photo,{u'url': u'photos/1/kn/ym/SNKNYMkCKYrEQeo-full...,{u'url': u'photos/4/rk/rs/pHrKrSJahrCPxMt-full...,{u'url': u'photos/9/za/iv/EdZAIvojSrccuoS-full...,{u'url': u'photos/1/vp/js/tsvpjSIeIRKfXcE-full...,
description,<p>Tell U.S. Senators Richard Burr and Thom Ti...,<p>Tell the Maryland General Assembly you supp...,<p>Our gun laws are not doing enough to protec...,<p>Another movie theater shooting. Another nig...,<p>More than half of the greyhound races held ...
discoverable,False,False,False,False,False


## display_title and title and petition_title

As with the goal, title and petition_title seems to be an attribute used in the past, there are a lot of Nan and the ones that contain information have the same information as display_title. So we keep display_title

In [21]:
df[df["title"].notnull()][["display_title", "title", "petition_title"]].head()

Unnamed: 0,display_title,title,petition_title
1,"Federated States of Micronesia State Senate, M...","Federated States of Micronesia State Senate, M...","Federated States of Micronesia State Senate, M..."
2,Barack Obama: that the blacklivematter group n...,Barack Obama: that the blacklivematter group n...,Barack Obama: that the blacklivematter group n...
5,Pennsylvania State House: Members of the PA Ho...,Pennsylvania State House: Members of the PA Ho...,Pennsylvania State House: Members of the PA Ho...
6,"city of Thorton, People for the Ethical Treatm...","city of Thorton, People for the Ethical Treatm...","city of Thorton, People for the Ethical Treatm..."
9,We Support Apple vs FBI NO BACKDOOR,We Support Apple vs FBI NO BACKDOOR,We Support Apple vs FBI NO BACKDOOR


In [22]:
df.pop("title")

0                                                     None
1        Federated States of Micronesia State Senate, M...
2        Barack Obama: that the blacklivematter group n...
3                                                     None
4                                                     None
5        Pennsylvania State House: Members of the PA Ho...
6        city of Thorton, People for the Ethical Treatm...
7                                                     None
8                                                     None
9                      We Support Apple vs FBI NO BACKDOOR
10       Cyndi Stevenson, Travis Hutson, Florida Govern...
11       Joe Donnelly: Stop Carrier from moving jobs to...
12       President Obama, share your vision for drug po...
13                                                    None
14                          congresistas: Un cambio social
15                        Karen Bass: Testing the Platform
16       Stand up for Traditional Marriage's (man and w.

In [23]:
df.pop("petition_title")

0        CARTA A\nDiana María Guillén #Senasa\nVicepres...
1        Federated States of Micronesia State Senate, M...
2        Barack Obama: that the blacklivematter group n...
3                      Mi Apoyo que se celebre otro juicio
4                              to shutdown cartoon network
5        Pennsylvania State House: Members of the PA Ho...
6        city of Thorton, People for the Ethical Treatm...
7        Que las medidas adoptadas no trasgredan la int...
8                                        Lets have school.
9                      We Support Apple vs FBI NO BACKDOOR
10       Cyndi Stevenson, Travis Hutson, Florida Govern...
11       Joe Donnelly: Stop Carrier from moving jobs to...
12       President Obama, share your vision for drug po...
13       Que esos hermosos adorables animalitos no pier...
14                          congresistas: Un cambio social
15                        Karen Bass: Testing the Platform
16       Stand up for Traditional Marriage's (man and w.

## displayed_signature_count total_signature_count

Signature count. No Nans. Info seems to be redundant, but at some data points the info is slightly different. We keep displayed since it is the one showed.

In [24]:
df[df["total_signature_count"]!= df["displayed_signature_count"]][["displayed_signature_count", "total_signature_count", "status"]].head()

Unnamed: 0,displayed_signature_count,total_signature_count,status
81,14806,13566,victory
432,2891,2668,victory
494,19432,15815,victory
546,68842,35550,victory
718,68842,33243,victory


## displayed_supporter_count total_supporter_count

Contains some nans, so, since it is a count we will fill it with 0. Redundant info. We keep displayed.

In [25]:
df["displayed_supporter_count"].fillna(0, inplace=True) 

In [26]:
df[df["total_supporter_count"]!= df["displayed_supporter_count"]][["total_supporter_count", "displayed_supporter_count", "status"]].head()

Unnamed: 0,total_supporter_count,displayed_supporter_count,status
81,13566.0,14806.0,victory
272,,0.0,closed
290,,0.0,closed
318,,0.0,closed
345,,0.0,closed


In [27]:
df.pop("total_supporter_count")
df.pop("total_signature_count")

0            1
1            1
2            1
3            1
4            1
5           16
6           86
7            1
8            1
9            1
10        2419
11           1
12        5048
13           1
14           1
15           1
16          36
17           1
18         107
19         258
20           1
21          20
22           8
23          17
24           5
25           1
26           1
27           1
28          22
29           1
         ...  
10055        1
10056    17348
10057       19
10058      157
10059        7
10060        1
10061    33777
10062        1
10063    14405
10064    10105
10065       49
10066        1
10067        4
10068     9803
10069        1
10070        1
10071       28
10072        2
10073        0
10074        0
10075       12
10076      675
10077        1
10078      655
10079        2
10080       45
10081      335
10082        2
10083        1
10084      214
Name: total_signature_count, dtype: int64

## document_id

To drop, doesn't seem relevant

In [28]:
df.pop("document_id")

0        5590794
1        5597810
2        4091268
3        5803866
4        5600798
5        6303017
6        5589858
7        5905506
8        5700710
9        6103410
10       4091588
11       6005358
12       6206038
13       5601586
14       5915138
15       6303935
16       6105698
17       4091808
18       5812094
19       6011590
20       5605202
21       6207358
22       5926090
23       5700762
24       6305528
25       4094420
26       6117798
27       5700922
28       5814622
29       6012186
          ...   
10055    3465713
10056    3465701
10057    3471973
10058    3469385
10059    3388397
10060    3559042
10061    3477261
10062    3487553
10063      G5067
10064      G5067
10065    3565259
10066    3567219
10067    3569347
10068    3607591
10069    3579083
10070    3636684
10071    3638328
10072    3638356
10073    3639144
10074    3639644
10075    3739452
10076    3741075
10077    3643614
10078    3741087
10079    3739416
10080    3745715
10081    3744048
10082    37476

## endorsements and ensorser_count

endorsements was created during the data collection, contains the same information as endorser_count. But the second one contain Nans instead of 0's. We keep endorsements.

In [29]:
df[df["endorsements"]!= 0][["endorsements", "endorser_count", "status", "id"]].head()

Unnamed: 0,endorsements,endorser_count,status,id
202,4,4.0,victory,4221268
814,5,5.0,victory,4991122


## fb_popularity

Feature generated during data collection. It gets the Facebook Popularity of the petition url. Is the sum of 
        * number of likes of this URL
        * number of shares of this URL (this includes copy/pasting a link back to Facebook)
        * number of likes and comments on stories on Facebook about this URL
        * number of inbox messages containing this URL as an attachment.

In [30]:
df[df["fb_popularity"]!= 0][["fb_popularity", "status"]].head()

Unnamed: 0,fb_popularity,status


looks like we have a bug!

## id

is is the petition id

## is_organization organization

Info collected during the data scraping. Indicates if the petitioner is an organization. We drop organization's name

In [31]:
df.pop("organization")

0                                                      NaN
1                                                      NaN
2                                                      NaN
3                                                      NaN
4                                                      NaN
5        {u'website': u'', u'verification_requested_at'...
6                                                      NaN
7                                                      NaN
8                                                      NaN
9                                                      NaN
10                                                     NaN
11                                                     NaN
12       {u'website': u'http://www.SchoolsNotPrisons.or...
13       {u'website': u'', u'verification_requested_at'...
14                                                     NaN
15                                                     NaN
16                                                     N

In [32]:
df[df["is_organization"]== True][["is_organization", "status"]].head()

Unnamed: 0,is_organization,status
5,True,closed
12,True,closed
13,True,closed
21,True,victory
49,True,victory


## is_pledge

It tells us if the petition is a pledge or not 

In [33]:
df[df["is_pledge"]== True][["is_pledge", "status"]].head()

Unnamed: 0,is_pledge,status
4135,True,closed
7749,True,closed


## is_verified_victory and is_victory

Info collected during web scraping

In [34]:
df[df["is_victory"]== True][["is_verified_victory", "is_victory"]].head()

Unnamed: 0,is_verified_victory,is_victory
6,False,True
16,False,True
18,False,True
21,False,True
44,False,True


## languages

Languages of the petition. It is a dictionary structure inside. Needs some feature engineering.

In [35]:
df["languages"].apply(lambda x: x[0]).unique()

array([u'es', u'en', u'pt', u'it', u'th', u'ja'], dtype=object)

## last_past_verified_victory_date last_past_victory_date last_update

Dates collected during initial data collection. Need to be converted to datetime

In [36]:
df["last_past_verified_victory_date"] = pd.to_datetime(df["last_past_verified_victory_date"])
df["last_past_victory_date"] = pd.to_datetime(df["last_past_victory_date"])
df["last_update"] = pd.to_datetime(df["last_update"])

## letter_body

Petitions have a letter attached that is the one that will be presented to the target. Needs feature engineering.

## links_fb_popularity

Similar to fb popularity but calculates the popularity of the links inside the text

In [37]:
df[df["links_fb_popularity"]== True][["links_fb_popularity", "status"]].head()

Unnamed: 0,links_fb_popularity,status


We have a bug

## milestones news_coverages num_comments num_past_petitions num_past_verified_victories num_past_victories num_responses num_tweets   


Info collected during web scraping. 

## original_locale

Locale of the petition. Keep.

In [38]:
df["original_locale"].unique()

array([u'es', u'en-US', u'es-419', u'en-GB', u'en-CA', u'pt-BR', u'it',
       u'th', u'es-AR', u'en-IN', u'en-AU', u'ja'], dtype=object)

In [39]:
df[((df["original_locale"] != 'en-US') & (df["status"]=="victory"))][["original_locale","status", "is_verified_victory","displayed_signature_count", "id"]]

Unnamed: 0,original_locale,status,is_verified_victory,displayed_signature_count,id
45,en-GB,victory,False,4,6139146
65,es,victory,False,3,6323426
82,es,victory,False,4,5708342
182,es,victory,False,1,5681030
284,es,victory,False,4,4267796
341,es,victory,False,84,4514691
374,pt-BR,victory,False,2,4518855
408,es,victory,False,2,6599630
471,es,victory,False,1,4638554
522,en-CA,victory,False,1,5016994


## primary_target targets

Contains the information of the primary target. We keep targets since there are all the targets included.

In [40]:
df.pop("primary_target")

0        {u'publicly_visible': False, u'slug': None, u'...
1        {u'publicly_visible': True, u'slug': u'federat...
2        {u'publicly_visible': True, u'slug': u'barack-...
3        {u'publicly_visible': False, u'slug': u'tom-ap...
4        {u'publicly_visible': False, u'slug': u'l-loui...
5        {u'publicly_visible': True, u'slug': u'pennsyl...
6        {u'publicly_visible': False, u'slug': u'city-o...
7        {u'publicly_visible': False, u'slug': u'andy-a...
8        {u'publicly_visible': False, u'slug': u'kasha-...
9        {u'publicly_visible': True, u'slug': u'u-s-hou...
10       {u'publicly_visible': False, u'slug': u'cyndi-...
11       {u'publicly_visible': False, u'slug': u'joe-do...
12       {u'publicly_visible': True, u'slug': u'barack-...
13       {u'publicly_visible': False, u'slug': u'elijah...
14       {u'publicly_visible': False, u'slug': u'congre...
15       {u'publicly_visible': False, u'slug': u'karen-...
16       {u'publicly_visible': False, u'slug': u'dan-hu.

## progress

Indicates the progress of the signature count. We keep it for now but will probably be dropped in the future.

## relevant_location and restricted_location

Contains info about the relevant location for the petition and if it is restricted. Needs feature engineering.

## slug

slug of the url for https://www.change.org/p/congress-pass-the-able-act the slug is congress-pass-the-able-act

## tags

Contains the tags of the petition. Needs feature engineering

## targeting_description

Info is redundant with targets. Drop.

In [41]:
df.pop("targeting_description")

0        CARTA A Diana María Guillén #Senasa Vicepresid...
1        Federated States of Micronesia State Senate, M...
2                                             Barack Obama
3                                              Tom Apodaca
4                                          L. Louise Lucas
5        Pennsylvania State House, Pennsylvania State S...
6        City of Thornton, People for the Ethical Treat...
7                                              Andy Anders
8                                             Kasha Kelley
9        U.S. House of Representatives, U.S. Senate, Ba...
10       Cyndi Stevenson, Travis Hutson, Florida Govern...
11                Joe Donnelly, Gregory Hayes, Chuck Jones
12                                            Barack Obama
13                                         Elijah Cummings
14                                            congresistas
15                                              Karen Bass
16                                             Dan Huber

## topic

It has a lot of Nans, we keep it for now but I think it will be redundant with tags or NLP. Needs feature engineering.

## tweets_followers

number of followers of the embedded tweets

In [42]:
df[df["tweets_followers"]> 0][["tweets_followers", "status"]].head()

Unnamed: 0,tweets_followers,status
81,36857,victory
114,11407,victory
139,7822,victory
155,4301,closed
202,2149688,victory


## twitter_popularity

count of likes and shares of embedded tweets

In [43]:
df[df["twitter_popularity"]> 0][["twitter_popularity", "status"]].head()

Unnamed: 0,twitter_popularity,status
155,1,closed
202,365,victory
355,4356,victory
415,10,closed
500,38,victory


## victory_description victory_date weekly_signature_count photo_id

Drop.

In [44]:
df.pop("victory_description")
df.pop("victory_date")
df.pop("weekly_signature_count")
df.pop("photo_id")

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
8       NaN
9       NaN
10      NaN
11      NaN
12      NaN
13      NaN
14      NaN
15      NaN
16      NaN
17      NaN
18      NaN
19      NaN
20      NaN
21      NaN
22      NaN
23      NaN
24      NaN
25      NaN
26      NaN
27      NaN
28      NaN
29      NaN
         ..
10055   NaN
10056   NaN
10057   NaN
10058   NaN
10059   NaN
10060   NaN
10061   NaN
10062   NaN
10063   NaN
10064   NaN
10065   NaN
10066   NaN
10067   NaN
10068   NaN
10069   NaN
10070   NaN
10071   NaN
10072   NaN
10073   NaN
10074   NaN
10075   NaN
10076   NaN
10077   NaN
10078   NaN
10079   NaN
10080   NaN
10081   NaN
10082   NaN
10083   NaN
10084   NaN
Name: photo_id, dtype: float64

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10085 entries, 0 to 10084
Data columns (total 52 columns):
_id                                10085 non-null object
ask                                10085 non-null object
calculated_goal                    10085 non-null int64
comments_last_page                 10085 non-null bool
comments_likes                     10085 non-null int64
created_at                         10085 non-null datetime64[ns]
creator_name                       10085 non-null object
creator_photo                      6407 non-null object
description                        10085 non-null object
discoverable                       10085 non-null bool
display_title                      10085 non-null object
displayed_signature_count          10085 non-null int64
displayed_supporter_count          10085 non-null float64
end_date                           10085 non-null datetime64[ns]
endorsements                       10085 non-null int64
endorser_count              

In [46]:
df.shape

(10085, 52)

In [47]:
df.head().T

Unnamed: 0,0,1,2,3,4
_id,579d958f51f2b8a00721ed50,579d958f51f2b8a00721ed56,579d958f51f2b8a00721ed57,579d958f51f2b8a00721ed59,579d959051f2b8a00721ed5d
ask,CARTA A\nDiana María Guillén #Senasa\nVicepres...,S,that the blacklivematter group no longer be al...,Mi Apoyo que se celebre otro juicio,to shutdown cartoon network
calculated_goal,100,100,100,100,100
comments_last_page,True,True,True,True,True
comments_likes,0,0,0,0,0
created_at,2016-01-20 17:00:13,2016-01-21 07:15:46,2015-08-29 15:45:18,2016-02-03 08:00:27,2016-01-21 14:00:15
creator_name,Laura Ferreyra,Ms Me,michael lacey,silvia yolanda gomara corredera,louis alves
creator_photo,{u'url': u'photos/7/ym/fw/qTYMFWijKULoeFi-full...,{u'url': u'photos/2/cw/cl/nHCwCldKUmQRjVx-full...,{u'url': u'photos/8/tm/lf/XFTmlFTUGreDcLr-full...,{u'url': u'photos/7/jc/il/rgJcilbQywzZouF-full...,{u'url': u'photos/1/rp/bd/ijrpbdzvaakbPgk-full...
description,<p>CARTA A<br>Diana María Guillén #Senasa<br>V...,Tg,<p>.</p>,Yo me alegraría y toda España y sobretodo sus ...,<p>shutdown cartoon network complely</p>
discoverable,True,True,True,True,True


------------------------

In [48]:
def relevant_country(location):
    if location is not None:
        return location["country_code"]
    else:
        return None

In [49]:
df["relevant_country"]=df["relevant_location"].apply(lambda x: relevant_country(x))

In [189]:
    def relevant_state(location):
        if location is not None:
            return location["state_code"]
        else:
            return None

In [191]:
df["relevant_state"] = df["relevant_location"].apply(lambda x: relevant_state(x))

In [194]:
df["user_country"] = df["user"].apply(lambda x: x["country_code"])

In [195]:
df["user_state"] = df["user"].apply(lambda x: x["state_code"])

In [198]:
df[df["relevant_state"].isnull()].count()

_id                                9770
ask                                9770
calculated_goal                    9770
comments_last_page                 9770
comments_likes                     9770
created_at                         9770
creator_name                       9770
creator_photo                      6152
description                        9770
discoverable                       9770
display_title                      9770
displayed_signature_count          9770
displayed_supporter_count          9770
end_date                           9770
endorsements                       9770
endorser_count                     9243
fb_popularity                      9770
id                                 9770
is_organization                    9770
is_pledge                          9770
is_verified_victory                9770
is_victory                         9770
languages                          9770
last_past_verified_victory_date      48
last_past_victory_date              127


In [125]:
def target_states(targets):
    states = []
    titles = []
    names = []
    for target in targets:
        if target["type"] == "Politician":
            states.append(target["additional_data"]["state"])
            titles.append(target["additional_data"]["title"])
            names.append(target["display_name"])
    return states, titles, names

In [126]:
df["targets"].apply(lambda x: target_states(x))

0                   ([NC], [Representative], [Alma Adams])
1                    ([None], [President], [Barack Obama])
2                    ([None], [President], [Barack Obama])
3                   ([NC], [State Senator], [Tom Apodaca])
4        ([VA, NC], [State Senator, State Senator], [L....
5                                             ([], [], [])
6                  ([CO], [Governor], [John Hickenlooper])
7            ([LA], [State Representative], [Andy Anders])
8           ([KS], [State Representative], [Kasha Kelley])
9                    ([None], [President], [Barack Obama])
10       ([FL, FL, FL, FL], [State Representative, Stat...
11                       ([IN], [Senator], [Joe Donnelly])
12                   ([None], [President], [Barack Obama])
13             ([MD], [Representative], [Elijah Cummings])
14         ([WA], [State Representative], [Sharon Santos])
15                  ([CA], [Representative], [Karen Bass])
16           ([TX], [State Representative], [Dan Huberty

In [148]:
d = {
    "city": "Washington",
    "country_code": "US",
    "lat": None,
    "lng": None,
    "state_code": "DC",
    "google_places_id": None,
    "photo_id": None
  }

In [71]:
df["topic"].unique()

array([u'environment', u'humanrights', None, u'criminaljustice',
       u'economicjustice', u'animals', u'health', u'education',
       u'gayrights', u'immigration', u'womensrights', u'',
       u'humantrafficking', u'food', u'socialentrepreneurship',
       u'globalpoverty', u'race', u'homelessness'], dtype=object)

In [233]:
df[["targets", "id"]].head()


Unnamed: 0,targets,id
0,"[{u'publicly_visible': False, u'slug': None, u...",5590794
1,"[{u'publicly_visible': True, u'slug': u'federa...",5597810
2,"[{u'publicly_visible': True, u'slug': u'barack...",4091268
3,"[{u'publicly_visible': False, u'slug': u'tom-a...",5803866
4,"[{u'publicly_visible': False, u'slug': u'l-lou...",5600798


# removing trolls

In [66]:
df[(df['status'] == "closed") | ((df['status'] =="victory") & (df['displayed_signature_count'] >100))].T

Unnamed: 0,0,1,2,3,4,5,7,8,9,10,...,10075,10076,10077,10078,10079,10080,10081,10082,10083,10084
_id,579d958f51f2b8a00721ed50,579d958f51f2b8a00721ed56,579d958f51f2b8a00721ed57,579d958f51f2b8a00721ed59,579d959051f2b8a00721ed5d,579d959051f2b8a00721ed5f,579d959051f2b8a00721ed5c,579d959051f2b8a00721ed63,579d959051f2b8a00721ed61,579d959051f2b8a00721ed65,...,579e1d43a7c4008cc957c62f,579e1d48a7c4008cc957c630,579e1d3ba7c4008cc957c62d,579e1d4ea7c4008cc957c631,579e1d41a7c4008cc957c62e,579e1d55a7c4008cc957c633,579e1d52a7c4008cc957c632,579e1d57a7c4008cc957c634,579e1d59a7c4008cc957c635,579e1d38a7c4008cc957c62c
ask,CARTA A\nDiana María Guillén #Senasa\nVicepres...,S,that the blacklivematter group no longer be al...,Mi Apoyo que se celebre otro juicio,to shutdown cartoon network,Members of the PA House and Senate: refuse yo...,Que las medidas adoptadas no trasgredan la int...,Lets have school.,We Support Apple vs FBI NO BACKDOOR,Lindy and Amanda's law: Create stricter laws t...,...,Change the Laws of Presidency,IMPEACH OBAMA,Impeach President Barack Obama,Provide air conditioning to Texas Inmates,Make desecretion of the US FLAG Illegal,To make a skatepark at academy field,Don't Let Congress Take Away One of the Strong...,Remove Martin Luther King Jr.'s name from all ...,britany,We are asking you to block the parking of the ...
calculated_goal,100,100,100,100,100,100,100,100,100,2500,...,100,1000,100,1000,100,100,500,100,100,500
comments_last_page,True,True,True,True,True,True,True,True,True,False,...,True,False,True,False,True,True,True,True,True,True
comments_likes,0,0,0,0,0,1,0,0,0,1,...,1,116,0,32,0,0,0,0,0,6
created_at,2016-01-20 17:00:13,2016-01-21 07:15:46,2015-08-29 15:45:18,2016-02-03 08:00:27,2016-01-21 14:00:15,2016-03-01 00:46:51,2016-02-08 15:30:12,2016-01-27 21:18:30,2016-02-18 02:02:40,2015-08-29 16:29:51,...,2015-07-12 17:13:05,2015-07-13 02:54:09,2015-06-25 15:54:57,2015-07-13 03:01:03,2015-07-12 17:07:44,2015-07-14 03:38:38,2015-07-13 18:45:19,2015-07-14 14:29:48,2015-07-15 19:04:27,2015-06-25 02:09:08
creator_name,Laura Ferreyra,Ms Me,michael lacey,silvia yolanda gomara corredera,louis alves,Concerned Citizen of Pennsylvania,Orlando Farrago,katherine portillo,Tom Mahoney,Ken Neikirk,...,Change the Laws of Presidency - Nicholas Grant,Valerie Grimes,Logan NoLastNameForYou,Angie Clark-Bobo,Pamela Berry,Jacob Phelps,Connecticut Senate Democrats,Franklin Dogooder,Brit Madden,Barbara Taylor
creator_photo,{u'url': u'photos/7/ym/fw/qTYMFWijKULoeFi-full...,{u'url': u'photos/2/cw/cl/nHCwCldKUmQRjVx-full...,{u'url': u'photos/8/tm/lf/XFTmlFTUGreDcLr-full...,{u'url': u'photos/7/jc/il/rgJcilbQywzZouF-full...,{u'url': u'photos/1/rp/bd/ijrpbdzvaakbPgk-full...,{u'url': u'photos/1/ud/ie/FQUDiEaFECibmIo-full...,,,{u'url': u'photos/7/oc/pm/NwoCpmJLOphGJrl-full...,{u'url': u'photos/0/db/yf/gaDBYftmeqlrgiw-full...,...,{u'url': u'photos/9/fh/jr/xEFhJriZbufAAOq-full...,{u'url': u'photos/0/zu/uq/FFzUUQMGaTNwBmO-full...,,{u'url': u'photos/5/oy/zc/WbOYZCCEvDxOHBb-full...,{u'url': u'photos/5/st/fs/XWstFsnDsGQtLUy-full...,,{u'url': u'photos/1/yc/gn/TWyCGnaOBHwWdoi-full...,,{u'url': u'photos/6/dv/vh/oUDvVhmTSwKHwdX-full...,{u'url': u'photos/7/ms/yd/uZMSydlAqrMytzx-full...
description,<p>CARTA A<br>Diana María Guillén #Senasa<br>V...,Tg,<p>.</p>,Yo me alegraría y toda España y sobretodo sus ...,<p>shutdown cartoon network complely</p>,"When we don't do our jobs, we don't get paid!...",Que La interpretación de los actos no sean par...,I want there to be school because if there isn...,"<p>On Feb 16, 2015 a federal judge ordered App...",<p>Lindy and Amanda's Law</p>\n<p> </p>\n<p>It...,...,<p>All Americans whether immigrants or adopted...,Since he has taken office the country has gone...,He is the worst mistake this country has ever ...,<p><strong>Since 2007 at least 14 prisoner dea...,<p>Men and women gave their lives to protect t...,So you don't have to drive a long time to a sk...,"<p>Here in Connecticut, we know that our perso...",This is important because Dr. King's message o...,<p>be cuz they need to bring the show back</p>,"<p>Charles (Charlie) Walker, an upstanding cit..."
discoverable,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
