# Natural Language Processing (chunking)

In [1]:
import pandas as pd
import pickle
import nltk
from nltk.chunk import ne_chunk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize
import re
from nltk.corpus import stopwords
import string
from geopy.geocoders import Nominatim

In [2]:
with open("5to6_sentences_w_units_terms_nolocation.pkl", 'rb') as picklefile:
   df = pickle.load(picklefile)

In [3]:
df.head()

Unnamed: 0,_id,authors,date_x,image,keywords,summary,text,url,valid,decode_text,error,date_y,language,doc_id,sent_w_num,sent_w_num_p,sent_w_num_h,sent_num_p_t,sent_num_h_t
0,58c6c276d897da59ecc6296f,[],2016-01-20 00:00:00,http://www.sunstar.com.ph/sites/default/files/...,"[water, pelco, dpwh, resume, roadworks, work, ...",She furthered that non-coordination with stake...,MACABEBE -- The Department of Public Works and...,http://www.sunstar.com.ph/pampanga/local-news/...,True,MACABEBE -- The Department of Public Works and...,all good,20160120131500,en,500000,[Balgan said in the letter that proper coordin...,[],"[""First and foremost, we appeal that the desig...",[],[]
1,58c6c277d897da59ecc62970,[],NaT,http://www.care-international.org/files/images...,"[flee, town, muslim, families, nigers, refugee...",“It was not our choice to come here but we wer...,Niger's forgotten war: Muslim refugees flee bo...,http://www.care-international.org/news/stories...,True,Niger's forgotten war: Muslim refugees flee bo...,all good,20160804131500,en,500001,"[A month ago, he was forced to uproot his town...","[A month ago, he was forced to uproot his town...",[],[Yet it has also recently become home to over ...,[]
2,58c6c27ad897da59ecc62971,[],NaT,http://www.belfasttelegraph.co.uk/news/world-n...,"[town, 700, working, german, island, twitter, ...",Acting prime minister Mariano Rajoy expressed ...,A German man suspected of starting a forest fi...,http://www.belfasttelegraph.co.uk/news/world-n...,True,A German man suspected of starting a forest fi...,all good,20160804131500,en,500002,[A German man suspected of starting a forest f...,[A German man suspected of starting a forest f...,[],[A German man suspected of starting a forest f...,[]
3,58c6c27ad897da59ecc62972,[],2016-07-22 00:00:00,https://cbssanfran.files.wordpress.com/2013/07...,"[jose, briefly, francisco, bomb, san, thrones,...",5 Reasons To Visit Ireland For St. Patrick's D...,Iconic Rock Arch From 'Game of Thrones' Falls ...,http://sanfrancisco.cbslocal.com/2016/07/22/sa...,True,Iconic Rock Arch From 'Game of Thrones' Falls ...,all good,20160722211500,en,500003,[5 Reasons To Visit Ireland For St. Patrick's ...,[],[],[],[]
4,58c6c27cd897da59ecc62973,[],2016-12-19 16:04:14,http://www.themalaymailonline.com/uploads/arti...,"[rupee, weeks, old, curbs, puts, money, notes,...",India puts fresh curbs on deposits of banned n...,India puts fresh curbs on deposits of banned n...,http://www.themalaymailonline.com/money/articl...,True,India puts fresh curbs on deposits of banned n...,all good,20161219171500,en,500004,[India puts fresh curbs on deposits of banned ...,"[The decision triggered a cash crisis, with lo...",[],[],[]


In [4]:
df.columns

Index([         u'_id',      u'authors',       u'date_x',        u'image',
           u'keywords',      u'summary',         u'text',          u'url',
              u'valid',  u'decode_text',        u'error',       u'date_y',
           u'language',       u'doc_id',   u'sent_w_num', u'sent_w_num_p',
       u'sent_w_num_h', u'sent_num_p_t', u'sent_num_h_t'],
      dtype='object')

## People facts

In [5]:
df_people = df.set_index(['doc_id', 'date_x', 'date_y', 'image', 'url',
                          'decode_text'])['sent_num_p_t'].apply(pd.Series).stack()

df_people = df_people.reset_index()

df_people.columns = ['doc_id', 'date_x', 'date_y', 'image', 'url', 
                     'decode_text' ,'sentence_number','sen_num_p_t']

df_people.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...
1,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,1,They are some of the 2.7 million people who ar...
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ..."
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...


In [6]:
df_people.shape

(43700, 8)

In [7]:
def word_tokenize_filter(sentence):
    words = word_tokenize(sentence)
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    filtered_words = [c for c in filtered_words if c not in string.punctuation]
    return filtered_words

In [8]:
df_people['w_tokens'] = df_people.sen_num_p_t.apply(word_tokenize_filter)
df_people.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t,w_tokens
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...,"[Yet, also, recently, become, home, 280,000, r..."
1,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,1,They are some of the 2.7 million people who ar...,"[They, 2.7, million, people, currently, displa..."
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ...","[A, month, ago, forced, uproot, town, 3,000, p..."
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...,"[A, German, man, suspected, starting, forest, ..."
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...,"[It, said, 700, residents, evacuated, precauti..."


In [9]:
grammar = r'''Counts: {<NN>?<RB>?<JJ>?<CD><CD>?<JJ>*<NNS>*<RB>*<JJ>*<NNS>*<NNP>*<JJ>*<VB.?><JJ>?}'''

def chunk_finder(word_list):
    tags = pos_tag(word_list)
    chunkParser = nltk.RegexpParser(grammar)
    chunked = chunkParser.parse(tags)
    holychunk = [chunk for chunk in chunked if type(chunk) == nltk.tree.Tree]
    sentence_facts = []
    for holyc in holychunk:
        fact_list = []
        for specialtuples in holyc:
            if specialtuples[1] == "CD":
                number = specialtuples[0]
                fact_list.append(number)
            elif specialtuples[1] == "NNS":
                unit = specialtuples[0]
                fact_list.append(unit)
            elif specialtuples[1] == "VBN" or specialtuples[1] == "JJ" or specialtuples[1] == "VBD":
                verb = specialtuples[0]
                fact_list.append(verb)
            else:
                continue
        sentence_facts.append(fact_list)
    return sentence_facts

In [10]:
df_people['chunks'] = df_people.w_tokens.apply(chunk_finder)

In [11]:
df_people.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t,w_tokens,chunks
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...,"[Yet, also, recently, become, home, 280,000, r...","[[280,000, refugees]]"
1,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,1,They are some of the 2.7 million people who ar...,"[They, 2.7, million, people, currently, displa...","[[2.7, million, people, displaced]]"
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ...","[A, month, ago, forced, uproot, town, 3,000, p...","[[3,000, people]]"
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...,"[A, German, man, suspected, starting, forest, ...","[[700, people, arrested]]"
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...,"[It, said, 700, residents, evacuated, precauti...","[[700, residents, evacuated, precautionary]]"


In [12]:
df_people_chunks = df_people.set_index(['doc_id', 'date_x', 'date_y', 'image', 'url', 
                     'decode_text' ,'sentence_number','sen_num_p_t'])['chunks'].apply(pd.Series).stack()

df_people_chunks = df_people_chunks.reset_index()

df_people_chunks.columns = ['doc_id', 'date_x', 'date_y', 'image', 'url', 
                     'decode_text' ,'sentence_number','sen_num_p_t','chunk_num','chunks']

df_people_chunks.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t,chunk_num,chunks
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...,0,"[280,000, refugees]"
1,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,1,They are some of the 2.7 million people who ar...,0,"[2.7, million, people, displaced]"
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ...",0,"[3,000, people]"
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...,0,"[700, people, arrested]"
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...,0,"[700, residents, evacuated, precautionary]"


In [13]:
people_units = ['people', 'peoples', 'persons', 'individuals', 'children', 'inhabitants',
                'residents', 'migrants','civilians', 'children', 'villagers', 'citizens', 'refugees', 'minors',
               'evacuees', 'kids', 'idps', 'beneficiaries', 'tenants']

In [14]:
# NEED TO FIX NUMBERS THAT ARE WRITTEN!!!!

def people_unit(chunklist):
    numbers = []
    for word in people_units: 
        for e in chunklist:
            if word == e:
                tags = pos_tag(chunklist)
                for tag in tags:
                    if tag[1] == "CD":
                        numbers.append(tag[0])
    if len(numbers) == 0 or len(numbers) == 1:
        return numbers
    else:
        try:
            if numbers[1] == 'million':
                return int(numbers[0]) * 1000000
            elif numbers[1] == 'thousand':
                return int(numbers[0]) * 1000
            elif numbers[1] == 'hundred':
                return int(numbers[0]) * 100 
        except:
            pass
        

In [15]:
df_people_chunks['people_numbers'] = df_people_chunks.chunks.apply(people_unit)

In [16]:
def get_number(x):
    try:
        number = str(x[0])
        number = number.replace(",","")
        number = int(number)
        return number
    except:
        pass

In [17]:
df_people_chunks['final_number'] = df_people_chunks.people_numbers.apply(get_number)

In [18]:
df_people_chunks.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t,chunk_num,chunks,people_numbers,final_number
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...,0,"[280,000, refugees]","[280,000]",280000.0
1,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,1,They are some of the 2.7 million people who ar...,0,"[2.7, million, people, displaced]",,
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ...",0,"[3,000, people]","[3,000]",3000.0
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...,0,"[700, people, arrested]",[700],700.0
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...,0,"[700, residents, evacuated, precautionary]",[700],700.0


In [19]:
final_people = df_people_chunks[df_people_chunks.final_number > 0]

In [20]:
final_people.shape

(17226, 12)

In [21]:
displacement_terms = ['airlifted', 'affected', 'bused',' bussed', 'displace', 'displaced', 'displaces' 'destroyed', 'evacuate', 'evacuated', 
                    'evacuating', 'evicted', 'ferried' ,'forced', 'flee','fled', 'fleeing', 'homeless', 'housed', 'moved', 'relief', 
         'relocate', 'relocated', 'rescued', 'reocuppied', 'repatriated' 'sheltered', 'shelters', 'sheltering', 'submerged', 'transferred' 
                      'transport', 'transported', 'uprooted', 'uninhabitable', 'washed']

In [22]:
def term_unit(chunklist):
    verb = []
    for word in displacement_terms: 
        for e in chunklist:
            if word == e:
                tags = pos_tag(chunklist)
                for tag in tags:
                    if tag[1] == "VB" or tag[1] == "VBD" or tag[1] == "VBG" or tag[1] == "VBN" or \
                    tag[1] == "VBP" or tag[1] == "VBZ": # or tag[1] == "JJ":
                        verb.append(tag[0])
    return verb

In [23]:
final_people['verb'] = final_people.chunks.apply(term_unit)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [24]:
final_people.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t,chunk_num,chunks,people_numbers,final_number,verb
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...,0,"[280,000, refugees]","[280,000]",280000.0,[]
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ...",0,"[3,000, people]","[3,000]",3000.0,[]
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...,0,"[700, people, arrested]",[700],700.0,[]
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...,0,"[700, residents, evacuated, precautionary]",[700],700.0,[evacuated]
5,500011,2016-10-05,20161006051500,http://chronicle.augusta.com/sites/default/fil...,http://chronicle.augusta.com/news/hurricane/20...,"On Wednesday night, Georgia Gov. Nathan Deal u...",0,"Chatham County officials urged roughly 30,000 ...",0,"[30,000, residents]","[30,000]",30000.0,[]


In [25]:
def get_verb(x):
    try:
        return str(x[0])
    except:
        pass

In [26]:
final_people['final_verb'] = final_people.verb.apply(get_verb)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [28]:
final_people_displaced = final_people[final_people.verb != None]
final_people_displaced.shape

(17226, 14)

In [29]:
final_people_displaced.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t,chunk_num,chunks,people_numbers,final_number,verb,final_verb
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...,0,"[280,000, refugees]","[280,000]",280000.0,[],
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ...",0,"[3,000, people]","[3,000]",3000.0,[],
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...,0,"[700, people, arrested]",[700],700.0,[],
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...,0,"[700, residents, evacuated, precautionary]",[700],700.0,[evacuated],evacuated
5,500011,2016-10-05,20161006051500,http://chronicle.augusta.com/sites/default/fil...,http://chronicle.augusta.com/news/hurricane/20...,"On Wednesday night, Georgia Gov. Nathan Deal u...",0,"Chatham County officials urged roughly 30,000 ...",0,"[30,000, residents]","[30,000]",30000.0,[],


In [30]:
with open("5to6_numbersandverbs.pkl", "wb") as picklefile:
    pickle.dump(final_people_displaced, picklefile)

## Conquer location

In [31]:
def word_tokenize_filter(sentence):
    words = word_tokenize(sentence)
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    filtered_words = [c for c in filtered_words if c not in string.punctuation]
    return filtered_words

In [32]:
final_people_displaced['doc_word_tokens'] = final_people_displaced.decode_text.apply(word_tokenize_filter)

In [33]:
final_people_displaced.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t,chunk_num,chunks,people_numbers,final_number,verb,final_verb,doc_word_tokens
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...,0,"[280,000, refugees]","[280,000]",280000.0,[],,"[Niger, 's, forgotten, war, Muslim, refugees, ..."
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ...",0,"[3,000, people]","[3,000]",3000.0,[],,"[Niger, 's, forgotten, war, Muslim, refugees, ..."
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...,0,"[700, people, arrested]",[700],700.0,[],,"[A, German, man, suspected, starting, forest, ..."
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...,0,"[700, residents, evacuated, precautionary]",[700],700.0,[evacuated],evacuated,"[A, German, man, suspected, starting, forest, ..."
5,500011,2016-10-05,20161006051500,http://chronicle.augusta.com/sites/default/fil...,http://chronicle.augusta.com/news/hurricane/20...,"On Wednesday night, Georgia Gov. Nathan Deal u...",0,"Chatham County officials urged roughly 30,000 ...",0,"[30,000, residents]","[30,000]",30000.0,[],,"[On, Wednesday, night, Georgia, Gov, Nathan, D..."


In [34]:
import pycountry

def one_locations(locations):
    countries = []
    for location in locations:
        try:
            country = pycountry.countries.get(name=location)
            if country.numeric > 0:
                country = country.name
                countries.append(country)
        except:
            pass
    return countries

In [35]:
final_people_displaced['countries'] = final_people_displaced.doc_word_tokens.apply(one_locations)

In [36]:
final_people_displaced.head()

Unnamed: 0,doc_id,date_x,date_y,image,url,decode_text,sentence_number,sen_num_p_t,chunk_num,chunks,people_numbers,final_number,verb,final_verb,doc_word_tokens,countries
0,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,0,Yet it has also recently become home to over 2...,0,"[280,000, refugees]","[280,000]",280000.0,[],,"[Niger, 's, forgotten, war, Muslim, refugees, ...","[Niger, Lebanon, Jordan, Niger, Nigeria, Niger..."
2,500001,NaT,20160804131500,http://www.care-international.org/files/images...,http://www.care-international.org/news/stories...,Niger's forgotten war: Muslim refugees flee bo...,2,"A month ago, he was forced to uproot his town ...",0,"[3,000, people]","[3,000]",3000.0,[],,"[Niger, 's, forgotten, war, Muslim, refugees, ...","[Niger, Lebanon, Jordan, Niger, Nigeria, Niger..."
3,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,0,A German man suspected of starting a forest fi...,0,"[700, people, arrested]",[700],700.0,[],,"[A, German, man, suspected, starting, forest, ...",[]
4,500002,NaT,20160804131500,http://www.belfasttelegraph.co.uk/news/world-n...,http://www.belfasttelegraph.co.uk/news/world-n...,A German man suspected of starting a forest fi...,1,It said 700 residents had been evacuated as a ...,0,"[700, residents, evacuated, precautionary]",[700],700.0,[evacuated],evacuated,"[A, German, man, suspected, starting, forest, ...",[]
5,500011,2016-10-05,20161006051500,http://chronicle.augusta.com/sites/default/fil...,http://chronicle.augusta.com/news/hurricane/20...,"On Wednesday night, Georgia Gov. Nathan Deal u...",0,"Chatham County officials urged roughly 30,000 ...",0,"[30,000, residents]","[30,000]",30000.0,[],,"[On, Wednesday, night, Georgia, Gov, Nathan, D...","[Georgia, Georgia, Georgia, Georgia, Georgia, ..."


In [37]:
import collections

def country(countries):
    try:
        counter = collections.Counter(countries)
        country = counter.most_common(1)[0][0]
        one_country = country.encode("ascii","ignore")
        return one_country
    except:
        pass

In [38]:
final_people_displaced['one_country'] = final_people_displaced.countries.apply(country)

In [39]:
with open("5to6_country_numbers_verb.pkl", "wb") as picklefile:
    pickle.dump(final_people_displaced, picklefile)

In [40]:
final_people_displaced.shape

(17226, 17)

## Explore

In [38]:
final_people_displaced.groupby(['one_country'])['final_number'].sum()

one_country
Afghanistan       99611.0
Australia            23.0
Austria           10510.0
Bangladesh          150.0
Belarus           24036.0
Benin              5115.0
Brazil           111892.0
Bulgaria           4640.0
Cambodia          64615.0
Cameroon          36364.0
Canada             3040.0
Chile             18600.0
China          10625014.0
Colombia          44226.0
Congo              7926.0
Croatia            5058.0
Cuba                  2.0
Ecuador            7340.0
Egypt              3903.0
France              836.0
Georgia           24120.0
Ghana               150.0
Greece             2000.0
Guatemala         74551.0
Guinea           132685.0
Haiti             41508.0
Honduras            115.0
Hungary            1254.0
India            521252.0
Indonesia         58165.0
Iraq              66467.0
Ireland            1000.0
Israel          3293172.0
Japan              7334.0
Lebanon            6106.0
Liberia         1452200.0
Madagascar          486.0
Malaysia         391976.0


In [40]:
final_people_displaced[final_people_displaced['one_country'] == "Mexico"]

Unnamed: 0,doc_id,url,date,Tag,one_country,sentence_number,sen_num_p_t,chunk_num,chunks,people_numbers,final_number,verb,final_verb,w_tokens,location
780,256,http://floodlist.com/america/mexico-floods-mic...,2015-03-18 10:25:06+00:00,Disasters,Mexico,0,At least six people died in floods in the stat...,1,"[many, 4,000, people, forced, frm]","[4,000]",4000.0,[forced],forced,"[At, least, six, people, died, floods, state, ...",[[October]]
1406,428,http://content.govdelivery.com/attachments/USD...,,Disasters,Mexico,0,"; 4,830; Pulaski Co) 2 shelters open overnight...",8,"[77,129, residents]","[77,129]",77129.0,[],,"[4,830, Pulaski, Co, 2, shelters, open, overni...","[[Pulaski, Co], [EDT], [Sections, Interstate],..."
1417,429,http://www.washingtonpost.com/national/colby-f...,,Disasters,Mexico,0,"; 4,830; Pulaski Co) 2 shelters open overnight...",8,"[77,129, residents]","[77,129]",77129.0,[],,"[4,830, Pulaski, Co, 2, shelters, open, overni...","[[Pulaski, Co], [EDT], [Sections, Interstate],..."
