In [1]:
import pandas as pd
# If we go w/ spaCy
import spacy
import re

In [2]:
nlp = spacy.load("en_core_web_md")

In [3]:
wiki = pd.read_json('data/wiki_protest.json')

In [4]:
# Maybe try entities?

test = nlp("An estimated 1,000 people gathered on May 31 On May 31, a large crowd gathered peacefully On the afternoon of May 29, approximately 150")

for ent in test.ents:
    print(ent.text, ent.label_)

An estimated 1,000 CARDINAL
May 31 DATE
May 31 DATE
the afternoon TIME
May 29 DATE


In [5]:
state_ab = pd.read_csv('data/state_ab.csv').set_index('State')['Code'].to_dict()
state_ab['New York (state)'] = 'NY'
state_ab['Washington (state)'] = 'WA'


wiki.replace({'state': state_ab}, inplace = True)

In [6]:
# function to get dates: 

def extract_dates(text):
    dates = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_=='DATE':
            dates.append(ent.text)
    return '; '.join(dates)

In [7]:
wiki['dates'] = wiki['text'].apply(extract_dates)

In [8]:
# function to get size: 

def extract_size(text):
    doc = nlp(text)
    
    
    s = text.lower()
    for prefix in ['up to', 'more than','around','at least', 'about', 'approximately', 'over', 'almost']:
        for suffix in ['protesters', 'people', 'town residents']:
            h = re.findall('%s \d+(?:,\d+)? %s' % (prefix, suffix), s)
            if len(h) > 0:
                return h[0]
    if 'hundreds of protesters' in s:
        return 'hundreds'
    if 'hundreds of people' in s:
        return 'hundreds'
    if 'few hundred people' in s:
        return 'hundreds'
    if 'few hundred people' in s:
        return 'hundreds'
    if 'hundreds of protestors' in s:
        return 'hundreds'
    if 'over a hundred' in s:
        return 'hundred'

    for d in ['dozens of protestors', 'few dozen' , 'at least a dozen']:
        if d in s:
            return d


    size = []
    
    for ent in doc.ents:
        if ent.label_=='CARDINAL':
            size.append(ent.text)
    return ', '.join(size)

In [9]:
wiki['size_str'] = wiki['text'].apply(extract_size)

In [10]:
wiki.sample(10)

Unnamed: 0,city,text,state,references,dates,size_str
138,Phoenix,Phoenix saw demonstrations for four straight n...,AZ,[https://www.azcentral.com/story/news/local/ph...,May 28,Hundreds
748,North Canton,About 50 people withstood a rainstorm and pro...,OH,[https://www.cantonrep.com/news/20200604/prote...,June 4,about 50 people
572,Grand Island,A group estimated at 300 people marched toward...,NE,[https://www.theindependent.com/news/local/clo...,"June 1, 2020",300
330,Greenwich,Several dozen protesters rallied near the Gree...,CT,[https://www.greenwichtime.com/local/article/W...,June 1,Several dozen
360,Palm Coast,"On June 3, around 200 protesters peacefully p...",FL,[https://www.news-journalonline.com/news/20200...,June 3,around 200 protesters
455,Mason City,May 31: About 30 people held signs along North...,IA,[https://www.kimt.com/content/news/Protesters-...,May 31,about 30 people
153,Bay Area,"Berkeley: 3,000-4,000 protesters marched in So...",CA,[https://www.berkeleyside.com/2020/06/06/thous...,June 6,3000
235,La Crescenta-Montrose,June 1: Around 20 protested at Demoret Park....,CA,[https://www.montrosepress.com/news/residents-...,June 1; June 2:,around 100 protesters
428,Tinley Park,The Chicago suburb ordered a curfew on the nig...,IL,[https://www.nbcchicago.com/news/local/chicago...,,
375,Palm Beach (town),"On May 31, the Palm Beach Police Department i...",FL,[https://www.palmbeachpost.com/news/20200531/w...,May 31; Mar-A-Lago,


In [11]:
wiki['dates'].value_counts().head(20)

May 31              136
                    128
May 30              128
June 1               82
June 2               55
June 6               48
June 3               44
June 4               40
May 29               38
June 5               24
June 7               15
May 28               10
May 31, 2020          8
June 1, 2020          8
Sunday, May 31        7
June 3, 2020          6
Saturday; May 30      5
May 30, 2020          5
June 8                4
Friday; May 29        4
Name: dates, dtype: int64

In [12]:
def date_hack(date):
    date = date.split('; ')[0]
    if '2020' not in date:
        return date + ', 2020'
    return date

In [13]:
wiki['date'] = pd.to_datetime(wiki['dates'].apply(date_hack), errors='coerce').dt.date

In [14]:
size_words = {'hundreds' : 200,
             'dozens' : 24,
             'around 200 people': 200,
             'over 100 people': 120,
              'over 1,000 people' : 1200,
              'around 1,000 people' : 1000,
             'more than 100 people': 120,
              'about 100 people' :100,
              'over 100 people' :120,
              'about 100 people' : 100,
             'over 100 people': 120,
              'around 80 people' :  80,
              'over 100 people' : 120,
              'around 300 people' : 300,
              'more than 100 people' : 120,
              'around 50 people' : 50,
              'several hundred' : 200,
              '300' : 300,
              '500' : 500,
              'over 200 people' : 220,
              'around 50 people' : 50,
              'hundred' : 100,
              'more than 1,000 people' : 1200,
             'thousands': 2000,
              'around 100 people' : 100,
              'around 400 people' : 400,
              'around 400 people' : 400, 
              'around 100 people' : 100,
              'more than 200 people' : 220,
              'over 150 people' : 160,
              '1,000' : 1000,
              '200': 200,
              'several dozen' :24,
              'over 100 ' : 120,
              'more than 500 people' : 550
             }

def size_convert(size_str):
    size_str = size_str.lower()
    size_str = size_str.replace('protesters', 'people')
    size_str = size_str.replace('about', 'around')

    for word in size_words:
        if size_str.lower() == word:
            return size_words[word]

In [15]:
wiki['size'] = wiki['size_str'].apply(size_convert)

In [16]:
wiki[wiki['size'].isna()]['size_str'].value_counts().head(20)

                            238
One                           4
about 150 protesters          4
About 200                     3
about 250 people              3
8                             3
Roughly 300                   3
about 500 people              3
over 100                      3
at least 100 protesters       2
about 30 people               2
approximately 200 people      2
More than 200                 2
About 50 to 60                2
nearly 100                    2
more than 300 protesters      2
over 300 people               2
More than 1000                2
more than 300 people          2
12                            2
Name: size_str, dtype: int64

In [17]:
cl_df = pd.read_csv("data/cl_blm.csv")

cl_df.keys()

Index(['id', 'date', 'city_st', 'location', 'size', 'size_str', 'urls',
       'collection'],
      dtype='object')

In [18]:
keep = ['date', 'city_st',  'size', 'size_str', 'urls',
       'collection']

In [19]:
wiki['collection'] = 'Wikipedia'

In [20]:
wiki['city_st'] = wiki['city'] + ', ' + wiki['state']

In [21]:
wiki['urls'] = wiki['references']

In [22]:
wiki[keep].dropna(subset=['date']).to_csv('data/wiki_blm.csv')

In [23]:
wiki['state'].value_counts().head(50)

CA             162
NY              79
PA              60
VA              56
NJ              46
WA              39
OH              36
FL              35
TX              35
OR              31
IL              29
MI              26
CT              25
NC              23
MA              20
GA              18
KS              15
AL              14
IN              12
AK              12
ME              11
MN              11
MD              11
IA              11
LA               9
NH               9
TN               9
CO               9
WI               9
MT               9
AR               8
WY               8
AZ               8
WV               8
NE               8
KY               8
ID               8
NM               7
VT               7
DE               7
SD               6
Puerto Rico      6
Chicago          6
UT               5
OK               5
ND               5
NV               4
SC               4
MS               4
RI               4
Name: state, dtype: int64