In [24]:
import pandas as pd
# If we go w/ spaCy
import spacy
import re

In [25]:
nlp = spacy.load("en_core_web_md")

In [48]:
wiki = pd.read_json('data/wiki_protest.json')

In [49]:
# Maybe try entities?

test = nlp("An estimated 1,000 people gathered on May 31 On May 31, a large crowd gathered peacefully On the afternoon of May 29, approximately 150")

for ent in test.ents:
    print(ent.text, ent.label_)

An estimated 1,000 CARDINAL
May 31 DATE
May 31 DATE
the afternoon TIME
May 29 DATE


In [50]:
state_ab = pd.read_csv('data/state_ab.csv').set_index('State')['Code'].to_dict()
state_ab['New York (state)'] = 'NY'
state_ab['Washington (state)'] = 'WA'


wiki.replace({'state': state_ab}, inplace = True)

In [51]:
# function to get dates: 

def extract_dates(text):
    dates = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_=='DATE':
            dates.append(ent.text)
    return '; '.join(dates)

In [52]:
wiki['dates'] = wiki['text'].apply(extract_dates)

In [53]:
# function to get size: 

def extract_size(text):
    s = text.lower()
    for prefix in ['up to', 'more than','around','at least', 'about', 'approximately', 'over', 'almost']:
        for suffix in ['protesters', 'people', 'town residents']:
            h = re.findall('%s \d+(?:,\d+)? %s' % (prefix, suffix), s)
            if len(h) > 0:
                return h[0]
    if 'hundreds of protesters' in s:
        return 'hundreds'
    if 'hundreds of people' in s:
        return 'hundreds'
    if 'few hundred people' in s:
        return 'hundreds'
    if 'few hundred people' in s:
        return 'hundreds'
    if 'hundreds of protestors' in s:
        return 'hundreds'
    if 'over a hundred' in s:
        return 'hundred'

    for d in ['dozens of protestors', 'few dozen' , 'at least a dozen']:
        if d in s:
            return d


    size = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_=='CARDINAL':
            size.append(ent.text)
    return ', '.join(size)

In [54]:
wiki['size_str'] = wiki['text'].apply(extract_size)

In [55]:
wiki.sample(10)

Unnamed: 0,city,text,state,references,dates,size_str
250,Pasadena,"On May 30, 2020 6: 30pm Protesters gathered a...",CA,[https://www.pasadenanow.com/main/demonstrator...,May 30; 2020 6,
608,Millville,"On May 30, nearly 75 demonstrators gathered ne...",NJ,[https://www.nj.com/hudson/2020/06/bayonne-pro...,May 30,nearly 75
33,St. Louis,"On May 30, around 1,500 protesters marched do...",St. Louis,[https://www.kmov.com/news/demonstrators-march...,May 30,"around 1,500 protesters"
280,Hemet,May 31: A protest happened at the Hemet Vall...,CA,[https://myvalleynews.com/hemet-protests-turn-...,May 31,hundreds
628,Hudson Valley,"A boy holds a sign at a protest in Monroe, NY ...",NY,[https://wnyt.com/albany-new-york-news/rally-p...,May 30,
928,Manassas,"On May 30, several hundred protesters gathere...",VA,[https://www.princewilliamtimes.com/news/updat...,May 30,several hundred
818,Philadelphia,\nMain article: George Floyd protests in Phil...,PA,[],,
228,Cerritos,June 1: Hundreds protested at Gridley Park a...,CA,[http://lmlamplighter.com/hundreds-attend-floy...,June 1,Hundreds
190,Willits,There were protests in Willits.\nBay Area,CA,[https://www.willitsnews.com/2020/06/01/willit...,,
407,Bloomington,"On May 29, a group of around 10 to 15 proteste...",IL,[https://week.com/2020/05/29/protesters-agains...,May 29,around 10 to 15


In [56]:
wiki['dates'].value_counts().head(20)

May 31              133
May 30              129
                    118
June 1               82
June 2               55
June 6               46
June 3               45
June 4               40
May 29               39
June 5               24
June 7               16
May 28               10
May 31, 2020          8
June 1, 2020          8
Sunday, May 31        7
June 3, 2020          6
May 30, 2020          5
Saturday; May 30      5
June 8                4
Saturday              4
Name: dates, dtype: int64

In [57]:
def date_hack(date):
    date = date.split('; ')[0]
    if '2020' not in date:
        return date + ', 2020'
    return date

In [58]:
wiki['date'] = pd.to_datetime(wiki['dates'].apply(date_hack), errors='coerce').dt.date

In [59]:
size_words = {'hundreds' : 200,
             'dozens' : 24,
             'around 200 people': 200,
             'over 100 people': 120,
              'over 1,000 people' : 1200,
              'around 1,000 people' : 1000,
             'more than 100 people': 120,
              'about 100 people' :100,
              'over 100 people' :120,
              'about 100 people' : 100,
             'over 100 people': 120,
              'around 80 people' :  80,
              'over 100 people' : 120,
              'around 300 people' : 300,
              'more than 100 people' : 120,
              'around 50 people' : 50,
              'several hundred' : 200,
              '300' : 300,
              '500' : 500,
              'over 200 people' : 220,
              'around 50 people' : 50,
              'hundred' : 100,
              'more than 1,000 people' : 1200,
             'thousands': 2000,
              'around 100 people' : 100,
              'around 400 people' : 400,
              'around 400 people' : 400, 
              'around 100 people' : 100,
              'more than 200 people' : 220,
              'over 150 people' : 160,
              '1,000' : 1000,
              '200': 200,
              'several dozen' :24,
              'over 100 ' : 120,
              'more than 500 people' : 550
             }

def size_convert(size_str):
    size_str = size_str.lower()
    size_str = size_str.replace('protesters', 'people')
    size_str = size_str.replace('about', 'around')

    for word in size_words:
        if size_str.lower() == word:
            return size_words[word]

In [60]:
wiki['size'] = wiki['size_str'].apply(size_convert)

In [61]:
wiki[wiki['size'].isna()]['size_str'].value_counts().head(20)

                            234
One                           4
8                             3
over 300 people               3
over 100                      3
about 250 people              3
Roughly 300                   3
About 200                     3
Roughly 15                    2
More than 150                 2
approximately 150 people      2
100                           2
3,000                         2
70 to 100                     2
about 30 people               2
over 150                      2
A few hundred                 2
approximately 100 people      2
around 500 people             2
about 40 protesters           2
Name: size_str, dtype: int64

In [62]:
cl_df = pd.read_csv("data/cl_blm.csv")

cl_df.keys()

Index(['id', 'date', 'city_st', 'location', 'size', 'size_str', 'urls',
       'collection'],
      dtype='object')

In [63]:
keep = ['date', 'city_st',  'size', 'size_str', 'urls',
       'collection']

In [64]:
wiki['collection'] = 'Wikipedia'

In [65]:
wiki['city_st'] = wiki['city'] + ', ' + wiki['state']

In [66]:
wiki['urls'] = wiki['references']

In [67]:
wiki[keep].dropna(subset=['date']).to_csv('data/wiki_blm.csv')

In [68]:
wiki['state'].value_counts().head(50)

CA             159
PA              60
NY              59
VA              56
NJ              46
WA              39
OH              36
FL              35
TX              35
OR              31
IL              29
MI              26
CT              25
MA              20
NC              20
GA              18
KS              15
AL              14
IN              12
AK              12
IA              11
ME              11
MN              11
MD              11
LA               9
MT               9
CO               9
TN               9
WI               9
NH               9
AR               8
NE               8
ID               8
KY               8
AZ               8
MO               8
WY               7
NM               7
DE               7
VT               7
Chicago          6
SD               6
Puerto Rico      6
WV               6
OK               5
UT               5
ND               5
NV               4
SC               4
MS               4
Name: state, dtype: int64