In [129]:
import pandas as pd
# If we go w/ spaCy
import spacy
import re

In [130]:
nlp = spacy.load("en_core_web_md")

In [131]:
wiki = pd.read_json('data/wiki_protest.json')

In [132]:
# Maybe try entities?

test = nlp("An estimated 1,000 people gathered on May 31 On May 31, a large crowd gathered peacefully On the afternoon of May 29, approximately 150")

for ent in test.ents:
    print(ent.text, ent.label_)

An estimated 1,000 CARDINAL
May 31 DATE
May 31 DATE
the afternoon TIME
May 29 DATE


In [154]:
#clean up state

wiki['state'] = wiki['state'].str.replace(' \(state\)','')
wiki['state'].value_counts().head(20)

California        159
Pennsylvania       60
New York           59
Virginia           56
New Jersey         46
Washington         39
Ohio               36
Florida            35
Texas              35
Oregon             31
Illinois           29
Michigan           26
Connecticut        25
North Carolina     20
Massachusetts      20
Georgia            18
Kansas             15
Alabama            14
Alaska             12
Indiana            12
Name: state, dtype: int64

In [155]:
# function to get dates: 

def extract_dates(text):
    dates = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_=='DATE':
            dates.append(ent.text)
    return '; '.join(dates)

In [156]:
wiki['dates'] = wiki['text'].apply(extract_dates)

In [157]:
# function to get size: 

def extract_size(text):
    s = text.lower()
    for prefix in ['up to', 'more than','around','at least', 'about', 'approximately', 'over', 'almost']:
        for suffix in ['protesters', 'people', 'town residents']:
            h = re.findall('%s \d+(?:,\d+)? %s' % (prefix, suffix), s)
            if len(h) > 0:
                return h[0]
    if 'hundreds of protesters' in s:
        return 'hundreds'
    if 'hundreds of people' in s:
        return 'hundreds'
    if 'few hundred people' in s:
        return 'hundreds'
    if 'few hundred people' in s:
        return 'hundreds'
    if 'hundreds of protestors' in s:
        return 'hundreds'
    if 'over a hundred' in s:
        return 'hundred'

    for d in ['dozens of protestors', 'few dozen' , 'at least a dozen']:
        if d in s:
            return d


    size = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_=='CARDINAL':
            size.append(ent.text)
    return ', '.join(size)

In [158]:
wiki['size_str'] = wiki['text'].apply(extract_size)

In [159]:
wiki.sample(10)

Unnamed: 0,city,text,state,references,dates,size_str,date,size,collection,city_st,urls
906,Orange,"On June 4, hundred of protesters marched alon...",Virginia,[https://www.dailyprogress.com/orangenews/peac...,June 4,hundred,2020-06-04,100.0,Wikipedia,"Orange, Virginia",[https://www.dailyprogress.com/orangenews/peac...
508,Bel Air,"On June 4, hundreds of protesters marched down...",Maryland,[https://foxbaltimore.com/news/local/hundreds-...,June 4,hundreds,2020-06-04,200.0,Wikipedia,"Bel Air, Maryland",[https://foxbaltimore.com/news/local/hundreds-...
920,Arlington,"Given the county's proximity to Washington, D...",Virginia,[https://www.cnn.com/us/live-news/george-floyd...,,,2020-01-01,,Wikipedia,"Arlington, Virginia",[https://www.cnn.com/us/live-news/george-floyd...
549,Lansing and East Lansing,\nProtesters in East Lansing on June 2\nSevera...,Michigan,[https://www.lansingstatejournal.com/story/new...,June 2; May 31; June 1,about 100 protesters,2020-06-02,100.0,Wikipedia,"Lansing and East Lansing, Michigan",[https://www.lansingstatejournal.com/story/new...
201,June 2020 protest against police brutality in ...,A protest was held at UN Plaza the afternoon ...,California,[http://abc7news.com/george-floyd-protest-san-...,May 30; June 6,,2020-05-30,,Wikipedia,June 2020 protest against police brutality in ...,[http://abc7news.com/george-floyd-protest-san-...
305,Pacific Beach,June 6: Hundreds of surfers and other suppor...,California,[https://www.pbmonthly.net/news/story/2020-06-...,June 6,Hundreds,2020-06-06,200.0,Wikipedia,"Pacific Beach, California",[https://www.pbmonthly.net/news/story/2020-06-...
471,Manhattan,"On May 30, protesters marched down Bluemont Av...",Kansas,[https://themercury.com/news/demonstrators-pea...,May 30; June 1,,2020-05-30,,Wikipedia,"Manhattan, Kansas",[https://themercury.com/news/demonstrators-pea...
547,Grand Rapids,\nProtests on Fulton Street in Grand Rapids on...,Michigan,[https://www.mlive.com/news/grand-rapids/2020/...,"May 30, 2020; June 1\n; Saturday; May 30",Thousands,2020-05-30,2000.0,Wikipedia,"Grand Rapids, Michigan",[https://www.mlive.com/news/grand-rapids/2020/...
720,Findlay,Dozens of people protested peacefully in fron...,Ohio,[https://www.wtol.com/article/news/local/findl...,,Dozens,2020-01-01,24.0,Wikipedia,"Findlay, Ohio",[https://www.wtol.com/article/news/local/findl...
575,Grand Island,A group estimated at 300 people marched toward...,Nebraska,[https://www.theindependent.com/news/local/clo...,"June 1, 2020",300,2020-06-01,300.0,Wikipedia,"Grand Island, Nebraska",[https://www.theindependent.com/news/local/clo...


In [160]:
wiki['dates'].value_counts().head(20)

May 31              133
May 30              129
                    118
June 1               82
June 2               55
June 6               46
June 3               45
June 4               40
May 29               39
June 5               24
June 7               16
May 28               10
May 31, 2020          8
June 1, 2020          8
Sunday, May 31        7
June 3, 2020          6
Saturday; May 30      5
May 30, 2020          5
Saturday              4
Friday; May 29        4
Name: dates, dtype: int64

In [161]:
def date_hack(date):
    date = date.split('; ')[0]
    if '2020' not in date:
        return date + ', 2020'
    return date

In [162]:
wiki['date'] = pd.to_datetime(wiki['dates'].apply(date_hack), errors='coerce').dt.date

In [164]:
size_words = {'hundreds' : 200,
             'dozens' : 24,
             'around 200 people': 200,
             'over 100 people': 120,
              'over 1,000 people' : 1200,
              'around 1,000 people' : 1000,
             'more than 100 people': 120,
              'about 100 people' :100,
              'over 100 people' :120,
              'about 100 people' : 100,
             'over 100 people': 120,
              'around 80 people' :  80,
              'over 100 people' : 120,
              'around 300 people' : 300,
              'more than 100 people' : 120,
              'around 50 people' : 50,
              'several hundred' : 200,
              '300' : 300,
              '500' : 500,
              'over 200 people' : 220,
              'around 50 people' : 50,
              'hundred' : 100,
              'more than 1,000 people' : 1200,
             'thousands': 2000,
              'around 100 people' : 100,
              'around 400 people' : 400,
              'around 400 people' : 400, 
              'around 100 people' : 100,
              'more than 200 people' : 220,
              'over 150 people' : 160,
              '1,000' : 1000,
              '200': 200,
              'several dozen' :24,
              'over 100 ' : 120,
              'more than 500 people' : 550
             }

def size_convert(size_str):
    size_str = size_str.lower()
    size_str = size_str.replace('protesters', 'people')
    size_str = size_str.replace('about', 'around')

    for word in size_words:
        if size_str.lower() == word:
            return size_words[word]

In [165]:
wiki['size'] = wiki['size_str'].apply(size_convert)

In [166]:
wiki[wiki['size'].isna()]['size_str'].value_counts().head(20)

                                234
One                               4
about 250 people                  3
About 200                         3
over 300 people                   3
over 100                          3
Roughly 300                       3
8                                 3
approximately 1,000               2
about 150 people                  2
200–300                           2
nearly 100                        2
about 150 protesters              2
400-500                           2
approximately 300 protesters      2
more than 300 people              2
four                              2
A few hundred                     2
More than 1000                    2
about 30 people                   2
Name: size_str, dtype: int64

In [168]:
cl_df = pd.read_csv("data/cl_blm.csv")

cl_df.keys()

Index(['id', 'date', 'city_st', 'location', 'size', 'size_str', 'urls',
       'collection'],
      dtype='object')

In [169]:
keep = ['date', 'city_st',  'size', 'size_str', 'urls',
       'collection']

In [170]:
wiki['collection'] = 'Wikipedia'

In [171]:
wiki['city_st'] = wiki['city'] + ', ' + wiki['state']

In [172]:
wiki['urls'] = wiki['references']

In [176]:
wiki[keep].dropna(subset=['date']).to_csv('data/wiki_blm.csv')

In [153]:
wiki['state'].value_counts().head(50)

California        159
Pennsylvania       60
New York           59
Virginia           56
New Jersey         46
Washington         39
Ohio               36
Florida            35
Texas              35
Oregon             31
Illinois           29
Michigan           26
Connecticut        25
North Carolina     20
Massachusetts      20
Georgia            18
Kansas             15
Alabama            14
Alaska             12
Indiana            12
Maryland           11
Minnesota          11
Iowa               11
Maine              11
Colorado            9
New Hampshire       9
Louisiana           9
Wisconsin           9
Tennessee           9
Montana             9
Arkansas            8
Kentucky            8
Arizona             8
Nebraska            8
Missouri            8
Idaho               8
New Mexico          7
Delaware            7
Vermont             7
Wyoming             6
South Dakota        6
Chicago             6
Puerto Rico         6
West Virginia       6
North Dakota        5
Oklahoma  