In [1]:

import pandas as pd
import requests
import re
import ast
import numpy as np

# Count Love

In [2]:
# main data
cl_url = "https://countlove.org/data/events.json"
r = requests.get(cl_url)
cl_df = pd.DataFrame(r.json()["data"])

In [3]:
# article links
cl_links = "https://countlove.org/data/links.json"
r = requests.get(cl_links)

In [4]:
# parse article links so they can be merged into main data file

l = []
j = r.json()

for key in j.keys():
    # display(key)
    id = j[key]
    urls = []
    for date in id.keys():
        urls.append(id[date][0])

    m = {"id": key, "urls": urls}

    l.append(m)
link_df = pd.DataFrame(l)
link_df['url_count'] = link_df['urls'].str.len()

In [5]:
# merge cl data and links
link_df['id'] = link_df['id'].astype(int)
cl_df2 = pd.merge(cl_df, link_df, left_on='id', right_on='id')

In [6]:
# turn tag list into strings for matching
def tag2(tags):
    try:
        return ', '.join([str(i) for i in tags])
    except:
        return str(tags)

In [7]:
# variable cleaning

cl_df2['date'] = pd.to_datetime(cl_df2['date'])
cl_df2['collection'] = 'Count Love'
cl_df2['size_str'] = cl_df2['count'].astype(str)
cl_df2['size'] = cl_df2['count']

In [8]:
#extract floyed protests
blm_screen = cl_df2['tags'].apply(tag2) == '2, 19, 4661, 4662'
cl_blm = cl_df2[blm_screen].copy()

In [9]:
len(cl_blm)

3467

In [10]:
#remove extra pieces that aren't city/state

def location_fix(location):
    locations = location.split(', ')
    if len(locations) <=2:
        return location
    return ', '.join(locations[-2:])

cl_blm['city_st'] = cl_blm['location'].apply(location_fix)

In [11]:
#wrong date on early event

cl_blm.loc[cl_blm['id'] == 30289, 'date'] = pd.to_datetime('2020-05-30')


In [12]:
date_start = pd.to_datetime('2020-05-24')
cl_blm[pd.to_datetime(cl_blm['date']) >= date_start]



Unnamed: 0,id,date,location,count,lat,lon,event,source,tags,relevant,total_articles,urls,url_count,collection,size_str,size,city_st
24446,30289,2020-05-30,"Tulsa, OK",,36.154,-95.993,Uncategorized,https://kfor.com/news/local/protester-hit-by-v...,"[2, 19, 4661, 4662]",True,1,[https://kfor.com/news/local/protester-hit-by-...,1,Count Love,,,"Tulsa, OK"
24447,29731,2020-05-26,"Powderhorn, Minneapolis, MN",1000.0,44.940,-93.263,Uncategorized,https://www.startribune.com/police-protesters-...,"[2, 19, 4661, 4662]",True,57,[https://www.grandforksherald.com/news/governm...,7,Count Love,1000.0,1000.0,"Minneapolis, MN"
24452,29741,2020-05-26,"Ann Arbor, MI",20.0,42.281,-83.743,Uncategorized,https://www.mlive.com/news/ann-arbor/2020/05/y...,"[2, 19, 4661, 4662]",True,4,[https://www.mlive.com/news/ann-arbor/2020/05/...,4,Count Love,20.0,20.0,"Ann Arbor, MI"
24453,29744,2020-05-26,"St. Paul, MN",,44.954,-93.090,Uncategorized,https://www.cbsnews.com/news/protesters-gather...,"[2, 19, 4661, 4662]",True,2,[https://www.cbsnews.com/news/protesters-gathe...,2,Count Love,,,"St. Paul, MN"
24456,29792,2020-05-26,"Emancipation Park, Houston, TX",,29.736,-95.364,Uncategorized,https://www.houstonchronicle.com/news/houston-...,"[2, 19, 4661, 4662]",True,3,[https://www.houstonchronicle.com/news/houston...,2,Count Love,,,"Houston, TX"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27757,33481,2020-06-21,"Richmond, VA",,37.541,-77.436,Uncategorized,https://www.virginiamercury.com/blog-va/police...,"[2, 19, 4661, 4662]",True,1,[https://www.virginiamercury.com/blog-va/polic...,1,Count Love,,,"Richmond, VA"
27761,33509,2020-06-21,"Florissant, MO",,38.789,-90.323,Uncategorized,https://www.kmov.com/news/2-protesters-arreste...,"[2, 19, 4661, 4662]",True,1,[https://www.kmov.com/news/2-protesters-arrest...,1,Count Love,,,"Florissant, MO"
27763,33478,2020-06-22,"State Capitol, Boston, MA",100.0,42.359,-71.064,Uncategorized,https://www.wcvb.com/article/hundreds-march-to...,"[2, 19, 4661, 4662]",True,1,[https://www.wcvb.com/article/hundreds-march-t...,1,Count Love,100.0,100.0,"Boston, MA"
27767,33494,2020-06-22,"New Canaan, CT",30.0,41.147,-73.495,Uncategorized,https://www.ncadvertiser.com/news/article/Prot...,"[2, 19, 4661, 4662]",True,1,[https://www.ncadvertiser.com/news/article/Pro...,1,Count Love,30.0,30.0,"New Canaan, CT"


In [13]:
keep = ['id', 'date', 'city_st', 'location', 'size', 'size_str', 'urls', 'collection']

In [14]:
cl_blm[keep].to_csv('data/cl_blm.csv', index=False)

# Elephrame
(Scraping using "BLM Scrape Elephrame.ipynb")

In [63]:
ele_df = pd.read_csv('data/Elephrame.csv')
ele_df['date'] = pd.to_datetime(ele_df['Date'])

In [64]:
def str_2_list(links):
    return ast.literal_eval(links)

ele_df['urls'] = ele_df['Links'].apply(str_2_list)

In [65]:
ele_df['city_st'] = ele_df['Location'].apply(location_fix)



In [66]:
# Extract numberic component of size_str for size est.


def size_est(size):
    try:
        return int(re.findall('(\d+)', size)[0])
    except:
        if 'undreds' in size:
            return 200
        if 'housands' in size:
            return 2000
        if 'ozens' in size:
            return 24

ele_df['size'] = ele_df['Size'].apply(size_est)


In [67]:
ele_df.rename(
    columns={
        "ID": "id",
        "Collection": "collection",
        "Size": "size_str",
        "Description": "description",
        "Location" : "location"
    },
    inplace=True,
)

In [68]:
ele_df[keep].to_csv('data/ele.csv', index=False)

CCC

In [15]:
may_sheet = ('https://docs.google.com/spreadsheets/d/'
               '1pZo5p9EKZJ87IvPVjIp50nQQPET_ucV8vKVfZ6NpOvg/'
               'edit#gid=1571725208')

may_sheet = may_sheet.replace('edit#gid','export?format=csv&gid')

In [16]:
may_df = pd.read_csv(may_sheet)

In [17]:
june_sheet = ('https://docs.google.com/spreadsheets/d/'
              '1-HM-bFsnTd9omYOrB8JOMeQ0XzPvCaVaADKqXQ_RpXg/'
              'edit#gid=0')
june_sheet = june_sheet.replace('edit#gid','export?format=csv&gid')

In [28]:
june_df = pd.read_csv(june_sheet, header=1)

In [29]:
ccc_df = may_df.append(june_df, ignore_index=True)

In [30]:
ccc_df['size_str'] = ccc_df['EstimateText']

In [31]:
ccc_df['Date'] = ccc_df['Date'].replace({'2020-045-3': '2020-05-30', 
                                         '6/114':'2020-06-14',
                                        '6/13' : '2020-06-13'})

In [32]:
ccc_df.keys()

Index(['City/Town', 'Location', 'County', 'StateTerritory', 'Country', 'Date',
       'EstimateText', 'EstimateLow', 'BestGuess', 'EstimateHigh',
       'AdjustedLow', 'AdjustedHigh', 'Actor', 'Claim', 'Pro(2)/Anti(1)',
       'EventType', 'ReportedArrests', 'ReportedParticipantInjuries',
       'ReportedPoliceInjuries', 'ReportedPropertyDamage', 'TownsCities',
       'Events', 'Source1', 'Source2', 'Source3', 'Misc.', 'TearGas',
       'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30',
       'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34',
       'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38',
       'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42',
       'Unnamed: 43', 'Unnamed: 44', 'size_str'],
      dtype='object')

In [33]:
ccc_df['Date'].dropna().value_counts()

2020-06-06    389
2020-05-30    332
2020-05-31    288
2020-06-19    250
2020-06-07    234
2020-06-02    229
2020-06-01    223
2020-06-05    218
2020-06-13    190
2020-06-03    157
2020-05-29    151
2020-06-12    129
2020-06-14    117
2020-06-04    114
2020-06-08     63
2020-05-28     52
2020-06-09     48
2020-06-11     40
2020-06-10     34
2020-06-18     19
2020-06-21     17
2020-06-20     14
2020-05-27     11
2020-06-15     10
2020-06-17      9
2020-06-16      8
2020-05-26      7
2020-06-26      2
2020-05-25      1
2020-06 07      1
2020-06-27      1
Name: Date, dtype: int64

In [34]:
ccc_df['city_st'] = ccc_df['City/Town'] +', ' + ccc_df['StateTerritory']
ccc_df['location'] = ccc_df['Location'].fillna('') + ', ' + ccc_df['city_st']
ccc_df['date'] = pd.to_datetime(ccc_df['Date'])
ccc_df['collection'] = 'CCC'
ccc_df['id'] = None

#remove blank rows from Excel
ccc_df.dropna(subset=['city_st'], inplace=True)

In [35]:
ccc_df['date'].value_counts()

2020-06-06    389
2020-05-30    332
2020-05-31    288
2020-06-19    250
2020-06-07    235
2020-06-02    229
2020-06-01    223
2020-06-05    218
2020-06-13    190
2020-06-03    157
2020-05-29    151
2020-06-12    129
2020-06-14    117
2020-06-04    114
2020-06-08     62
2020-05-28     52
2020-06-09     48
2020-06-11     39
2020-06-10     34
2020-06-18     19
2020-06-21     17
2020-06-20     14
2020-05-27     11
2020-06-15     10
2020-06-17      9
2020-06-16      8
2020-05-26      7
2020-06-26      2
2020-05-25      1
2020-06-27      1
Name: date, dtype: int64

In [36]:
#needs to be better average


def ccc_size(row):
    '''if np.isnan(row['BestGuess']) == False:
        return row['BestGuess']'''
    
    if row['EstimateLow'] == row['EstimateHigh']:
        return row['EstimateLow']
    
    try:
        return (row['EstimateLow']*1.1 + row['EstimateHigh']*.9) / 2
    except:
        pass
    
    return row['EstimateLow']

ccc_df['size'] = ccc_df.apply(ccc_size, axis=1)


In [37]:
#combine URLS across three variables
def fix_urls(row):
    urls  = []
    for s in ['Source1', 'Source2', 'Source3']:
        if isinstance(row[s], float) == False:
            urls.append(row[s])
    urls = list(set(urls))
    return urls
        
ccc_df['urls'] = ccc_df.apply(fix_urls, axis=1) 

In [38]:
ccc_df[keep].to_csv('data/ccc_blm.csv', index=False)