In [30]:
%load_ext blackcellmagic

import pandas as pd
import requests
import re
import ast
import numpy as np

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


# Count Love

In [31]:
# main data: download the Count Love events file and load its "data" list
cl_url = "https://countlove.org/data/events.json"
# timeout so a stalled connection cannot hang the notebook indefinitely
r = requests.get(cl_url, timeout=60)
# fail here on an HTTP error instead of later with an opaque JSON/KeyError
r.raise_for_status()
cl_df = pd.DataFrame(r.json()["data"])

In [32]:
# article links: download the per-event article links file
# NOTE: `r` is reused from the events request; the parsing cell below reads
# this response, so this cell must be the last one to assign `r`.
cl_links = "https://countlove.org/data/links.json"
# timeout so a stalled connection cannot hang the notebook indefinitely
r = requests.get(cl_links, timeout=60)
# fail here on an HTTP error instead of later when the body is parsed
r.raise_for_status()

In [33]:
# parse article links so they can be merged into main data file
#
# `r` must hold the links.json response (the same name is also used for the
# events response earlier in the notebook).
# The payload is read as {event id: {date key: [url, ...]}}; only the first
# element of each dated entry is kept, as before.
links_by_event = r.json()

# One record per event id. Built with comprehensions so that no loop variable
# (previously `id`, which shadowed the builtin) leaks into the notebook namespace.
link_records = [
    {"id": event_id, "urls": [entry[0] for entry in dated_links.values()]}
    for event_id, dated_links in links_by_event.items()
]

# Explicit columns keep the frame well-formed even if the payload is empty.
link_df = pd.DataFrame(link_records, columns=["id", "urls"])
link_df['url_count'] = link_df['urls'].str.len()

In [34]:
# merge cl data and links
# link ids arrive as JSON object keys (strings); cast them before joining on 'id'
link_df['id'] = link_df['id'].astype(int)
# default (inner) join: only ids present in both frames are kept
cl_df2 = cl_df.merge(link_df, on='id')

In [35]:
# turn tag list into strings for matching
def tag2(tags):
    """Return the tags joined as a single ', '-separated string.

    Parameters
    ----------
    tags : iterable or any
        Usually a list of tag ids. Non-iterable values (e.g. None or NaN)
        are accepted as well.

    Returns
    -------
    str
        ``'2, 19'`` for ``[2, 19]``; ``str(tags)`` when ``tags`` cannot be
        iterated.
    """
    try:
        return ', '.join(str(tag) for tag in tags)
    except TypeError:
        # Not iterable (missing value): fall back to its plain string form.
        # Only TypeError is caught so interrupts and real bugs still surface.
        return str(tags)

In [36]:
# variable cleaning: parse dates, label the collection, and expose the
# 'count' column under the shared 'size' / 'size_str' names

cl_df2 = cl_df2.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    collection='Count Love',
    size_str=lambda frame: frame['count'].astype(str),
    size=lambda frame: frame['count'],
)

In [37]:
# extract Floyd protests: rows whose tag list is exactly 2, 19, 4661, 4662
blm_screen = cl_df2['tags'].map(tag2).eq('2, 19, 4661, 4662')
# .copy() so later column assignments act on an independent frame
cl_blm = cl_df2.loc[blm_screen].copy()

In [38]:
# number of events that passed the tag screen
cl_blm.shape[0]

1835

In [39]:
# remove extra pieces that aren't city/state

def location_fix(location):
    """Reduce a ', '-separated location to its last two parts.

    ``'Powderhorn, Minneapolis, MN'`` becomes ``'Minneapolis, MN'``; values
    with two or fewer parts are returned unchanged.

    Parameters
    ----------
    location : str or missing value
        Location text split on ``', '``.

    Returns
    -------
    str or the input itself
        The trimmed location. Non-string input (None, NaN) is returned as-is
        so missing locations stay missing instead of raising AttributeError.
    """
    if not isinstance(location, str):
        return location
    # Joining the last two parts is a no-op when there are two or fewer.
    return ', '.join(location.split(', ')[-2:])

# city/state portion of each Count Love location
cl_blm['city_st'] = cl_blm['location'].map(location_fix)

In [40]:
# wrong date on early event: overwrite the date of event id 30289

is_misdated_event = cl_blm['id'].eq(30289)
cl_blm.loc[is_misdated_event, 'date'] = pd.Timestamp('2020-05-30')


In [41]:
# Display events dated on or after the start date.
# NOTE(review): the filtered frame is only displayed, never assigned, so the
# export below still contains every row of cl_blm — confirm that is intended.
date_start = pd.Timestamp('2020-05-24')
on_or_after_start = pd.to_datetime(cl_blm['date']).ge(date_start)
cl_blm.loc[on_or_after_start]



Unnamed: 0,id,date,location,count,lat,lon,event,source,tags,relevant,total_articles,urls,url_count,collection,size_str,size,city_st
24443,30289,2020-05-30,"Tulsa, OK",,36.154,-95.993,Uncategorized,https://kfor.com/news/local/protester-hit-by-v...,"[2, 19, 4661, 4662]",True,1,[https://kfor.com/news/local/protester-hit-by-...,1,Count Love,,,"Tulsa, OK"
24444,29731,2020-05-26,"Powderhorn, Minneapolis, MN",1000.0,44.940,-93.263,Uncategorized,https://www.startribune.com/police-protesters-...,"[2, 19, 4661, 4662]",True,53,[https://www.grandforksherald.com/news/governm...,6,Count Love,1000.0,1000.0,"Minneapolis, MN"
24449,29741,2020-05-26,"Ann Arbor, MI",20.0,42.281,-83.743,Uncategorized,https://www.mlive.com/news/ann-arbor/2020/05/y...,"[2, 19, 4661, 4662]",True,4,[https://www.mlive.com/news/ann-arbor/2020/05/...,4,Count Love,20.0,20.0,"Ann Arbor, MI"
24450,29744,2020-05-26,"St. Paul, MN",,44.954,-93.090,Uncategorized,https://www.cbsnews.com/news/protesters-gather...,"[2, 19, 4661, 4662]",True,2,[https://www.cbsnews.com/news/protesters-gathe...,2,Count Love,,,"St. Paul, MN"
24453,29792,2020-05-26,"Emancipation Park, Houston, TX",,29.736,-95.364,Uncategorized,https://www.houstonchronicle.com/news/houston-...,"[2, 19, 4661, 4662]",True,3,[https://www.houstonchronicle.com/news/houston...,2,Count Love,,,"Houston, TX"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25923,31332,2020-06-08,"Houston, TX",100.0,29.760,-95.370,Uncategorized,https://www.ocregister.com/southern-california...,"[2, 19, 4661, 4662]",True,1,[https://www.ocregister.com/southern-californi...,1,Count Love,100.0,100.0,"Houston, TX"
25925,31347,2020-06-08,"Desert Hot Springs, CA",50.0,33.961,-116.502,Uncategorized,https://www.desertsun.com/story/news/2020/06/0...,"[2, 19, 4661, 4662]",True,1,[https://www.desertsun.com/story/news/2020/06/...,1,Count Love,50.0,50.0,"Desert Hot Springs, CA"
25926,31348,2020-06-08,"Palm Springs, CA",,33.830,-116.545,Uncategorized,https://www.desertsun.com/story/news/2020/06/0...,"[2, 19, 4661, 4662]",True,1,[https://www.desertsun.com/story/news/2020/06/...,1,Count Love,,,"Palm Springs, CA"
25927,31351,2020-06-08,"Manchester, NJ",200.0,39.971,-74.339,Uncategorized,https://www.app.com/story/news/local/community...,"[2, 19, 4661, 4662]",True,1,[https://www.app.com/story/news/local/communit...,1,Count Love,200.0,200.0,"Manchester, NJ"


In [42]:
# columns written to every collection's CSV export
keep = [
    "id",
    "date",
    "city_st",
    "location",
    "size",
    "size_str",
    "urls",
    "collection",
]

In [43]:
# write the shared columns of the Count Love subset, without the index
cl_blm.to_csv('cl_blm.csv', columns=keep, index=False)

# Elephrame
(Scraped using the notebook "BLM Scrape Elephrame.ipynb".)

In [44]:
# load the scraped Elephrame file and add a parsed 'date' column
# (the original 'Date' column is kept)
ele_df = pd.read_csv('Elephrame.csv').assign(
    date=lambda frame: pd.to_datetime(frame['Date'])
)

In [45]:
def str_2_list(links):
    """Parse the text form of a Python list literal back into a list.

    Parameters
    ----------
    links : str or missing value
        Text such as ``"['http://a', 'http://b']"``.

    Returns
    -------
    list
        The parsed value, or ``[]`` when ``links`` is missing (None/NaN) or
        blank, so rows without links get an empty list instead of raising.

    Raises
    ------
    ValueError, SyntaxError
        From ``ast.literal_eval`` when the text is not a valid literal.
    """
    if not isinstance(links, str) or not links.strip():
        return []
    # literal_eval only evaluates literals, never arbitrary code
    return ast.literal_eval(links)

# parse each row's 'Links' text into a list of URLs
ele_df['urls'] = ele_df['Links'].map(str_2_list)

In [46]:
# city/state portion of each Elephrame location
ele_df['city_st'] = ele_df['Location'].map(location_fix)



In [47]:
# Extract numeric component of size_str for size est.


def size_est(size):
    """Estimate a crowd size from a free-text size description.

    The first number in the text wins; thousands separators are honoured so
    ``'1,000+'`` gives 1000 rather than 1. Without a number, a few vague
    words map to fixed guesses: hundreds -> 200, thousands -> 2000,
    dozens -> 24.

    Parameters
    ----------
    size : str or missing value
        Size description.

    Returns
    -------
    int or None
        The estimate, or None when ``size`` is not a string (e.g. NaN) or
        contains neither a number nor a recognised word.
    """
    if not isinstance(size, str):
        # Missing values previously raised TypeError in the keyword checks.
        return None

    # Either a comma-grouped number (1,000 / 12,345,678) or a plain digit run.
    number = re.search(r'\d{1,3}(?:,\d{3})+|\d+', size)
    if number:
        return int(number.group().replace(',', ''))

    # First letter omitted so both 'Hundreds' and 'hundreds' match.
    if 'undreds' in size:
        return 200
    if 'housands' in size:
        return 2000
    if 'ozens' in size:
        return 24
    return None

# numeric size estimate derived from the free-text 'Size' column
ele_df['size'] = ele_df['Size'].map(size_est)


In [48]:
# bring Elephrame column names in line with the shared export names
ele_column_names = {
    "ID": "id",
    "Collection": "collection",
    "Size": "size_str",
    "Description": "description",
    "Location": "location",
}
ele_df = ele_df.rename(columns=ele_column_names)

In [49]:
# write the shared columns of the Elephrame data, without the index
ele_df.to_csv('ele.csv', columns=keep, index=False)

# CCC

In [50]:
# May sheet: take the Google Sheets "edit" link and rewrite its tail into
# the CSV export form for the same gid
may_sheet = (
    'https://docs.google.com/spreadsheets/d/'
    '1pZo5p9EKZJ87IvPVjIp50nQQPET_ucV8vKVfZ6NpOvg/'
    'edit#gid=1571725208'
).replace('edit#gid', 'export?format=csv&gid')

In [51]:
# read the May sheet straight from its CSV export URL
may_df = pd.read_csv(filepath_or_buffer=may_sheet)

In [52]:
# show the rewritten CSV export URL
may_sheet

'https://docs.google.com/spreadsheets/d/1pZo5p9EKZJ87IvPVjIp50nQQPET_ucV8vKVfZ6NpOvg/export?format=csv&gid=1571725208'

In [53]:
# June sheet: take the Google Sheets "edit" link and rewrite its tail into
# the CSV export form for the same gid
june_sheet = (
    'https://docs.google.com/spreadsheets/d/'
    '1-HM-bFsnTd9omYOrB8JOMeQ0XzPvCaVaADKqXQ_RpXg/'
    'edit#gid=0'
).replace('edit#gid', 'export?format=csv&gid')

In [54]:
# read the June sheet straight from its CSV export URL
june_df = pd.read_csv(filepath_or_buffer=june_sheet)

In [55]:
# Stack the May and June sheets into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
ccc_df = pd.concat([may_df, june_df], ignore_index=True)

In [56]:
# size_str is a straight copy of the EstimateText column
ccc_df['size_str'] = ccc_df['EstimateText']

In [57]:
# 'City, ST' string; NaN when either part is missing
ccc_df['city_st'] = ccc_df['CityTown'] +', ' + ccc_df['StateTerritory']
# Prefix the venue only when one is given, so rows without a venue read
# 'City, ST' rather than ', City, ST'.
ccc_df['location'] = (
    ccc_df['Location'].fillna('').map(lambda venue: venue + ', ' if venue else '')
    + ccc_df['city_st']
)
ccc_df['date'] = pd.to_datetime(ccc_df['Date'])
ccc_df['collection'] = 'CCC'
# no id is assigned to these rows
ccc_df['id'] = None

#remove blank rows from Excel (rows whose city_st could not be built)
ccc_df.dropna(subset=['city_st'], inplace=True)

In [58]:
# TODO: needs a better average than the plain midpoint of low/high


def ccc_size(row):
    """Pick a single size estimate for one CCC row.

    Preference order:
    1. ``BestGuess`` when present.
    2. The only available bound when just one of ``EstimateLow`` /
       ``EstimateHigh`` is present (previously this gave NaN).
    3. The shared value when both bounds are equal.
    4. The midpoint of the two bounds.

    Parameters
    ----------
    row : mapping
        Must provide 'BestGuess', 'EstimateLow' and 'EstimateHigh'.

    Returns
    -------
    number or missing value
        The estimate; a missing value when all three fields are missing.
    """
    best_guess = row['BestGuess']
    low = row['EstimateLow']
    high = row['EstimateHigh']

    # pd.notna also accepts None and strings, where np.isnan raises TypeError.
    if pd.notna(best_guess):
        return best_guess

    if pd.isna(low):
        return high
    if pd.isna(high):
        return low

    if low == high:
        return low

    return (low + high) / 2

# one size estimate per row
ccc_df['size'] = ccc_df.apply(ccc_size, axis='columns')


In [59]:
#combine URLS across three variables
def fix_urls(row):
    """Collect the distinct source URLs of one row.

    Parameters
    ----------
    row : mapping
        Must provide 'Source1', 'Source2' and 'Source3'.

    Returns
    -------
    list
        The non-float values of the three source fields, duplicates removed,
        in Source1 -> Source3 order. Float values are skipped because missing
        cells are represented as float NaN.
    """
    urls = [
        row[field]
        for field in ('Source1', 'Source2', 'Source3')
        if not isinstance(row[field], float)
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order; list(set())
    # produced an order that could change from run to run.
    return list(dict.fromkeys(urls))
        
# one de-duplicated URL list per row
ccc_df['urls'] = ccc_df.apply(fix_urls, axis='columns')

In [60]:
# write the shared columns of the CCC data, without the index
ccc_df.to_csv('ccc_blm.csv', columns=keep, index=False)