In [1]:
%load_ext blackcellmagic

import pandas as pd
import requests
import re
import ast
import numpy as np

# Count Love

In [2]:
# main data: download the Count Love events feed and load its "data" array
cl_url = "https://countlove.org/data/events.json"
# A dedicated response variable keeps this request separate from the links
# request, which is stored in `r` and parsed in a later cell.
cl_response = requests.get(cl_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
cl_response.raise_for_status()
cl_df = pd.DataFrame(cl_response.json()["data"])

In [3]:
# article links: download the per-event link index (parsed from `r` in the next cell)
cl_links = "https://countlove.org/data/links.json"
r = requests.get(cl_links, timeout=60)
# Fail loudly on an HTTP error rather than deferring the failure to the JSON parse.
r.raise_for_status()

In [4]:
# parse article links so they can be merged into main data file

links_by_event = r.json()

# One record per event id. Each event maps a key (presumably a date — TODO
# confirm) to a sequence, and only the first element of each sequence is kept.
# Building the records in a comprehension keeps the loop variables local, so
# the builtin `id` is no longer shadowed for the rest of the notebook.
link_records = [
    {"id": event_id, "urls": [entries[0] for entries in by_key.values()]}
    for event_id, by_key in links_by_event.items()
]

link_df = pd.DataFrame(link_records)
# Number of URLs collected for each event.
link_df['url_count'] = link_df['urls'].str.len()

In [5]:
# merge cl data and links
# The link ids come from JSON object keys (strings), so cast them to int before
# joining (assumes the events table stores `id` as an integer — TODO confirm).
link_df['id'] = link_df['id'].astype(int)
# Default inner join: only ids present in both tables are kept.
cl_df2 = cl_df.merge(link_df, on='id')

In [6]:
# turn tag list into strings for matching
def tag2(tags):
    """Render a collection of tags as one comma-separated string.

    Parameters
    ----------
    tags : iterable or scalar
        The items to join, e.g. a list of tag ids. Non-iterable values such
        as ``NaN`` or ``None`` are accepted as well.

    Returns
    -------
    str
        The items joined with ``', '``; for a non-iterable value, ``str(tags)``.
    """
    try:
        return ', '.join(str(tag) for tag in tags)
    except TypeError:
        # "Not iterable" is the only expected failure; anything else
        # (including KeyboardInterrupt) should propagate.
        return str(tags)

In [7]:
# variable cleaning

# Normalise the Count Love columns in one step: parse the dates, label the
# collection, and expose the `count` column both as text and as-is.
cl_df2 = cl_df2.assign(
    date=pd.to_datetime(cl_df2['date']),
    collection='Count Love',
    size_str=cl_df2['count'].astype(str),
    size=cl_df2['count'],
)

In [8]:
# Extract the Floyd protests.
# The screen is an exact match on the joined tag string, so only events whose
# tags render as exactly '2, 19, 4661, 4662' (same tags, same order) are kept.
# NOTE(review): presumably this tag combination is how the source marks these
# protests — confirm against the Count Love tag definitions.
blm_screen = cl_df2['tags'].apply(tag2) == '2, 19, 4661, 4662'
# .copy() so later column assignments operate on an independent frame.
cl_blm = cl_df2[blm_screen].copy()

In [9]:
# Sanity check: number of events that passed the tag screen.
len(cl_blm)

1835

In [10]:
#remove extra pieces that aren't city/state

def location_fix(location):
    """Reduce a free-form location to its trailing "city, state" portion.

    The text is split on ``', '`` and only the last two pieces are kept, so
    ``'Venue, City, ST'`` becomes ``'City, ST'``. Text with two or fewer
    pieces is returned unchanged.

    Parameters
    ----------
    location : str or missing value
        Location text. Non-string values (e.g. ``NaN`` from an empty cell)
        are returned as-is instead of raising ``AttributeError``.

    Returns
    -------
    str, or the original value when it is not a string.
    """
    if not isinstance(location, str):
        return location
    parts = location.split(', ')
    if len(parts) <= 2:
        return location
    return ', '.join(parts[-2:])

# Derive the "city, state" column from the free-form location text.
cl_blm['city_st'] = cl_blm['location'].apply(location_fix)

In [11]:
# Shared output schema: each source's CSV export below is restricted to
# exactly these columns.
keep = ['id', 'date', 'city_st', 'location', 'size', 'size_str', 'urls', 'collection']

In [12]:
# Write the Count Love subset (only the `keep` columns, no index) to disk.
cl_blm[keep].to_csv('cl_blm.csv', index=False)

# Elephrame
(Scraped using "BLM Scrape Elephrame.ipynb")

In [13]:
# Load the Elephrame CSV and add a parsed, lower-case `date` column next to
# the original `Date` text.
ele_df = pd.read_csv('Elephrame.csv').assign(
    date=lambda frame: pd.to_datetime(frame['Date'])
)

In [14]:
def str_2_list(links):
    """Parse the text form of a Python literal back into an object.

    ``links`` is evaluated with ``ast.literal_eval``, which accepts literals
    only and raises ``ValueError`` or ``SyntaxError`` on malformed input.
    """
    parsed = ast.literal_eval(links)
    return parsed

# Parse the text stored in `Links` into Python objects (presumably lists of
# URLs — confirm against the scraper output).
ele_df['urls'] = ele_df['Links'].apply(str_2_list)

In [15]:
# Same "city, state" reduction that was applied to the Count Love locations.
ele_df['city_st'] = ele_df['Location'].apply(location_fix)



In [16]:
# Extract numeric component of size_str for size est.


def size_est(size):
    """Estimate a numeric crowd size from a free-text size description.

    The first number in the text wins. Thousands separators are honoured, so
    ``'1,500'`` yields 1500 rather than 1. If the text contains no digits,
    the vague words "hundreds", "thousands" and "dozens" map to fixed rough
    guesses (200, 2000 and 24).

    Parameters
    ----------
    size : str or missing value
        Size text.

    Returns
    -------
    int or None
        The estimate, or ``None`` when ``size`` is not a string or contains
        neither a number nor one of the recognised words.
    """
    if not isinstance(size, str):
        # Missing cells (NaN) carry no size information.
        return None

    # Either a comma-grouped number ("12,000") or a plain run of digits.
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)|\d+', size)
    if match:
        return int(match.group().replace(',', ''))

    # The leading letter is omitted so both "Hundreds" and "hundreds" match.
    if 'undreds' in size:
        return 200
    if 'housands' in size:
        return 2000
    if 'ozens' in size:
        return 24
    return None

# Numeric size estimate derived from the free-text `Size` column.
ele_df['size'] = ele_df['Size'].apply(size_est)


In [17]:
# Rename the capitalised source columns to the lower-case names used by the
# shared `keep` schema.
ele_df = ele_df.rename(columns={
    "ID": "id",
    "Collection": "collection",
    "Size": "size_str",
    "Description": "description",
    "Location": "location",
})

In [18]:
# Write the Elephrame rows (only the `keep` columns, no index) to disk.
ele_df[keep].to_csv('ele.csv', index=False)

# CCC

In [19]:
# Build the CSV-export URL for the May sheet by swapping the browser "edit"
# suffix for the export endpoint while keeping the same gid.
may_sheet = (
    'https://docs.google.com/spreadsheets/d/'
    '1pZo5p9EKZJ87IvPVjIp50nQQPET_ucV8vKVfZ6NpOvg/'
    'edit#gid=1571725208'
).replace('edit#gid', 'export?format=csv&gid')

In [20]:
# Read the May sheet's CSV export directly from its URL (network call).
may_df = pd.read_csv(may_sheet)

In [21]:
# Display the final export URL for reference.
may_sheet

'https://docs.google.com/spreadsheets/d/1pZo5p9EKZJ87IvPVjIp50nQQPET_ucV8vKVfZ6NpOvg/export?format=csv&gid=1571725208'

In [22]:
# Build the CSV-export URL for the June sheet by swapping the browser "edit"
# suffix for the export endpoint while keeping the same gid.
june_sheet = (
    'https://docs.google.com/spreadsheets/d/'
    '1-HM-bFsnTd9omYOrB8JOMeQ0XzPvCaVaADKqXQ_RpXg/'
    'edit#gid=0'
).replace('edit#gid', 'export?format=csv&gid')

In [23]:
# Read the June sheet's CSV export directly from its URL (network call).
june_df = pd.read_csv(june_sheet)

In [24]:
# Stack the May and June sheets into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
ccc_df = pd.concat([may_df, june_df], ignore_index=True)

In [25]:
# Expose the textual estimate under the shared `size_str` column name.
ccc_df['size_str'] = ccc_df['EstimateText']

In [26]:
# "City, ST" key; NaN whenever either component is missing.
ccc_df['city_st'] = ccc_df['CityTown'] + ', ' + ccc_df['StateTerritory']
# Prefix the recorded `Location` only when there is one; without this guard
# the result would start with a stray ", " (e.g. ", City, ST").
ccc_df['location'] = ccc_df['city_st'].where(
    ccc_df['Location'].isna(),
    ccc_df['Location'] + ', ' + ccc_df['city_st'],
)
ccc_df['date'] = pd.to_datetime(ccc_df['Date'])
ccc_df['collection'] = 'CCC'
# No identifier is assigned here; the column is created empty so the frame
# has every column listed in `keep`.
ccc_df['id'] = None

#remove blank rows from Excel
# (rows lacking CityTown or StateTerritory have a NaN city_st and are dropped)
ccc_df.dropna(subset=['city_st'], inplace=True)

In [27]:
# TODO: needs a better way of averaging the size estimates


def ccc_size(row):
    """Pick a single size estimate for one CCC row.

    Preference order: ``BestGuess`` when present; otherwise the midpoint of
    ``EstimateLow`` and ``EstimateHigh``; if only one bound is present, that
    bound; ``NaN`` when nothing is available.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``BestGuess``, ``EstimateLow`` and ``EstimateHigh``.

    Returns
    -------
    number
        The chosen estimate, or ``numpy.nan`` when all three fields are missing.
    """
    best = row['BestGuess']
    # pd.notna accepts any scalar, whereas np.isnan raises TypeError on
    # non-float values such as None or strings.
    if pd.notna(best):
        return best

    low, high = row['EstimateLow'], row['EstimateHigh']
    low_missing, high_missing = pd.isna(low), pd.isna(high)
    if low_missing and high_missing:
        return np.nan
    # A one-sided range still carries information; use the bound we have
    # rather than letting NaN poison the average.
    if low_missing:
        return high
    if high_missing:
        return low

    if low == high:
        return low
    return (low + high) / 2

# Row-wise (axis=1) so ccc_size can read several columns of each row at once.
ccc_df['size'] = ccc_df.apply(ccc_size, axis=1)


In [28]:
#combine URLS across three variables
def fix_urls(row):
    """Collect the distinct source URLs of one row into a list.

    Reads ``Source1``, ``Source2`` and ``Source3`` and skips float values
    (an empty cell is read by pandas as the float ``NaN``). Duplicates are
    removed while keeping first-seen order, so the result is the same on
    every run; a ``set`` round-trip would order strings arbitrarily.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the three ``Source*`` fields.

    Returns
    -------
    list
        The unique non-float values, in column order.
    """
    candidates = (row[field] for field in ('Source1', 'Source2', 'Source3'))
    present = [value for value in candidates if not isinstance(value, float)]
    # dict keys are unique and insertion-ordered: an order-preserving dedupe.
    return list(dict.fromkeys(present))
        
# Row-wise (axis=1) so fix_urls can read all three Source columns of each row.
ccc_df['urls'] = ccc_df.apply(fix_urls, axis=1)

In [29]:
# Write the CCC rows (only the `keep` columns, no index) to disk.
ccc_df[keep].to_csv('ccc_blm.csv', index=False)