In [1]:
from requests_html import HTMLSession, HTML
import re
import pandas as pd
import spacy

from collections import defaultdict


In [2]:
# nlp = spacy.load("en_core_web_md")

In [3]:
# Wikipedia index article listing George Floyd protests in the United States.
url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_George_Floyd_protests_in_the_United_States'
)

In [1]:
# Article slugs; the scraper functions append each one to
# 'https://en.wikipedia.org/wiki/'. Order is preserved because it determines
# the order of the scraped frames.
pages = [
    'George_Floyd_protests_in_' + place
    for place in (
        'Alabama',
        'Alaska',
        'Arizona',
        'Arkansas',
        'California',
        'Colorado',
        'Connecticut',
        'Florida',
        'Georgia',
        'Illinois',
        'Chicago',
        'Indiana',
        'Iowa',
        'Kansas',
        'Kentucky',
        'Louisiana',
        'Maine',
        'Maryland',
        'Massachusetts',
        'Michigan',
        'Minnesota',
        'Nebraska',
        'New_Jersey',
        'New_York_(state)',
        'North_Carolina',
        'Ohio',
        'Oregon',
        'Pennsylvania',
        'Puerto_Rico',
        'Tennessee',
        'Texas',
        'Utah',
        'Virginia',
        'Washington_(state)',
        'Wisconsin',
    )
]

# City-level article slugs, kept as a set.
# NOTE(review): no cell visible here reads `cities` — confirm whether these
# articles were meant to be scraped as well.
cities = {
    'George_Floyd_protests_in_' + place
    for place in (
        'New_York_City',
        'Washington,_D.C.',
        'Richmond,_Virginia',
        'Portland,_Oregon',
        'Columbus,_Ohio',
        'Philadelphia',
    )
}

In [5]:
# Open a module-level session and fetch the article stored in `url`.
# NOTE(review): the scraper functions defined in this notebook create their
# own sessions; confirm whether `session` / `r` are still needed.
session = HTMLSession()
r = session.get(url=url)

In [6]:
def clean_text(text):
    """Remove square-bracketed markers such as ``[1]`` or ``[citation needed]``.

    Each bracketed span is removed individually: the bracket body is matched
    as "anything except a closing square bracket", so a match ends at the
    first ``]``. The earlier pattern excluded ``)`` instead, which let one
    greedy match run from the first ``[`` to the last ``]`` and delete the
    prose between two markers.

    Parameters
    ----------
    text : str
        Text that may contain bracketed markers.

    Returns
    -------
    str
        ``text`` with every ``[...]`` span removed; surrounding whitespace is
        left untouched.
    """
    return re.sub(r'\[[^\]]*\]', '', text)

def ref_links(r):
    """Map each footnote id on a page to an external link from that footnote.

    Parameters
    ----------
    r : object
        Response whose page markup is available as ``r.html.html`` (the
        scrapers in this notebook pass the result of ``HTMLSession().get``).

    Returns
    -------
    collections.defaultdict
        Keys are footnote ids of the form ``cite_note-...``. Each value is the
        first ``href`` (in document order) inside that footnote that contains
        ``http``, or ``'missing'`` when the footnote has no such link.
        Looking up an id that was never recorded yields ``'Missing'``.
        If the markup has no ``id="References"`` anchor the mapping is empty.
    """
    ref_dict = defaultdict(lambda: 'Missing')

    # Only the markup after the References anchor holds the footnote list.
    # Without that anchor there is nothing to index, so return the empty
    # mapping rather than failing with IndexError.
    parts = r.html.html.split('id="References"')
    if len(parts) < 2:
        return ref_dict

    for piece in parts[1].split('<li '):
        # The footnote id runs up to the next double quote; chunks that carry
        # no such id are not footnotes and are skipped.
        found = re.search('(cite_note-.*?)"', piece)
        if found is None:
            continue
        name = found.group(1)

        # Walk the anchors in document order so the chosen link is stable
        # between runs (picking from the unordered `links` set was arbitrary).
        # The prefix check mirrors the hrefs that `links` leaves out.
        link = 'missing'
        for anchor in HTML(html=piece).find('a'):
            href = anchor.attrs.get('href', '').strip()
            if 'http' in href and not href.startswith(('#', 'javascript:', 'mailto:')):
                link = href
                break
        ref_dict[name] = link

    return ref_dict


def scrape_state_page(sp = 'George_Floyd_protests_in_North_Carolina'):
    """Scrape an article whose reports are laid out as ``<h3>`` sections.

    Fetches ``https://en.wikipedia.org/wiki/<sp>``, keeps the markup before
    the ``id="References"`` anchor and splits it on ``<h3>``. Every chunk
    after the first becomes one report.

    Parameters
    ----------
    sp : str
        Article slug appended to the Wikipedia base URL.

    Returns
    -------
    pandas.DataFrame
        One row per section with columns ``city`` (rendered text before
        ``[edit]``), ``text`` (remaining lines joined by spaces and passed
        through ``clean_text``), ``references`` (list of ``ref_links`` values
        for every ``cite_note-...`` id in the section) and ``state`` (``sp``
        without the ``George_Floyd_protests_in_`` prefix, underscores replaced
        by spaces).
    """
    url = 'https://en.wikipedia.org/wiki/' + sp

    # Use the session as a context manager so it is closed after the fetch;
    # previously every call left a new session open.
    with HTMLSession() as session:
        r = session.get(url)
        ref_dict = ref_links(r)
        body = r.html.html.split('id="References"')[0]

    reports = []
    # The chunk before the first <h3> is not a section, so it is dropped.
    for section in body.split('<h3>')[1:]:
        parsed = HTML(html=section)
        rendered = parsed.text

        # First rendered line is the heading; the rest is the report body.
        body_text = ' '.join(rendered.splitlines()[1:])
        cited = re.findall('(cite_note-.*?)"', section)

        reports.append({
            'city': rendered.split('[edit]')[0],
            'text': clean_text(body_text),
            'references': [ref_dict[name] for name in cited],
        })

    df = pd.DataFrame(reports)

    state = sp.split('George_Floyd_protests_in_')[-1].replace('_', ' ')
    df['state'] = state

    return df
    

In [7]:
def scrape_state_page2(sp = 'George_Floyd_protests_in_California'):
    """Scrape an article whose reports are ``City: text`` list items.

    Fetches ``https://en.wikipedia.org/wiki/<sp>``, keeps the markup before
    the ``id="References"`` anchor and splits it on ``<li>``. Every chunk
    after the first whose rendered text contains a colon becomes one report.

    Parameters
    ----------
    sp : str
        Article slug appended to the Wikipedia base URL.

    Returns
    -------
    pandas.DataFrame
        One row per accepted list item with columns ``city`` (text before the
        first colon), ``text`` (everything after the first colon, passed
        through ``clean_text``), ``references`` (list of ``ref_links`` values
        for every ``cite_note-...`` id in the item) and ``state`` (``sp``
        without the ``George_Floyd_protests_in_`` prefix, underscores replaced
        by spaces).
    """
    url = 'https://en.wikipedia.org/wiki/' + sp

    # Use the session as a context manager so it is closed after the fetch;
    # previously every call left a new session open.
    with HTMLSession() as session:
        r = session.get(url)
        ref_dict = ref_links(r)
        body = r.html.html.split('id="References"')[0]

    reports = []
    # The chunk before the first <li> is not a list item, so it is dropped.
    for section in body.split('<li>')[1:]:
        parsed = HTML(html=section)
        text = parsed.text.split('[edit]')[0]
        if ':' not in text:
            continue

        # Split on the FIRST colon only. Re-joining every colon-separated
        # piece with ': ' rewrote later colons, e.g. "10:30" became "10: 30".
        city, _, detail = text.partition(':')
        cited = re.findall('(cite_note-.*?)"', section)

        reports.append({
            'city': city,
            'text': clean_text(detail),
            'references': [ref_dict[name] for name in cited],
        })

    df = pd.DataFrame(reports)

    state = sp.split('George_Floyd_protests_in_')[-1].replace('_', ' ')
    df['state'] = state

    return df

In [29]:
def scrape_national(sp = 'List_of_George_Floyd_protests_in_the_United_States'):
    """Scrape ``City: text`` list items from the national list article.

    Fetches ``https://en.wikipedia.org/wiki/<sp>``, keeps the markup before
    the ``id="References"`` anchor and splits it on ``<li>``. A chunk becomes
    a report when its rendered text contains a colon and does not contain
    ``Monuments and memorials removed``.

    Parameters
    ----------
    sp : str
        Article slug appended to the Wikipedia base URL.

    Returns
    -------
    pandas.DataFrame
        One row per accepted list item with columns ``city`` (text before the
        first colon), ``text`` (everything after the first colon, passed
        through ``clean_text``), ``state`` (last ``', '``-separated part of
        the first link's ``title`` attribute, or ``None`` when the item has
        no link or the link has no title) and ``references`` (list of
        ``ref_links`` values for every ``cite_note-...`` id in the item).
    """
    url = 'https://en.wikipedia.org/wiki/' + sp

    # Use the session as a context manager so it is closed after the fetch;
    # previously every call left a new session open.
    with HTMLSession() as session:
        r = session.get(url)
        ref_dict = ref_links(r)
        body = r.html.html.split('id="References"')[0]

    reports = []
    # [2:] drops the text before the first <li> and the first list item.
    # NOTE(review): presumably the first item is not a report — confirm.
    for section in body.split('<li>')[2:]:
        parsed = HTML(html=section)
        text = parsed.text.split('[edit]')[0]
        if ':' not in text or 'Monuments and memorials removed' in text:
            continue

        # The state comes from the first link's title, e.g. "City, State".
        # An item without a link (or a link without a title) used to raise
        # IndexError/KeyError and abort the scrape; record None instead.
        anchors = parsed.find('a')
        title = anchors[0].attrs.get('title') if anchors else None
        state = title.split(', ')[-1] if title else None

        # Split on the FIRST colon only. Re-joining every colon-separated
        # piece with ': ' rewrote later colons, e.g. "10:30" became "10: 30".
        city, _, detail = text.partition(':')
        cited = re.findall('(cite_note-.*?)"', section)

        reports.append({
            'city': city,
            'text': clean_text(detail),
            'state': state,
            'references': [ref_dict[name] for name in cited],
        })

    return pd.DataFrame(reports)

In [40]:
# Collect one frame from the national list, then two per article in `pages`:
# one from the <h3>-section scraper and one from the "City: text" list-item
# scraper. The throwaway `reports = pd.DataFrame()` that used to precede the
# first call was overwritten immediately and has been removed.
states = [scrape_national()]
for sp in pages:
    reports = scrape_state_page(sp)
    states.append(reports)

    reports = scrape_state_page2(sp)
    states.append(reports)


In [51]:
# Stack every scraped frame and write it as a JSON array of row objects.
pd.concat(states).to_json(
    'data/wiki_protest.json',
    orient='records',
)

In [52]:
# Stack every scraped frame and write it as CSV without the index column.
pd.concat(states).to_csv(
    'data/wiki_protest.csv',
    index=False,
)