# 211 Counts Scraper

This notebook scrapes the content of the [211 Counts Dashboard](https://ny.211counts.org/) and stores the data in a SQLite database. This scraper should be able to process the data from any state with minimal alteration.

In [1]:
import os
import sqlite3
from datetime import datetime, timedelta, date

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

# The name of the SQLite database file.  It is used both when the schema is
# initialized with sqlite3 and when the SQLAlchemy engine URL is built.
database_name = "211_counts_data.db"

# The date window to scrape.
# NOTE: daterange() iterates like the built-in range(), so start_date is
# requested but end_date itself is NOT (the last day scraped is end_date - 1).
start_date = date(2014, 8, 1)
end_date = date(2020, 6, 8)

# The geographies to scrape.
# Each key is the id written to the "geography_id" column of the "totals"
# table (these look like New York county FIPS codes -- TODO confirm).
# Each value holds the two request fields the dashboard needs:
#   'the_id':    a JSON-encoded string sent as the `id` form field
#   'type_code': sent as the `type` form field ('C' presumably means
#                county -- verify against the dashboard)
geos = {
    "36001": {'the_id': '{"ids":["463"]}', 'type_code': 'C'},
    "36003": {'the_id': '{"ids":["256"]}', 'type_code': 'C'},
    "36007": {'the_id': '{"ids":["268"]}', 'type_code': 'C'},
    "36009": {'the_id': '{"ids":["247"]}', 'type_code': 'C'},
    "36011": {'the_id': '{"ids":["278"]}', 'type_code': 'C'},
    "36013": {'the_id': '{"ids":["248"]}', 'type_code': 'C'},
    "36015": {'the_id': '{"ids":["258"]}', 'type_code': 'C'},
    "36017": {'the_id': '{"ids":["269"]}', 'type_code': 'C'},
    "36019": {'the_id': '{"ids":["453"]}', 'type_code': 'C'},
    "36021": {'the_id': '{"ids":["474"]}', 'type_code': 'C'},
    "36023": {'the_id': '{"ids":["284"]}', 'type_code': 'C'},
    "36025": {'the_id': '{"ids":["270"]}', 'type_code': 'C'},
    "36027": {'the_id': '{"ids":["461"]}', 'type_code': 'C'},
    "36029": {'the_id': '{"ids":["249"]}', 'type_code': 'C'},
    "36031": {'the_id': '{"ids":["452"]}', 'type_code': 'C'},
    "36033": {'the_id': '{"ids":["451"]}', 'type_code': 'C'},
    "36035": {'the_id': '{"ids":["473"]}', 'type_code': 'C'},
    "36037": {'the_id': '{"ids":["250"]}', 'type_code': 'C'},
    "36039": {'the_id': '{"ids":["472"]}', 'type_code': 'C'},
    "36041": {'the_id': '{"ids":["471"]}', 'type_code': 'C'},
    "36043": {'the_id': '{"ids":["265"]}', 'type_code': 'C'},
    "36045": {'the_id': '{"ids":["287"]}', 'type_code': 'C'},
    "36049": {'the_id': '{"ids":["288"]}', 'type_code': 'C'},
    "36051": {'the_id': '{"ids":["279"]}', 'type_code': 'C'}, 
    "36053": {'the_id': '{"ids":["266"]}', 'type_code': 'C'},
    "36055": {'the_id': '{"ids":["280"]}', 'type_code': 'C'},
    "36057": {'the_id': '{"ids":["470"]}', 'type_code': 'C'},
    "36059": {'the_id': '{"ids":["447"]}', 'type_code': 'C'},
    "36063": {'the_id': '{"ids":["253"]}', 'type_code': 'C'},
    "36065": {'the_id': '{"ids":["267"]}', 'type_code': 'C'},
    "36067": {'the_id': '{"ids":["291"]}', 'type_code': 'C'},
    "36069": {'the_id': '{"ids":["281"]}', 'type_code': 'C'},
    "36071": {'the_id': '{"ids":["460"]}', 'type_code': 'C'},
    "36073": {'the_id': '{"ids":["254"]}', 'type_code': 'C'},
    "36075": {'the_id': '{"ids":["292"]}', 'type_code': 'C'},
    "36077": {'the_id': '{"ids":["273"]}', 'type_code': 'C'},
    "36079": {'the_id': '{"ids":["459"]}', 'type_code': 'C'},
    "36083": {'the_id': '{"ids":["469"]}', 'type_code': 'C'},
    "36087": {'the_id': '{"ids":["458"]}', 'type_code': 'C'},
    "36089": {'the_id': '{"ids":["293"]}', 'type_code': 'C'},
    "36091": {'the_id': '{"ids":["468"]}', 'type_code': 'C'},
    "36093": {'the_id': '{"ids":["467"]}', 'type_code': 'C'},
    "36095": {'the_id': '{"ids":["466"]}', 'type_code': 'C'},
    "36097": {'the_id': '{"ids":["259"]}', 'type_code': 'C'},
    "36099": {'the_id': '{"ids":["282"]}', 'type_code': 'C'},
    "36101": {'the_id': '{"ids":["260"]}', 'type_code': 'C'},
    "36103": {'the_id': '{"ids":["475"]}', 'type_code': 'C'},
    "36105": {'the_id': '{"ids":["457"]}', 'type_code': 'C'},
    "36107": {'the_id': '{"ids":["275"]}', 'type_code': 'C'},
    "36109": {'the_id': '{"ids":["285"]}', 'type_code': 'C'},
    "36111": {'the_id': '{"ids":["456"]}', 'type_code': 'C'},
    "36113": {'the_id': '{"ids":["465"]}', 'type_code': 'C'},
    "36115": {'the_id': '{"ids":["464"]}', 'type_code': 'C'},
    "36117": {'the_id': '{"ids":["283"]}', 'type_code': 'C'},
    "36119": {'the_id': '{"ids":["455"]}', 'type_code': 'C'},
    "36121": {'the_id': '{"ids":["255"]}', 'type_code': 'C'},
    "36123": {'the_id': '{"ids":["262"]}', 'type_code': 'C'},
}

# Set to True to initialize the database: any existing database file is
# deleted and an empty "totals" table (plus its index) is created before
# scraping.  Set to False to keep the existing file and append to it.
initialize_db = True


def daterange(start_date, end_date):
    """
    Yields consecutive dates, mirroring the semantics of the built-in range().

    The start date is included and the end date is excluded.  Nothing is
    yielded when end_date is on or before start_date.

    Parameters:
        start_date (datetime.date): The first date to yield
        end_date (datetime.date): The date to stop before (exclusive)

    Returns:
        Generator output (datetime.date): The next date in the range
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(offset)
        offset += 1
        
        
def extract_data(el):
    """
    Pulls the category fields out of one scraped 2-1-1 "categories" element.

    The element's own "data-id" attribute supplies the id, its
    "toolTipSubCategory" span supplies the label, and its "value" span
    supplies the percentage and count through data-* attributes.

    Parameters:
        el (bs4.element.Tag): BeautifulSoup HTML element

    Returns:
        data_id (int): The id of the category
        category (str): Description of the 2-1-1 call category
        percentage (float): The percent of phone calls, as a fraction (25% -> 0.25)
        count (int): The count of phone calls of this category
    """
    # Lookups stay in this order so the first failing one decides the exception
    label = el.find("span", {"class": "toolTipSubCategory"}).text
    identifier = int(el["data-id"])

    value_span = el.find("span", {"class": "value"})
    raw_percentage = value_span["data-percentage"]
    fraction = float(raw_percentage.strip("%")) / 100
    calls = int(value_span["data-value"])

    return (identifier, label, fraction, calls)


def get_211_date(the_date):
    """
    Formats a date the way the 2-1-1 dashboard request expects it.

    The result looks like "Aug 1, 2014": an English three-letter month, the
    day without zero padding, a comma, and the four-digit year.  A fixed
    month table is used, so the output does not depend on the system locale.

    Parameters:
        the_date (datetime.date): The date to convert

    Returns:
        the_211_date (str): The date in 2-1-1 scraper format
    """
    abbreviations = (
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )
    # Month numbers are 1-based, so number the abbreviations from 1
    month_lookup = dict(enumerate(abbreviations, start=1))
    return "{} {}, {}".format(
        month_lookup[the_date.month], the_date.day, the_date.year
    )


def _collect_category_rows(elements, parent_id):
    """
    Converts "categories" elements into row dictionaries, skipping bad ones.

    Parameters:
        elements (iterable of bs4.element.Tag): The elements to parse
        parent_id (int or None): Parent category id (None for top-level rows)

    Returns:
        rows (list of dict): One dictionary per element that parsed cleanly
    """
    rows = []
    for el in elements:
        try:
            data_id, category, percentage, count = extract_data(el)
        except Exception:
            # Best effort: an element with missing or malformed attributes is
            # skipped.  `Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit still stop a long-running scrape.
            continue
        rows.append(
            {
                "category_id": data_id,
                "parent_category_id": parent_id,
                "category": category,
                "percentage": percentage,
                "count": count,
            }
        )
    return rows


def process_211_lifeline_html(geo, html, the_date):
    """
    Parses the html, extracts data, and returns a pandas dataframe

    Top-level categories are read from the "categoriesDiv" containers and get
    a parent_category_id of None.  Sub categories are read from the
    "details" lists inside the "subcategoriesDiv" containers; the parent id is
    the number after the first "-" in the list's id attribute.

    Parameters:
        geo (str): CGR Geography ID
        html (str): HTML content
        the_date (datetime.date or str): The date of the HTML content; stored
            unchanged in the "date" column

    Returns:
        df (pandas.DataFrame): Data frame of total counts and percentages by
            category, with columns category_id, parent_category_id, category,
            percentage, count, date and geography_id.  It has zero rows, but
            the same columns, when nothing could be parsed.

    Raises:
        KeyError, IndexError, ValueError: If a sub category list has a missing
            or unexpected id attribute.
    """
    # Fixing the column list keeps the schema stable for an empty result
    columns = ["category_id", "parent_category_id", "category", "percentage", "count"]
    data = list()
    soup = BeautifulSoup(html, features="lxml")
    # Get the top level categories
    for div in soup.find_all("div", {"class": "categoriesDiv"}):
        data.extend(
            _collect_category_rows(div.find_all("div", {"class": "categories"}), None)
        )
    # Get the sub categories
    for div in soup.find_all("div", {"class": "subcategoriesDiv"}):
        for ul in div.find_all("ul", {"class": "details"}):
            parent = int(ul["id"].split("-")[1])
            data.extend(
                _collect_category_rows(
                    ul.find_all("div", {"class": "categories"}), parent
                )
            )

    df = pd.DataFrame(data, columns=columns)
    df["date"] = the_date
    df["geography_id"] = geo
    return df

def scrape_and_save_211_count_data(
    the_id,
    the_type_code,
    the_date,
    datetime_date,
    geo_id,
    db,
    base_url="https://ny.211counts.org",
    center_id="22",
    timeout=60,
):
    """
    Scrapes 211 Count content, extracts the data, and saves it to a database

    One POST request is made to the dashboard's bar chart endpoint for a
    single geography and a single day (fromDate and toDate are the same).
    The parsed rows are appended to the "totals" table.  Nothing is written
    when the response contains no parsable rows.

    Parameters:
        the_id (str): 211 ID for the geography
        the_type_code (str): 211 type code for the geography
        the_date (str): 211 formatted date
        datetime_date (datetime.date): The date in datetime format
        geo_id (str): The internal geography id
        db (sqlalchemy.engine.base.Engine): The SQL Alchemy engine
        base_url (str): Scheme and host of the 211 Counts site to query
        center_id (str): Value sent as the `centerId` form field (presumably
            specific to each 211 Counts site -- verify when changing base_url)
        timeout (float): Seconds to wait for the server before giving up

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status
        requests.exceptions.RequestException: On timeouts and connection errors
    """
    base_url = base_url.rstrip("/")

    # Google Analytics cookies copied from a browser session
    # NOTE(review): presumably not required by the server -- confirm
    cookies = {
        '_ga': 'GA1.2.1786037618.1591293165',
        '_gid': 'GA1.2.2065705029.1591626272',
    }

    # Headers imitating the XMLHttpRequest that the dashboard page sends
    headers = {
        'Connection': 'keep-alive',
        'Accept': 'text/html, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Origin': base_url,
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': base_url + '/',
        'Accept-Language': 'en-US,en;q=0.9,fr;q=0.8',
    }

    data = {
      'identifierCategory': '',
      'sourceType': '',
      'fromMobile': 'false',
      'id': the_id,
      'timeIntervalId': '0',
      'centerId': center_id,
      'fromDate': the_date,
      'toDate': the_date,
      'type': the_type_code
    }
    # Request the data; without a timeout a stalled connection hangs forever
    response = requests.post(
        base_url + '/dashBoard/barChart',
        headers=headers,
        cookies=cookies,
        data=data,
        timeout=timeout,
    )
    # Fail loudly on HTTP errors rather than parsing an error page as "no data"
    response.raise_for_status()
    # Process the response
    df = process_211_lifeline_html(geo_id, response.text, datetime_date)
    # Skip empty results: appending an empty frame adds no rows, and if the
    # table did not exist yet it would be created with an incomplete schema
    if df.empty:
        return
    # Save the processed data
    df.to_sql("totals", db, if_exists="append", index=False)

In [2]:
# Initialize DB
if initialize_db:
    # Start from a clean slate: remove any database file left by an earlier run
    if os.path.exists(database_name):
        os.unlink(database_name)

    # Schema of the table that the scraper appends to
    create_table_sql = '''CREATE TABLE "totals" (
        "category_id" BIGINT,
        "parent_category_id" BIGINT DEFAULT NULL,
        "category" TEXT,
        "percentage" FLOAT,
        "count" BIGINT,
        "date" TEXT,
        "geography_id" TEXT
    );'''
    create_index_sql = '''CREATE INDEX "ix_totals_index" ON "totals" ("category_id", "category", "parent_category_id", "date", "geography_id");'''

    con = sqlite3.connect(database_name)
    try:
        cur = con.cursor()
        cur.execute(create_table_sql)
        cur.execute(create_index_sql)
        con.commit()
    finally:
        # Always release the database file, even if a statement above fails
        con.close()

In [None]:
# Engine used by pandas to append the scraped rows to the SQLite file
db = create_engine("sqlite:///" + database_name)

# One request per (day, geography).  daterange() excludes end_date itself.
for single_date in daterange(start_date, end_date):
    print("Getting data for " + str(single_date))
    # The formatted date is the same for every geography, so build it once per day
    the_date = get_211_date(single_date)
    for geo_id, g in geos.items():
        the_id = g["the_id"]
        the_type_code = g["type_code"]
        scrape_and_save_211_count_data(the_id, the_type_code, the_date, single_date, geo_id, db)

Getting data for 2014-08-01
Getting data for 2014-08-02
Getting data for 2014-08-03
Getting data for 2014-08-04
Getting data for 2014-08-05
Getting data for 2014-08-06
Getting data for 2014-08-07
Getting data for 2014-08-08
Getting data for 2014-08-09
Getting data for 2014-08-10
Getting data for 2014-08-11
Getting data for 2014-08-12
Getting data for 2014-08-13
