In [1]:
## imports 
import pandas as pd
import numpy as np
import yaml
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# comment these out if you don't have plotnine--not essential here/only used once
import matplotlib.pyplot as plt
# import plotnine
# from plotnine import *

## way to connect to mysql 
## if you need to install
## uncomment this line:
#! pip install mysql-connector-python
import mysql.connector

## function to feed path name to load
## credentials
def load_creds(path: str):
    with open(path, 'r') as stream:
        try:
            creds = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return(creds)

pd.options.display.max_rows = 999
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Preliminary: define connection and read sample of data

In [4]:
## read in creds; change the path name if stored
## elsewhere
creds = load_creds("09_db_cred.yaml")

# Activity 1

1. Create a new column -- `in_chicago` when pulling from the `caseinit` table that takes on the value of "YES" if INCIDENT_CITY = Chicago; "NO" otherwise (which represents incidents in Cook County suburbs outside the city limits);  and pull the table. Use `crosstabs` to confirm that this worked
2. Repeat step 1 but also filter out blank strings (`INCIDENT_CITY` == "")
3. Use `where` to row filter to initiations in Chicago and use group by to find the count of cases diverted and not diverted (`is_in_diversion`); pull the table with those counts
4. Modify the query in step 3 to find the proportion of cases in chicago diverted (hint you made need to use case when in a subquery)
5. Modify the query in step 4 to find the proportion of cases in chicago versus cases not in chicago sent to diversion 


In [12]:
# your code here 1
# Connect to database
cnx = mysql.connector.connect(user=creds['practice_database']['user'], 
                            password=creds['practice_database']['password'],
                            port=creds['practice_database']['port'],
                            database= creds['practice_database']['database'],
                            host = creds['practice_database']['host'])
cnx

new_col_q = """
SELECT INCIDENT_CITY,
       CASE
         WHEN INCIDENT_CITY = 'Chicago' THEN 'YES'
         ELSE 'NO'
       END AS in_chicago
FROM   caseinit 
"""
in_chicago_df = pd.read_sql_query(new_col_q, cnx)

pd.crosstab(in_chicago_df["in_chicago"], in_chicago_df["INCIDENT_CITY"])




<mysql.connector.connection.MySQLConnection at 0x1d9e68890>

INCIDENT_CITY,Unnamed: 1_level_0,Addison,Albers,Algonquin,Alsip,Antioch,Arlington Heights,Aurora,Barrington,Barrington Hills,...,Westmont,Wheaton,Wheeling,Willow Springs,Willowbrook,Wilmette,Winnetka,Wood Dale,Woodstock,Worth
in_chicago,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NO,20557,2,1,3,568,2,1291,3,136,16,...,1,4,1094,82,2,333,129,3,1,284
YES,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# your code here 2

updated_new_col_q = """
SELECT INCIDENT_CITY,
       CASE
         WHEN INCIDENT_CITY = 'Chicago' THEN 'YES'
         ELSE 'NO'
       END AS in_chicago
FROM   caseinit 
WHERE INCIDENT_CITY != ''
LIMIT 20
"""
updated_df = pd.read_sql_query(updated_new_col_q, cnx)
updated_df[["INCIDENT_CITY", "in_chicago"]].sample(10)





Unnamed: 0,INCIDENT_CITY,in_chicago
13,Chicago,YES
12,Chicago,YES
18,Calumet City,NO
15,Chicago,YES
11,Chicago,YES
4,Chicago,YES
14,Chicago,YES
16,Chicago,YES
9,Skokie,NO
2,Morton Grove,NO


In [14]:
# your code here 3
query = """
SELECT is_in_diversion,
       COUNT(*) AS count_cases
FROM caseinit
WHERE INCIDENT_CITY = 'Chicago'
GROUP BY is_in_diversion
"""
chicago_diversion_counts = pd.read_sql_query(query, cnx)
chicago_diversion_counts

Unnamed: 0,is_in_diversion,count_cases
0,False,167171
1,True,6402


In [16]:
# your code here 4
new_query = """
SELECT 
  SUM(CASE WHEN is_in_diversion IN ('TRUE', 'Yes', 'yes', 'true') THEN 1 ELSE 0 END) * 1.0 / COUNT(*) AS proportion_diverted
FROM caseinit
WHERE INCIDENT_CITY = 'Chicago'
"""
chicago_diversion_prop_counts = pd.read_sql_query(new_query, cnx)
chicago_diversion_prop_counts

Unnamed: 0,proportion_diverted
0,0.03688


In [18]:
# your code here 5
# Modify the query in step 4 to find the proportion of cases in chicago versus cases not in chicago sent to diversion 
# code from 4 below
new_query = """
SELECT 
  CASE 
    WHEN INCIDENT_CITY = 'Chicago' THEN 'Chicago'
    ELSE 'Not Chicago'
  END AS in_chicago,
  SUM(CASE WHEN is_in_diversion IN ('TRUE', 'Yes', 'yes', 'true') THEN 1 ELSE 0 END) * 1.0 / COUNT(*) AS proportion_diverted
FROM caseinit
WHERE INCIDENT_CITY IS NOT NULL AND INCIDENT_CITY != ''
GROUP BY 
  CASE 
    WHEN INCIDENT_CITY = 'Chicago' THEN 'Chicago'
    ELSE 'Not Chicago'
  END
"""
chicago_vs_not_diversion_prop_counts = pd.read_sql_query(new_query, cnx)
chicago_vs_not_diversion_prop_counts

Unnamed: 0,in_chicago,proportion_diverted
0,Not Chicago,0.04695
1,Chicago,0.03688


# Activity 2 

1. Use the following crosswalk and the `CASE` variable in the `divert` table to create a new variable `DIVERSION_PROGRAM_TEXT` that spells out the diversion programs
    - DC: Drug Court

    - DDPP: Drug Deferred Prosecution

    - DS: Drug School

    - RJCC: Restorative Justice

    - MHC: Mental Health Court

    - VC: Veteran Court

2. Build on the query from step 1 to filter to Narcotics as the `UPDATED_OFFENSE_CATEGORY` and Black or White defendants (based on race in the diversions table) (hint: you'll need to join with the caseinit table based on case_id and case_participant_id, you can do a inner join to keep only those diverted). Select the case_id, case_participant_id, case, race, and diversion_program_text columns

In [None]:
# your code here 1
q = """
SELECT *,
    CASE WHEN DIVERSION_PROGRAM = 'DC' THEN 'Drug Court'
    WHEN DIVERSION_PROGRAM = 'DDPP' THEN 'Drug Deferred Prosecution'
    WHEN DIVERSION_PROGRAM = 'DS' THEN 'Drug School'
    WHEN DIVERSION_PROGRAM = 'RJCC' THEN 'Restorative Justice'
    WHEN DIVERSION_PROGRAM = 'MHC' THEN 'Mental Health Court'
    WHEN DIVERSION_PROGRAM = 'VC' THEN 'Veteran Court'
    ELSE 'Other'
    

"""

In [None]:
# your code here 2