## Final Project Part - II

### In this part, we will be creating tables and loading data into the tables using the ERD in Part - I.

In this project, we will explore Chicago Crime Dataset and implement a relational database for storing the data. During the final week, you will complete Tasks 3-5:

1. Identify the features (attributes) in Chicago Crime dataset and design an entity-relationship model
2. Refine the model and convert each relation to BCNF (if required)
3. Using DDL implement the relations in a Postgres server
4. Load the given data to the relations
5. Execute some interesting queries on the relations

In [120]:
import pandas as pd 

In [121]:
chi = pd.read_csv('Chicago-Crime-Sample-2012.csv')
list(chi.columns)

['Unnamed: 0',
 'ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

In [122]:
chi = chi.rename(columns={'Location Description' : 'location_description', 'Primary Type' : 'primary_type', 'Case Number' : 'case_number', 'X Coordinate': 'x_coordinate', 'Y Coordinate' : 'y_coordinate', 'Updated On':'updated_on', 'FBI Code':'fbi_code','Community Area' : 'community_area'})

In [123]:
chi.columns = map(str.lower, chi.columns)

In [25]:
list(chi.columns)

['unnamed: 0',
 'id',
 'case_number',
 'date',
 'block',
 'iucr',
 'primary_type',
 'description',
 'location_description',
 'arrest',
 'domestic',
 'beat',
 'district',
 'ward',
 'community_area',
 'fbi_code',
 'x_coordinate',
 'y_coordinate',
 'year',
 'updated_on',
 'latitude',
 'longitude',
 'location']

In [26]:
chi = chi.rename(columns={'unnamed: 0' : 'unnamed_id'})
chi.head()

Unnamed: 0,unnamed_id,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,...,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,47398,10433096,HZ170962,1/1/2012 0:00,026XX N MC VICKER AVE,1562,SEX OFFENSE,AGG CRIMINAL SEXUAL ABUSE,RESIDENCE,True,...,29.0,19.0,17,,,2012,5/11/2016 15:48,,,
1,47420,10433124,HZ170983,1/1/2012 0:00,026XX N MC VICKER AVE,1544,SEX OFFENSE,SEXUAL EXPLOITATION OF A CHILD,RESIDENCE,True,...,29.0,19.0,17,,,2012,5/11/2016 15:48,,,
2,802910,10532867,HZ276514,1/1/2012 0:00,036XX S RHODES AVE,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,APARTMENT,False,...,4.0,35.0,17,,,2012,5/26/2016 15:51,,,
3,803605,10536876,HZ280873,1/1/2012 0:00,062XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,...,15.0,66.0,11,,,2012,5/27/2016 15:48,,,
4,831733,9581929,HX232501,1/1/2012 0:00,006XX W 66TH ST,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,RESIDENCE,False,...,6.0,68.0,17,,,2012,8/17/2015 15:03,,,


In [29]:
chi['lb_id']  = pd.factorize(chi['block'])[0]
chi['ld_id'] = pd.factorize(chi['location_description'])[0]

In [18]:
chi['ld_id'] = pd.factorize(chi['location_description'])[0]

In [156]:
chi = chi.drop(['la_id'], axis = 1 )

In [27]:
def make_identifier(df):
    str_id = df.apply(lambda x: '_'.join(map(str, x)), axis=1)
    return pd.factorize(str_id)[0]

In [157]:
chi['la_id'] = make_identifier(chi[['beat','district','ward','community_area']])

In [29]:
chi['lc_id'] = make_identifier(chi[['x_coordinate','y_coordinate','latitude','longitude','location']])

In [163]:
chi['cl_id'] = make_identifier(chi[['la_id','lb_id','lc_id','ld_id']])

In [40]:
location_block = chi[['lb_id','block']]
location_block = location_block.drop_duplicates()

In [33]:
location_description = chi[['ld_id', 'location_description']]
location_description = location_description.drop_duplicates()
location_description
location_description.location_description.unique()

array(['RESIDENCE', 'APARTMENT', 'OTHER', 'CURRENCY EXCHANGE',
       'RESIDENCE PORCH/HALLWAY', 'SIDEWALK', 'BAR OR TAVERN',
       'VEHICLE NON-COMMERCIAL', 'STREET', 'HOTEL/MOTEL',
       'COMMERCIAL / BUSINESS OFFICE', 'RESIDENCE-GARAGE',
       'TAVERN/LIQUOR STORE', 'BANK', 'RESIDENTIAL YARD (FRONT/BACK)',
       'ATM (AUTOMATIC TELLER MACHINE)', 'PARKING LOT/GARAGE(NON.RESID.)',
       'GAS STATION', 'SMALL RETAIL STORE', 'DEPARTMENT STORE',
       'PARK PROPERTY', 'POLICE FACILITY/VEH PARKING LOT',
       'MEDICAL/DENTAL OFFICE', 'ABANDONED BUILDING', 'DAY CARE CENTER',
       'CTA TRAIN', 'VEHICLE-COMMERCIAL', 'CTA PLATFORM', 'ALLEY',
       'CHA APARTMENT', 'GROCERY FOOD STORE', 'RESTAURANT', 'CEMETARY',
       'CONVENIENCE STORE', 'CHURCH/SYNAGOGUE/PLACE OF WORSHIP',
       'POOL ROOM', 'SCHOOL, PRIVATE, GROUNDS', 'CTA BUS',
       'DRIVEWAY - RESIDENTIAL', 'CTA GARAGE / OTHER PROPERTY', 'TAXICAB',
       'CAR WASH', 'SPORTS ARENA/STADIUM', 'VACANT LOT/LAND',
       'ANIMAL 

In [56]:
location_coordinate = chi[['lc_id','x_coordinate','y_coordinate','latitude','longitude','location']]
location_coordinate = location_coordinate.drop_duplicates()
location_coordinate.dtypes
location_coordinate.head()

Unnamed: 0,lc_id,x_coordinate,y_coordinate,latitude,longitude,location
0,0,,,,,
16,1,1150568.0,1905409.0,41.896342,-87.722441,"(41.896341861, -87.722440573)"
17,2,1145387.0,1901385.0,41.885399,-87.741572,"(41.8853992, -87.741571528)"
18,3,1170335.0,1922325.0,41.942351,-87.649345,"(41.942351064, -87.649344776)"
19,4,1158836.0,1861215.0,41.774903,-87.693283,"(41.774902869, -87.693283132)"


In [64]:
crime_record = chi[['id','iucr','fbi_code','cl_id','unnamed_id','case_number','arrest','domestic','date','year','updated_on']]
crime_record = crime_record.drop_duplicates()
crime_record.head()

Unnamed: 0,id,iucr,fbi_code,cl_id,unnamed_id,case_number,arrest,domestic,date,year,updated_on
0,10433096,1562,17,0,47398,HZ170962,True,False,1/1/2012 0:00,2012,5/11/2016 15:48
1,10433124,1544,17,0,47420,HZ170983,True,False,1/1/2012 0:00,2012,5/11/2016 15:48
2,10532867,1563,17,1,802910,HZ276514,False,False,1/1/2012 0:00,2012,5/26/2016 15:51
3,10536876,1153,11,2,803605,HZ280873,False,False,1/1/2012 0:00,2012,5/27/2016 15:48
4,9581929,1563,17,3,831733,HX232501,False,True,1/1/2012 0:00,2012,8/17/2015 15:03


In [128]:
crime_code = chi[['iucr','fbi_code','description','primary_type']]
crime_code =crime_code.drop_duplicates()
crime_code.fbi_code.unique()
crime_code[crime_code['description'].str.contains('CANNABIS')]

Unnamed: 0,iucr,fbi_code,description,primary_type
272,1812,18,POSS: CANNABIS MORE THAN 30GMS,NARCOTICS
320,1811,18,POSS: CANNABIS 30GMS OR LESS,NARCOTICS
2431,1821,18,MANU/DEL:CANNABIS 10GM OR LESS,NARCOTICS
2861,1822,18,MANU/DEL:CANNABIS OVER 10 GMS,NARCOTICS
16923,1850,18,CANNABIS PLANT,NARCOTICS
32323,2094,18,ATTEMPT POSSESSION CANNABIS,NARCOTICS
205967,1840,18,DELIVER CANNABIS TO PERSON <18,NARCOTICS


In [128]:
crime_code17investigation = crime_code[crime_code['fbi_code'] == '17']
crime_code17investigation

Unnamed: 0,iucr,fbi_code,description,primary_type
0,1562,17,AGG CRIMINAL SEXUAL ABUSE,SEX OFFENSE
1,1544,17,SEXUAL EXPLOITATION OF A CHILD,SEX OFFENSE
2,1563,17,CRIMINAL SEXUAL ABUSE,SEX OFFENSE
160,1565,17,INDECENT SOLICITATION/CHILD,SEX OFFENSE
249,1582,17,CHILD PORNOGRAPHY,OFFENSE INVOLVING CHILDREN
955,1564,17,CRIMINAL TRANSMISSION OF HIV,SEX OFFENSE
1380,2830,17,OBSCENE TELEPHONE CALLS,OTHER OFFENSE
3765,1576,17,BIGAMY,SEX OFFENSE
4077,1570,17,PUBLIC INDECENCY,SEX OFFENSE
5501,1541,17,SALE/DIST OBSCENE MAT TO MINOR,OBSCENITY


In [158]:
location_area = chi[['la_id','beat','district','ward','community_area']]
location_area =location_area.drop_duplicates()
location_area.head()


Unnamed: 0,la_id,beat,district,ward,community_area
23,22,713,7,16.0,67.0


In [None]:
crime_location = chi[['cl_id','la_id','lb_id','lc_id','ld_id']]
crime_location = crime_location.drop_duplicates()


## 2.1 Using DDL, create each of the relations in the postgres server. 

* Use the ERD from Module 7 to create tables (DDL).
* Add additional cells if required.
* Make sure you have good documentation for each table to explain what attributes are and anything that are worth noting.

In [1]:
import getpass
# This collects a masked password from the user
mypasswd = getpass.getpass()

········


In [154]:
del(mypasswd)

In [170]:
import psycopg2
from psycopg2.extensions import adapt, register_adapter, AsIs
connection = psycopg2.connect(database = 'dsa_student', 
                              user = 'may96c', 
                              host = 'pgsql.dsa.lan',
                              password = mypasswd)

cursor = connection.cursor()

In [53]:
import numpy as np 

In [65]:
sqlCreateTable = """ CREATE TABLE may96c.location_block (
    lb_id INT PRIMARY KEY,
    block VARCHAR);"""
cursor.execute(sqlCreateTable)

connection.commit()

In [66]:
sqlCreateTable = """ CREATE TABLE may96c.location_description ( 
    ld_id INT PRIMARY KEY,
    location_description VARCHAR);"""
cursor.execute(sqlCreateTable)

connection.commit()

In [67]:
sqlCreateTable = """ CREATE TABLE may96c.location_coordinate ( 
    lc_id INT PRIMARY KEY,
    x_coordinate NUMERIC,
    y_coordinate NUMERIC,
    latitude NUMERIC,
    longitude NUMERIC,
    location VARCHAR);"""
cursor.execute(sqlCreateTable)

connection.commit()

In [145]:
sqlCreateTable = """ CREATE TABLE IF NOT EXISTS may96c.crime_code ( 
    iucr VARCHAR ,
    fbi_code VARCHAR,
    description VARCHAR,
    primary_type VARCHAR,
    PRIMARY KEY(fbi_code,iucr));"""
cursor.execute(sqlCreateTable)

connection.commit()


In [69]:
sqlCreateTable = """ CREATE TABLE may96c.location_area ( 
    la_id INT PRIMARY KEY, 
    beat INT,
    district INT,
    ward INT,
    community_area INT );"""
    
cursor.execute(sqlCreateTable)

connection.commit()

In [78]:
sqlCreateTable = """ CREATE TABLE may96c.crime_location (
    cl_id INT PRIMARY KEY,
    la_id INT,
    lb_id INT,
    lc_id INT,
    ld_id INT,
    CONSTRAINT fk_cl_a 
        FOREIGN KEY (la_id)
        REFERENCES may96c.location_area(la_id),
    CONSTRAINT fk_cl_b
        FOREIGN KEY(lb_id)
        REFERENCES may96c.location_block(lb_id),
    CONSTRAINT fk_cl_c
        FOREIGN KEY (lc_id)
        REFERENCES may96c.location_coordinate(lc_id),
    CONSTRAINT fk_cl_d
        FOREIGN KEY (ld_id)
        REFERENCES may96c.location_description(ld_id));"""
cursor.execute(sqlCreateTable)

connection.commit()

In [172]:
sqlCreateTable = """ CREATE TABLE may96c.crime_record (
    id INT PRIMARY KEY,
    iucr VARCHAR,
    fbi_code VARCHAR,
    cl_id INT, 
    unnamed_id INT,
    case_number VARCHAR,
    arrest VARCHAR,
    domestic VARCHAR,
    date TIMESTAMP,
    year INT,
    updated_on TIMESTAMP,
    CONSTRAINT cr_fk_a
        FOREIGN KEY(iucr,fbi_code)
        REFERENCES may96c.crime_code(iucr,fbi_code),
    CONSTRAINT cr_fk_c
        FOREIGN KEY (cl_id)
        REFERENCES may96c.crime_location(cl_id));"""
cursor.execute(sqlCreateTable)

connection.commit()
    
    

## 2.2 Show the table's definition for each table using psql or querying the information_schema.colums catalog

* Add additional cells if required

In [3]:
SSO = 'may96c'
hostname ='pgsql.dsa.lan'
database ='dsa_student'







dsa_student=> \d may96c.location_block ;
                Table "may96c.location_block"
 Column |       Type        | Collation | Nullable | Default
--------+-------------------+-----------+----------+---------
 lb_id  | integer           |           | not null |
 block  | character varying |           |          |
Indexes:
    "location_block_pkey" PRIMARY KEY, btree (lb_id)
Referenced by:
    TABLE "crime_location" CONSTRAINT "fk_cl_b" FOREIGN KEY (lb_id) REFERENCES location_block(lb_id)

dsa_student=> \d may96c.location_description ;
                    Table "may96c.location_description"
        Column        |       Type        | Collation | Nullable | Default
----------------------+-------------------+-----------+----------+---------
 ld_id                | integer           |           | not null |
 location_description | character varying |           |          |
Indexes:
    "location_description_pkey" PRIMARY KEY, btree (ld_id)
Referenced by:
    TABLE "crime_location" CONSTRAINT "fk_cl_d" FOREIGN KEY (ld_id) REFERENCES location_description(ld_id)

dsa_student=> \d may96c.location_coordinate ;
                Table "may96c.location_coordinate"
    Column    |       Type        | Collation | Nullable | Default
--------------+-------------------+-----------+----------+---------
 lc_id        | integer           |           | not null |
 x_coordinate | numeric           |           |          |
 y_coordinate | numeric           |           |          |
 latitude     | numeric           |           |          |
 longitude    | numeric           |           |          |
 location     | character varying |           |          |
Indexes:
    "location_coordinate_pkey" PRIMARY KEY, btree (lc_id)
Referenced by:
    TABLE "crime_location" CONSTRAINT "fk_cl_c" FOREIGN KEY (lc_id) REFERENCES location_coordinate(lc_id)


dsa_student=> \d may96c.crime_code
                     Table "may96c.crime_code"
    Column    |       Type        | Collation | Nullable | Default
--------------+-------------------+-----------+----------+---------
 iucr         | character varying |           | not null |
 fbi_code     | character varying |           | not null |
 description  | character varying |           |          |
 primary_type | character varying |           |          |
Indexes:
    "crime_code_pkey" PRIMARY KEY, btree (fbi_code, iucr)

dsa_student=> \d may96c.location_area ;
               Table "may96c.location_area"
     Column     |  Type   | Collation | Nullable | Default
----------------+---------+-----------+----------+---------
 la_id          | integer |           | not null |
 beat           | integer |           |          |
 district       | integer |           |          |
 ward           | integer |           |          |
 community_area | integer |           |          |
Indexes:
    "location_area_pkey" PRIMARY KEY, btree (la_id)
Referenced by:
    TABLE "crime_location" CONSTRAINT "fk_cl_a" FOREIGN KEY (la_id) REFERENCES location_area(la_id)

dsa_student=> \d may96c.crime_location;
           Table "may96c.crime_location"
 Column |  Type   | Collation | Nullable | Default
--------+---------+-----------+----------+---------
 cl_id  | integer |           | not null |
 la_id  | integer |           |          |
 lb_id  | integer |           |          |
 lc_id  | integer |           |          |
 ld_id  | integer |           |          |
Indexes:
    "crime_location_pkey" PRIMARY KEY, btree (cl_id)
Foreign-key constraints:
    "fk_cl_a" FOREIGN KEY (la_id) REFERENCES location_area(la_id)
    "fk_cl_b" FOREIGN KEY (lb_id) REFERENCES location_block(lb_id)
    "fk_cl_c" FOREIGN KEY (lc_id) REFERENCES location_coordinate(lc_id)
    "fk_cl_d" FOREIGN KEY (ld_id) REFERENCES location_description(ld_id)
Referenced by:
    TABLE "crime_record" CONSTRAINT "cr_fk_c" FOREIGN KEY (cl_id) REFERENCES crime_location(cl_id)

dsa_student=> \d may96c.crime_record
                        Table "may96c.crime_record"
   Column    |            Type             | Collation | Nullable | Default
-------------+-----------------------------+-----------+----------+---------
 id          | integer                     |           | not null |
 iucr        | character varying           |           |          |
 fbi_code    | character varying           |           |          |
 cl_id       | integer                     |           |          |
 unnamed_id  | integer                     |           |          |
 case_number | character varying           |           |          |
 arrest      | character varying           |           |          |
 domestic    | character varying           |           |          |
 date        | timestamp without time zone |           |          |
 year        | integer                     |           |          |
 updated_on  | timestamp without time zone |           |          |
Indexes:
    "crime_record_pkey" PRIMARY KEY, btree (id)
Foreign-key constraints:
    "cr_fk_a" FOREIGN KEY (iucr, fbi_code) REFERENCES crime_code(iucr, fbi_code)
    "cr_fk_c" FOREIGN KEY (cl_id) REFERENCES crime_location(cl_id)

## 2.3 Load the data from the given csv file in Part 1 to the relations

* Assuming there will be multiple relations, you need to extract a subsets of data from the original csv data. As Python may not be your first choice, you can use any languages to create subsets of data. Then store these data into the Module 8 exercises folder. 
* After curating the data use the load loading skillset gained from Module 6
  * Copy and paste your command/query in the following cell or comment on what you have done to populate the data.
  
* Add additional cells if required

In [106]:
print(list(location_block))
s = ''
for i in list(location_block):
    s += '%s,'
print(s)

['lb_id', 'block']
%s,%s,


In [110]:
register_adapter(np.int64,AsIs)
register_adapter(np.float64,AsIs)
location_block = location_block.where(pd.notnull(location_block), None)
INSERT_SQL = 'INSERT INTO may96c.location_block '
INSERT_SQL += ' (lb_id,block) VALUES '
INSERT_SQL += '(%s,%s)'

with connection, connection.cursor() as cursor:
    
    for row in location_block.itertuples(index=False, name=None):  
        
        cursor.execute(INSERT_SQL,row)

In [111]:
print(list(location_description))
s = ''
for i in list(location_description):
    s += '%s,'
print(s)

['ld_id', 'location_description']
%s,%s,


In [112]:
register_adapter(np.int64,AsIs)
register_adapter(np.float64,AsIs)
location_description = location_description.where(pd.notnull(location_description), None)
INSERT_SQL = 'INSERT INTO may96c.location_description '
INSERT_SQL += ' (ld_id,location_description) VALUES '
INSERT_SQL += '(%s,%s)'

with connection, connection.cursor() as cursor:
    
    for row in location_description.itertuples(index=False, name=None):  
        
        cursor.execute(INSERT_SQL,row)

In [113]:
print(list(location_coordinate))
s = ''
for i in list(location_coordinate):
    s += '%s,'
print(s)

['lc_id', 'x_coordinate', 'y_coordinate', 'latitude', 'longitude', 'location']
%s,%s,%s,%s,%s,%s,


In [114]:
register_adapter(np.int64,AsIs)
register_adapter(np.float64,AsIs)
location_coordinate= location_coordinate.where(pd.notnull(location_coordinate), None)
INSERT_SQL = 'INSERT INTO may96c.location_coordinate '
INSERT_SQL += ' (lc_id,x_coordinate,y_coordinate,latitude,longitude,location) VALUES '
INSERT_SQL += '(%s,%s,%s,%s,%s,%s)'

with connection, connection.cursor() as cursor:
    
    for row in location_coordinate.itertuples(index=False, name=None):  
        
        cursor.execute(INSERT_SQL,row)

In [115]:
print(list(crime_code))
s = ''
for i in list(crime_code):
    s += '%s,'
print(s)

['iucr', 'fbi_code', 'description', 'primary_type']
%s,%s,%s,%s,


In [146]:
register_adapter(np.int64,AsIs)
register_adapter(np.float64,AsIs)
crime_code= crime_code.where(pd.notnull(crime_code), None)
INSERT_SQL = 'INSERT INTO may96c.crime_code '
INSERT_SQL += ' (iucr,fbi_code, description, primary_type) VALUES '
INSERT_SQL += '(%s,%s,%s,%s)'

with connection, connection.cursor() as cursor:
    
    for row in crime_code.itertuples(index=False, name=None):  
        
        cursor.execute(INSERT_SQL,row)

In [147]:
print(list(location_area))
s = ''
for i in list(location_area):
    s += '%s,'
print(s)

['la_id', 'beat', 'district', 'ward', 'community_area']
%s,%s,%s,%s,%s,


In [160]:
register_adapter(np.int64,AsIs)
register_adapter(np.float64,AsIs)
location_area= location_area.where(pd.notnull(location_area), None)
INSERT_SQL = 'INSERT INTO may96c.location_area '
INSERT_SQL += ' (la_id,beat,district,ward,community_area) VALUES '
INSERT_SQL += '(%s,%s,%s,%s,%s)'

with connection, connection.cursor() as cursor:
    
    for row in location_area.itertuples(index=False, name=None):  
        
        cursor.execute(INSERT_SQL,row)

In [161]:
print(list(crime_location))
s = ''
for i in list(crime_location):
    s += '%s,'
print(s)

['cl_id', 'la_id', 'lb_id', 'lc_id', 'ld_id']
%s,%s,%s,%s,%s,


In [166]:
register_adapter(np.int64,AsIs)
register_adapter(np.float64,AsIs)
crime_location= crime_location.where(pd.notnull(crime_location), None)
INSERT_SQL = 'INSERT INTO may96c.crime_location '
INSERT_SQL += ' (cl_id,la_id,lb_id,lc_id,ld_id) VALUES '
INSERT_SQL += '(%s,%s,%s,%s,%s)'

with connection, connection.cursor() as cursor:
    
    for row in crime_location.itertuples(index=False, name=None):  
        
        cursor.execute(INSERT_SQL,row)

In [167]:
print(list(crime_record))
s = ''
for i in list(crime_record):
    s += '%s,'
print(s)

['id', 'iucr', 'fbi_code', 'cl_id', 'unnamed_id', 'case_number', 'arrest', 'domestic', 'date', 'year', 'updated_on']
%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,


In [173]:
register_adapter(np.int64,AsIs)
register_adapter(np.float64,AsIs)
crime_record= crime_record.where(pd.notnull(crime_record), None)
INSERT_SQL = 'INSERT INTO may96c.crime_record'
INSERT_SQL += ' (id,iucr,fbi_code,cl_id,unnamed_id,case_number,arrest,domestic,date,year,updated_on) VALUES '
INSERT_SQL += '(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'

with connection, connection.cursor() as cursor:
    
    for row in crime_record.itertuples(index=False, name=None):  
        
        cursor.execute(INSERT_SQL,row)

## 2.4 For each table, show the table name and number of rows using a sql query

* Add additional cells if required

In [2]:
SSO = 'may96c'
hostname = 'pgsql.dsa.lan'
database = 'dsa_student'

In [3]:
%load_ext sql
connection_str = "postgres://{username}:{password}@{hostname}/{database}".format(
    username=SSO, password=mypasswd, hostname=hostname, database=database
)
%sql $connection_str

'Connected: may96c@dsa_student'

In [17]:
%%sql
select table_schema, 
       table_name, 
       (xpath('/row/cnt/text()', xml_count))[1]::text::int as row_count
from (
  select table_name, table_schema, 
         query_to_xml(format('select count(*) as cnt from %I.%I', table_schema, table_name), false, true, '') as xml_count
  from information_schema.tables
  where table_schema = 'may96c' AND table_name = 'location_block' OR table_name = 'crime_record' OR table_name = 'crime_code' OR table_name = 'location_area' OR table_name = 'location_description' OR table_name = 'location_coordinate' OR table_name = 'crime_location'
) t order by 3 DESC

 * postgres://may96c:***@pgsql.dsa.lan/dsa_student
7 rows affected.


table_schema,table_name,row_count
may96c,crime_record,334715
may96c,crime_location,243427
may96c,location_coordinate,162286
may96c,location_block,28340
may96c,location_area,1023
may96c,crime_code,319
may96c,location_description,98


## 2.5 Develop five distinct and useful queries that could be potentially used for policing planning, policy making, citizen awareness, etc.

* Queries should provide some analytic values and use your SQL skillset beyond simple SELECT-FROM-WHERE using multiple tables. I expect to see GROUP BY/HAVING, Nested Queries, Aggregation Operators, etc.
* Each query should have documentation to explain what this query is looking for and why it is meaningful for what purposes.

### Best final projects will be given to three students (two online and one on campus) who will join me for beers at Flat Branch (local brewery in Columbia, Missouri) during the Executive Week in March 2022 (if pandemic situation is OK) or 2023 (I seriously think it should be OK by then). 

* five pairs of cells are expected. Your output presentations have to be self explainable. (hint: use AS)

# What is the query trying to achieve - use natural language
This query returns the average, minimum, and maximum arrests made on cannabis related charges per district in the year of 2012. This data could be taken into consideration when considering cannabis decriminalization in chicago. 

In [153]:
%%sql

SELECT AVG(amount_of_cannabis_related_arrests) as avg_cannabis_arrests_per_district_2012, MIN(amount_of_cannabis_related_arrests) as min_cannabis_arrests_per_district_2012, MAX(amount_of_cannabis_related_arrests) as max_cannabis_arrests_per_district_2012
    FROM (
        SELECT COUNT(*) as amount_of_cannabis_related_arrests, district
        FROM crime_record cr JOIN crime_location cl ON cr.cl_id = cl.cl_id JOIN location_area la ON cl.la_id = la.la_id JOIN crime_code cc ON cc.fbi_code = cr.fbi_code AND cc.iucr = cr.iucr 
        WHERE description LIKE '%CANNABIS%' AND arrest = 'true'
        GROUP BY district, year
        HAVING year = 2012) as cannabis_district_info
        


 * postgres://may96c:***@pgsql.dsa.lan/dsa_student
1 rows affected.


avg_cannabis_arrests_per_district_2012,min_cannabis_arrests_per_district_2012,max_cannabis_arrests_per_district_2012
902.5,334,1580


# What is the query trying to achieve - use natural language
This query finds the most prominent crime type in chicago, based on primary_type description, and then determines how many instances of this type of crime(theft) has occured at each location description. With this information, to minimize the number one type of crime chicago experiences, cops could make an effor to be present at location types that are higher on the list (implying more theft occurs at these types of spots)

In [112]:
%%sql
SELECT location_description as location_type, count(*) amount_of_most_prominent_crime_type_that_occurs_at_this_location, primary_type as most_prominent_crime
FROM crime_code cc JOIN crime_record cr ON cc.fbi_code = cr.fbi_code AND cc.iucr = cr.iucr JOIN crime_location cl ON cr.cl_id = cl.cl_id JOIN location_description ld ON cl.ld_id = ld.ld_id
WHERE primary_type IN ( SELECT primary_type FROM (
SELECT count(*), primary_type
FROM crime_code cc JOIN crime_record cr ON cc.fbi_code = cr.fbi_code AND cc.iucr = cr.iucr
GROUP BY primary_type
ORDER BY count(*) desc 
LIMIT 1  ) as cool )
group by location_description, primary_type
ORDER BY COUNT(*) DESC


 * postgres://may96c:***@pgsql.dsa.lan/dsa_student
97 rows affected.


location_type,amount_of_most_prominent_crime_type_that_occurs_at_this_locatio,most_prominent_crime
STREET,16449,THEFT
RESIDENCE,11287,THEFT
SIDEWALK,7510,THEFT
APARTMENT,7401,THEFT
OTHER,3150,THEFT
DEPARTMENT STORE,2755,THEFT
PARKING LOT/GARAGE(NON.RESID.),2458,THEFT
SMALL RETAIL STORE,2079,THEFT
RESIDENTIAL YARD (FRONT/BACK),1987,THEFT
GROCERY FOOD STORE,1900,THEFT


# What is the query trying to achieve - use natural language
In the two cells below, I identified quartiles based on the amount of narcotic crimes according to location area, I used this so I could pull location areas that have an amount of narcotic crimes that is above the median per location_area. Essentially, I identified trouble areas in terms of narcotics. I then identified the count of manufacturing and delivering crimes based on blocks within these troubled location areas. This is so the police force can see trends in these trobuled areas and potentially work to hault the delivery/manufacturing of harmful substances 

In [30]:
%%sql
SELECT percentile_cont(.25) WITHIN GROUP (ORDER BY narcotic_count),
        percentile_cont(.5) WITHIN GROUP (ORDER BY narcotic_count),
        percentile_cont(.75) WITHIN GROUP (ORDER BY narcotic_count),
        percentile_cont(1) WITHIN GROUP (ORDER BY narcotic_count)
        FROM ( SELECT count(*) as narcotic_count, cl.la_id
                FROM crime_code cc JOIN crime_record cr ON cc.fbi_code = cr.fbi_code AND cc.iucr = cr.iucr JOIN crime_location cl ON cr.cl_id = cl.cl_id JOIN location_block lb ON cl.lb_id = lb.lb_id
                WHERE primary_type = 'NARCOTICS'                
                  GROUP BY cl.la_id ) narc_count ;
        

 * postgres://may96c:***@pgsql.dsa.lan/dsa_student
1 rows affected.


percentile_cont,percentile_cont_1,percentile_cont_2,percentile_cont_3
6.0,24.0,57.0,427.0


In [27]:
%%sql
SELECT COUNT(*) as number_of_manufacturing_or_delivering_crimes, lb.lb_id as block_id, block, la_id as location_area_with_above_median_narcotic_crimes
FROM crime_code cc JOIN crime_record cr ON cc.fbi_code = cr.fbi_code AND cc.iucr = cr.iucr JOIN crime_location cl ON cr.cl_id = cl.cl_id JOIN location_block lb ON cl.lb_id = lb.lb_id
WHERE description LIKE '%MANU/DELIVER%' AND la_id IN (
    SELECT la_id 
    FROM 
       (SELECT count(*) as narcotic_count, la_id 
        FROM crime_code cc JOIN crime_record cr ON cc.fbi_code = cr.fbi_code AND cc.iucr = cr.iucr JOIN crime_location cl ON cr.cl_id = cl.cl_id JOIN location_block lb ON cl.lb_id = lb.lb_id
        WHERE primary_type = 'NARCOTICS' 
        GROUP BY la_id
        HAVING count(*) > 24 ) as narc_count_filter )
GROUP BY lb.lb_id, block, la_id
ORDER BY count(*) DESC


 * postgres://may96c:***@pgsql.dsa.lan/dsa_student
1591 rows affected.


number_of_manufacturing_or_delivering_crimes,block_id,block,location_area_with_above_median_narcotic_crimes
9,2734,009XX N ST LOUIS AVE,114
8,3770,013XX S CHRISTIANA AVE,10
8,2072,039XX W ROOSEVELT RD,156
7,9872,041XX W WEST END AVE,85
7,1487,048XX W SUPERIOR ST,405
6,2412,010XX N SPRINGFIELD AVE,15
5,11859,028XX N LAKE SHORE DR,188
5,5883,048XX W GLADYS AVE,227
5,2316,034XX W OHIO ST,114
5,7509,037XX W GRENSHAW ST,408


# What is the query trying to achieve - use natural language
This location gives counts of domestic crimes per locations which have been noted as a residence or apartment, this could be used to identify hot spots for domestic abuse, and could be used to provide educational material regarding domestic altercations to the household in question 

In [62]:
%%sql
SELECT count(*) as number_of_domestic_crimes, lc.lc_id as location_id, location
FROM crime_record cr JOIN crime_location cl ON cr.cl_id = cl.cl_id JOIN location_coordinate lc ON cl.lc_id = lc.lc_id JOIN location_description ld ON cl.ld_id = ld.ld_id
WHERE domestic =  'true' AND location_description = 'RESIDENCE' OR location_description = 'APARTMENT' AND lc.lc_id !=0
GROUP BY lc.lc_id, location
ORDER BY count(*) DESC
LIMIT 100

 * postgres://may96c:***@pgsql.dsa.lan/dsa_student
100 rows affected.


number_of_domestic_crimes,location_id,location
69,46580,"(41.762477718, -87.640020433)"
57,51204,"(41.879267767, -87.763667741)"
55,1605,"(41.777309867, -87.640802922)"
49,0,
40,40384,"(41.912025769, -87.776529218)"
34,6398,"(41.969078381, -87.65560801)"
34,5652,"(41.908430535, -87.638509526)"
31,2081,"(41.881088655, -87.720764494)"
31,4231,"(41.705677782, -87.600944364)"
29,40747,"(41.883339174, -87.697581932)"


# What is the query trying to achieve - use natural language
lets say the city of chicago is considering hiring more officers, but aren't sure which district(s) to allocate the new help to. They could choose to do this based on the average crimes per day in the district 

In [22]:
%%sql
SELECT AVG(count_per_day) as avg_crimes_per_day, district
    FROM ( SELECT count(*) as count_per_day, district, date::date 
            FROM crime_record cr JOIN crime_location cl ON cr.cl_id = cl.cl_id JOIN location_area la ON cl.la_id = la.la_id
            GROUP BY date::date, district ) as date_data
GROUP BY district
ORDER BY AVG(count_per_day) DESC

 * postgres://may96c:***@pgsql.dsa.lan/dsa_student
23 rows affected.


avg_crimes_per_day,district
63.538251366120214,8
59.73770491803279,11
56.6775956284153,25
55.62295081967213,7
54.704918032786885,4
52.967213114754095,6
47.057377049180324,3
45.86338797814208,9
45.377049180327866,12
42.87158469945355,5


## End of Final Project

# Save your notebook, then `File > Close and Halt`