## Transforming Data

- Transforming data and construct pivot tables. 
- Combining and spliting data like addresses into city, state, and zip codes using a multitude of powerful functions



In [170]:
cursor.execute("""SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'""")
print('Table in Database:\n')
for table in cursor.fetchall():
       print(table)


Table in Database:

('film_permit',)
('parking_violation',)


In [171]:
%%sql
select COLUMN_NAME
from INFORMATION_SCHEMA.COLUMNS
where TABLE_NAME='parking_violation'

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
43 rows affected.


column_name
summons_number
plate_id
registration_state
plate_type
issue_date
violation_code
vehicle_body_type
vehicle_make
issuing_agency
street_code1


In [172]:
%%sql
SELECT *
FROM parking_violation
LIMIT 10;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


summons_number,plate_id,registration_state,plate_type,issue_date,violation_code,vehicle_body_type,vehicle_make,issuing_agency,street_code1,street_code2,street_code3,vehicle_expiration_date,violation_location,violation_precinct,issuer_precint,issuer_code,issuer_command,issuer_squad,violation_time,time_first_observed,violation_county,violation_in_front_of_or_opposite,house_number,street_name,intersecting_street,date_first_observed,law_section,sub_division,violation_legal_code,days_parking_in_effect,from_hours_in_effect,to_hours_in_effect,vehicle_color,unregistred_vehicle,vehicle_year,meter_number,feet_from_curb,violation_post_code,violation_description,no_standing_or_stopping_violation,hydrant_violation,double_parking_violation
1447634159,HZR6973,NY,PAS,05/03/2019,20,,DODGE,X,0,0,0,20200507,122,122,835,582,835,0,1022PM,,,,,GNRA GREAT KILLS PAR,CEDAR PT BAY SIDE,0,408,D3,,BBBBBBB,0800PM,0500AM,BLA,0,2014,-,0,,,,,
1447672290,HZN3898,NY,PAS,07/13/2019,14,SUBN,JEEP,X,10010,17310,17330,20200628,13,13,839,111169,839,0,0350AM,0005A,NY,F,280.0,FIRST AVENUE,,0,408,D,,BBBBBBB,1000PM,0700AM,GREY,0,2005,-,0,,,,,
1447680091,JDN5893,NY,PAS,07/11/2019,14,SUBN,BMW,X,17270,12110,12210,20201108,13,13,839,111169,839,0,0440AM,0115A,NY,F,545.0,E 14 ST,,20190711,408,C,,BBBBBBB,1000PM,0700AM,BLUE,0,2019,-,0,,,,,
1447680133,JDR8554,NY,PAS,07/11/2019,14,SDN,BMW,X,12310,17310,10555,20210208,13,13,839,111169,839,0,0412AM,,NY,F,283.0,AVE C,,0,408,D,,BBBBBBB,1000PM,0700AM,SILVE,0,2013,-,0,,,,,
1447680194,KWX4460,PA,PAS,07/09/2019,14,SUBN,TOYOT,X,17270,12110,12210,0,13,13,839,111169,839,0,0430AM,0010A,NY,F,521.0,E 14 STREET,,0,408,C,,BBBBBBB,1000PM,0700AM,BLUE,0,0,-,0,,,,,
1447315157,HWV8487,NY,PAS,06/30/2019,14,SUBN,NISSA,P,83630,90980,53730,20200201,60,60,60,959840,60,0,0430PM,,K,O,1005.0,SURF AVE,,0,408,C,,BBBBBBB,1200AM,1159PM,,0,0,-,0,,,,,
1447316198,JEB6550,NY,PAS,07/02/2019,98,SUBN,KIA,P,36780,14080,14130,20201022,63,63,63,963432,63,0,0734PM,,K,F,1162.0,E 43 ST,,0,408,J3,,BBBBBBB,1200AM,1159PM,,0,0,-,0,,,,,
1447320359,KMJ1971,PA,PAS,07/02/2019,98,SUBN,CHRYS,P,54050,14080,14030,20190930,63,63,63,957004,63,0,0948PM,,K,F,4539.0,KINGS HWY,,0,408,F2,,BBBBBBB,1200AM,1159PM,RED,0,2006,-,0,,,,,
1447336719,GBD9462,NY,PAS,06/29/2019,19,SUBN,HONDA,P,22680,75910,22580,20200806,60,60,60,961254,60,0,0900PM,,K,F,1117.0,BRIGHTON BEACH AVENU,,0,408,F1,,BBBBBBB,1200AM,1159PM,GY,0,2016,-,0,,,,,
1422683254,IBY3257,PA,PAS,07/02/2019,21,,DODGE,P,43520,26550,76950,20210418,41,41,41,963629,41,0,0830AM,0830A,,F,970.0,KELLY ST,,20190702,408,F1,,BYBBYBB,,1030AM,,0,0,-,0,,,,,


### Combining columns
#### Tallying corner parking violations

- The `parking_violation` table has two columns (`street_name` and `intersecting_street`) with New York City streets. 
- When the values for both columns are not `NULL`, this indicates that the violation occurred on a corner where two streets intersect. 
- **Tasks:**
    - identify street `corner`s that tend to be the location of frequent parking violations and 
    - identify the `total number` of violations at each corner.

In [173]:
%%sql
SELECT 
street_name,
intersecting_street, corner
FROM
    (SELECT
    street_name,
    intersecting_street,

            street_name || ' & ' || intersecting_street AS corner
    FROM
            parking_violation) sub
WHERE corner is NOT NULL;

SELECT street_name,
intersecting_street
FROM parking_violation
WHERE street_name is NULL OR intersecting_street IS NULL;

SELECT count(street_name)
FROM parking_violation
WHERE NOT (street_name is NULL OR intersecting_street IS NULL);

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
568 rows affected.
4432 rows affected.
1 rows affected.


count
568


In [174]:
%%sql
SELECT
    corner,
    count(*) as total_number
FROM (
    SELECT
        street_name || ' & ' || intersecting_street AS corner
    FROM
        parking_violation
    ) sub
WHERE corner IS NOT NULL
GROUP BY corner
HAVING count(*) > 2
ORDER BY total_number DESC
;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
14 rows affected.


corner,total_number
RIVERBANK STATE PARK & LOWER LEVEL,10
FR CAPODANNO BLVD & SAND LN,7
JACOB RIIS PARK & EAST PARKING LOT,6
ROCKAWAY BEACH & SHORE FRONT PKWY,5
BRUCKNER BLVD & WILKINSON AVE,5
CROTONA & CLAREMONT PKWY,5
CLAREMONT PKWY & CROTONA AVE,4
SURF AVE & W 15TH ST,4
N/S JEROME AVE & ANDERSON AVE,4
RIVERBANK STATE PARK & UPPER LEVEL,3


#### TIMESTAMP with concatenation

- Identifying overnight violations through the creation of the `violation_datetime` column populated with `TIMESTAMP` values. 
- Concatenating `issue_date` and `violation_time` and converting the resulting strings to `TIMESTAMP` values.



In [175]:
%%sql

SELECT
    issue_date,
    violation_time,
-- Concatenate issue_date and violation_time columns
    CONCAT(issue_date, ' ', violation_time) AS violation_datetime
FROM
  parking_violation
LIMIT 20;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
20 rows affected.


issue_date,violation_time,violation_datetime
05/03/2019,1022PM,05/03/2019 1022PM
07/13/2019,0350AM,07/13/2019 0350AM
07/11/2019,0440AM,07/11/2019 0440AM
07/11/2019,0412AM,07/11/2019 0412AM
07/09/2019,0430AM,07/09/2019 0430AM
06/30/2019,0430PM,06/30/2019 0430PM
07/02/2019,0734PM,07/02/2019 0734PM
07/02/2019,0948PM,07/02/2019 0948PM
06/29/2019,0900PM,06/29/2019 0900PM
07/02/2019,0830AM,07/02/2019 0830AM


In [176]:
%%sql
SELECT
    issue_date,
    violation_time,
  -- Convert violation_time to TIMESTAMP
  TO_TIMESTAMP(violation_datetime, 'MM/DD/YYYY HH12MIPM') AS violation_datetime
FROM (
  SELECT
    issue_date,
    violation_time,
    CONCAT(issue_date, ' ', violation_time) AS violation_datetime
  FROM
    parking_violation
) sub
LIMIT 10;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


issue_date,violation_time,violation_datetime
05/03/2019,1022PM,2019-05-03 22:22:00+02:00
07/13/2019,0350AM,2019-07-13 03:50:00+02:00
07/11/2019,0440AM,2019-07-11 04:40:00+02:00
07/11/2019,0412AM,2019-07-11 04:12:00+02:00
07/09/2019,0430AM,2019-07-09 04:30:00+02:00
06/30/2019,0430PM,2019-06-30 16:30:00+02:00
07/02/2019,0734PM,2019-07-02 19:34:00+02:00
07/02/2019,0948PM,2019-07-02 21:48:00+02:00
06/29/2019,0900PM,2019-06-29 21:00:00+02:00
07/02/2019,0830AM,2019-07-02 08:30:00+02:00


### Spliting column data
#### Extracting time units

In [177]:
%%sql
SELECT
    violation_time,
    SUBSTRING(violation_time FROM 1 FOR 2) AS hour,
    -- Define minute column
    SUBSTRING(violation_time FROM 3 FOR 2) AS minute
FROM
  parking_violation
    LIMIT 10;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


violation_time,hour,minute
1022PM,10,22
0350AM,3,50
0440AM,4,40
0412AM,4,12
0430AM,4,30
0430PM,4,30
0734PM,7,34
0948PM,9,48
0900PM,9,0
0830AM,8,30


#### Extracting house numbers from a string

`house_number` = `cross street`_ `house_number`
> new_house_number = `house_number`

In [178]:
%%sql
SELECT
        house_number,
        SUBSTRING(
            house_number
            FROM STRPOS(house_number, '-') + 1
            FOR LENGTH(house_number) - STRPOS(house_number, '-')
            ) AS new_house_number,
        street_name
FROM  parking_violation
WHERE house_number LIKE '%-%'
    LIMIT 10;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


house_number,new_house_number,street_name
32-11,11,98 ST
145-43,43,157 ST
107-18,18,WALTHAM ST
16-12,12,ELIZABETH ST
90-38,38,179 PL
145-59,59,3RD AVE
116-28,28,QUEENS BLVD
145-32,32,3RD AVE
145-52,52,3 AVE
129-10,10,HILLSIDE AVE


#### Splitting data with delimiters


In [179]:
%%sql
SELECT
  -- Split house_number using '-' as the delimiter
  SPLIT_PART(house_number, '-', 2) AS new_house_number,
violation_county
FROM
  parking_violation
WHERE
  violation_county = 'Q'
    LIMIT 10;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


new_house_number,violation_county
11.0,Q
43.0,Q
18.0,Q
38.0,Q
59.0,Q
,Q
,Q
,Q
28.0,Q
32.0,Q


In [180]:
%%sql

SELECT
  -- Label daily parking restrictions for locations by day
  ROW_NUMBER() OVER(
    PARTITION BY
        house_number, 
    street_name, violation_county
    ORDER BY
        1, 7
  ) AS day_number,
  *
FROM (
  SELECT
    house_number, 
    street_name,
    violation_county,
    REGEXP_SPLIT_TO_TABLE(days_parking_in_effect, '') AS daily_parking_restriction
  FROM
    parking_violation
) sub
LIMIT 10;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


day_number,house_number,street_name,violation_county,daily_parking_restriction
1,348,LINDEN RD,K,B
2,348,LINDEN RD,K,B
3,348,LINDEN RD,K,B
4,348,LINDEN RD,K,B
5,348,LINDEN RD,K,B
6,348,LINDEN RD,K,B
7,348,LINDEN RD,K,B
1,5,ALICE COURT,K,B
2,5,ALICE COURT,K,B
3,5,ALICE COURT,K,B


### Creating pivot table
#### Selecting data for a pivot table

`issuing_agency`: agencies responsible for different types of parking violations including `Police Department` (`P`), `Department of Sanitation` (`S`), `Parks Department` (`K`), and `Department of Transportation` (`V`)

In [181]:
%%sql
SELECT 
    -- Include the violation code in results
    violation_code, 
    -- Include the issuing agency in results
    issuing_agency, 
    -- Total violation code and issuing agency pairs
    COUNT(issuing_agency) 
FROM 
    parking_violation 
WHERE 
    -- Restrict the results to the agencies of interest
    issuing_agency IN ('P', 'S', 'K', 'V') 
GROUP BY 
    -- Define GROUP BY columns to ensure correct pair count
    violation_code, issuing_agency
ORDER BY 
    violation_code, issuing_agency
LIMIT 10;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


violation_code,issuing_agency,count
10,P,3
11,P,1
13,K,1
14,K,4
14,P,485
14,S,4
16,P,10
17,P,69
17,S,8
18,P,9


#### Using FILTER to create a pivot table

In [182]:
%%sql
SELECT 
    violation_code, 
    -- Define the "Police" column
    count(violation_code) FILTER (WHERE issuing_agency = 'P') AS "Police",
    -- Define the "Sanitation" column
    count(violation_code) FILTER (WHERE issuing_agency = 'S') AS "Sanitation",
    -- Define the "Parks" column
    count(violation_code) FILTER (WHERE issuing_agency = 'K') AS "Parks",
    -- Define the "Transportation" column
    count(violation_code) FILTER (WHERE issuing_agency = 'V') AS "Transportation"
FROM parking_violation 
GROUP BY violation_code
ORDER BY violation_code

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
59 rows affected.


violation_code,Police,Sanitation,Parks,Transportation
10,3,0,0,0
11,1,0,0,0
13,0,0,1,0
14,485,4,4,0
16,10,0,0,0
17,69,8,0,0
18,9,0,0,0
19,203,4,1,0
20,387,8,17,0
21,42,271,0,0


### Aggregating film categories


In [183]:
%%sql
SELECT column_name 
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME='film_permit'
LIMIT 10;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


column_name
event_id
event_type
start_date_time
end_date_time
entered_on
event_agency
parking_held
borough
community_board
police_precinct


In [184]:
%%sql
SELECT 
event_id      
,event_type
,start_date_time
,end_date_time
,entered_on
,event_agency
--,parking_held
,borough
,community_board
,police_precinct

FROM film_permit
LIMIT 10;


 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
10 rows affected.


event_id,event_type,start_date_time,end_date_time,entered_on,event_agency,borough,community_board,police_precinct
446040,Shooting Permit,10/19/2018 02:00:00 PM,10/20/2018 04:00:00 AM,10/16/2018 11:57:27 AM,"Mayor's Office of Film, Theatre & Broadcasting",Manhattan,2,1
446168,Shooting Permit,10/19/2018 02:00:00 PM,10/20/2018 02:00:00 AM,10/16/2018 07:03:56 PM,"Mayor's Office of Film, Theatre & Broadcasting",Manhattan,"12, 8","34, 50"
186438,Shooting Permit,10/30/2014 07:00:00 AM,10/31/2014 02:00:00 AM,10/27/2014 12:14:15 PM,"Mayor's Office of Film, Theatre & Broadcasting",Queens,"2, 5","104, 108"
445255,Shooting Permit,10/20/2018 07:00:00 AM,10/20/2018 06:00:00 PM,10/09/2018 09:34:58 PM,"Mayor's Office of Film, Theatre & Broadcasting",Brooklyn,2,84
128794,Theater Load in and Load Outs,11/16/2013 12:01:00 AM,11/17/2013 06:00:00 AM,11/07/2013 03:48:28 PM,"Mayor's Office of Film, Theatre & Broadcasting",Manhattan,"4, 5",14
43547,Shooting Permit,01/10/2012 07:00:00 AM,01/10/2012 07:00:00 PM,01/04/2012 12:25:37 PM,"Mayor's Office of Film, Theatre & Broadcasting",Brooklyn,"1, 2","108, 94"
66846,Shooting Permit,07/27/2012 07:00:00 AM,07/27/2012 07:00:00 PM,07/23/2012 07:26:03 PM,"Mayor's Office of Film, Theatre & Broadcasting",Brooklyn,6,78
104342,Shooting Permit,06/21/2013 12:00:00 AM,06/21/2013 09:00:00 AM,05/27/2013 06:22:39 AM,"Mayor's Office of Film, Theatre & Broadcasting",Manhattan,5,14
244863,Shooting Permit,09/16/2015 07:00:00 AM,09/16/2015 09:00:00 PM,09/10/2015 10:26:27 PM,"Mayor's Office of Film, Theatre & Broadcasting",Bronx,11,49
446379,Shooting Permit,10/20/2018 06:00:00 AM,10/20/2018 10:00:00 PM,10/18/2018 11:01:48 AM,"Mayor's Office of Film, Theatre & Broadcasting",Manhattan,2,6


In [185]:
%%sql
CREATE OR REPLACE TEMP VIEW cb_categories AS 
SELECT

    REGEXP_SPLIT_TO_TABLE(community_board, ', ') AS community_board, category
FROM film_permit
WHERE category IN ('Film', 'Television', 'Documentary')
AND community_board !='N/A';


SELECT
    community_board::INT,
    -- Define pivot table columns
    COUNT(category) FILTER(where category = 'Film') AS "Film",
    COUNT(category) FILTER(where category = 'Television') AS "Television",
    COUNT(category) FILTER(where category = 'Documentary') AS "Documentary"
FROM cb_categories

GROUP BY community_board
ORDER BY community_board;

 * postgresql://postgres:***@localhost:5432/NYC_Open_Data
Done.
28 rows affected.


community_board,Film,Television,Documentary
0,0,1,0
1,166,899,5
2,157,656,4
3,90,152,3
4,84,281,3
5,131,505,5
6,93,188,2
7,76,110,0
8,55,103,2
9,64,110,1
