<h1 align="center">Setting Up</h1>

In [1]:
#set up; import necessary packages/libraries
import os
#!pip install sodapy
from sodapy import Socrata
import pandas as pd
from datetime import datetime
import calendar
import warnings
warnings.filterwarnings("ignore")

#!pip install --upgrade google-cloud-storage
#!pip install --upgrade google-cloud-bigquery
#!pip install pandas-gbq
from google.cloud import bigquery
from pandas.io import gbq


#make sure to set environment var for GOOGLE_APPLICATION_CREDENTIALS
#https://cloud.google.com/bigquery/docs/quickstarts/quickstart-client-libraries
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CREDENTIALS

<h1 align="center">Extract & Transform for Collisions Dataset</h1>

## Extracting - Collisions Data

In [2]:
#connection settings for the NYC Open Data (Socrata) API
domain = "data.cityofnewyork.us"
collisions_data_id = "h9gi-nx95"
#read the app token from the SOCRATA_APP_TOKEN environment variable so the secret
#does not have to be stored in the notebook; the original placeholder is kept as
#the fallback so the cell behaves as before when the variable is not set
token = os.environ.get("SOCRATA_APP_TOKEN", "TOKEN")

#set up a connection using the credentials
client_collisions = Socrata(domain,token)

In [3]:
#fetch the dataset metadata to confirm we are pulling the right dataset
#and to get familiar with what the dataset contains
metadata_collisions = client_collisions.get_metadata(collisions_data_id)

#metadata_collisions #<- uncomment to view the metadata

In [4]:
#query applied when pulling the data from the api
#filter to only pull records with a crash_date after Dec 31, 2018; i.e. start from 2019
collisions_query = """
    select collision_id,
        crash_date, 
        crash_time, 
        borough, 
        zip_code,
        number_of_persons_injured, number_of_persons_killed,
        number_of_pedestrians_injured, number_of_pedestrians_killed,
        number_of_cyclist_injured, number_of_cyclist_killed,
        number_of_motorist_injured, number_of_motorist_killed,
        contributing_factor_vehicle_1, contributing_factor_vehicle_2, contributing_factor_vehicle_3,
        vehicle_type_code1, vehicle_type_code2, vehicle_type_code_3
    where crash_date > '2018-12-31'
    limit 100000000
"""

In [5]:
#pull the collisions data from the API
#the query defined above is passed along with the request, so the column selection and filtering come from that query
pull_collisions_data = client_collisions.get(collisions_data_id, query = collisions_query)

In [6]:
#create a dataframe from the collisions data pulled from the API
#NOTE(review): values are presumably still strings as returned by the API at this point — confirm before relying on numeric dtypes
collisions_df = pd.DataFrame(pull_collisions_data)

In [7]:
#swap every missing (NaN) value in the collisions dataframe for the label "Unspecified"
collisions_df = collisions_df.fillna("Unspecified")

## Transforming - Collisions Data

In [9]:
#make sure crash_date is a plain string so it can be concatenated with crash_time
collisions_df["crash_date"] = collisions_df["crash_date"].astype(str)

#build one "<crash_date> <crash_time>" string per row and parse it into the date_time column
#NOTE(review): if crash_date already carries a time part, the combined text holds two time parts — confirm pd.to_datetime parses it as intended
crash_timestamp_text = collisions_df["crash_date"] + " " + collisions_df["crash_time"]
collisions_df["date_time"] = pd.to_datetime(crash_timestamp_text)

In [11]:
#derive the year, month, day and week columns from the parsed date_time column
crash_moment = collisions_df["date_time"].dt

collisions_df["year"] = crash_moment.year

#month is stored as its abbreviated name (looked up in calendar.month_abbr) rather than its number
collisions_df["month"] = crash_moment.month.apply(lambda month_number: calendar.month_abbr[month_number])

collisions_df["day"] = crash_moment.day

#isocalendar() yields week as UInt32; cast to int because the UInt32 dtype causes issues with pyarrow
#NOTE(review): this is the ISO week number, so dates near New Year can belong to a week of the adjacent ISO year while "year" stays the calendar year — confirm this is intended
collisions_df["week"] = crash_moment.isocalendar().week.astype(int)

In [13]:
#helper that labels a timestamp as falling in the AM or PM half of the day
def period_converter(x):
    """Return "AM" when ``x.hour`` is below 12, otherwise "PM".

    x: any object exposing an integer ``hour`` attribute (e.g. a datetime or pandas Timestamp).
    """
    return "AM" if x.hour < 12 else "PM"

#create the period column by running period_converter on every value of the date_time column
collisions_df["period"] = collisions_df["date_time"].apply(period_converter)

In [14]:
#mapping of API column names -> the shorter, consistently formatted names used from here on
col_rename = {
    #casualty counts: drop the "number_of_" prefix and pluralise cyclist/motorist
    "number_of_persons_injured": "persons_injured",
    "number_of_persons_killed": "persons_killed",
    "number_of_pedestrians_injured": "pedestrians_injured",
    "number_of_pedestrians_killed": "pedestrians_killed",
    "number_of_cyclist_injured": "cyclists_injured",
    "number_of_cyclist_killed": "cyclists_killed",
    "number_of_motorist_injured": "motorists_injured",
    "number_of_motorist_killed": "motorists_killed",
    #vehicle type / contributing factor: remove the underscore before the trailing number
    "vehicle_type_code_3": "vehicle_type_code3",
    "contributing_factor_vehicle_1": "contributing_factor_vehicle1",
    "contributing_factor_vehicle_2": "contributing_factor_vehicle2",
    "contributing_factor_vehicle_3": "contributing_factor_vehicle3",
}

#apply the new column names to the collisions dataframe
collisions_df = collisions_df.rename(columns=col_rename)

In [15]:
#desired left-to-right column order: id, date/time parts, location, causes, vehicles, casualties
collisions_column_order = [
    "collision_id",
    "crash_date", "crash_time", "date_time",
    "year", "month", "week", "day", "period",
    "borough", "zip_code",
    "contributing_factor_vehicle1", "contributing_factor_vehicle2", "contributing_factor_vehicle3",
    "vehicle_type_code1", "vehicle_type_code2", "vehicle_type_code3",
    "persons_injured", "persons_killed",
    "pedestrians_injured", "pedestrians_killed",
    "cyclists_injured", "cyclists_killed",
    "motorists_injured", "motorists_killed",
]

#reorder the dataframe columns to match the list above
collisions_df = collisions_df.reindex(columns=collisions_column_order)

### Creating Dimension Tables (Collisions/Crashes)

In [17]:
#Creating dimension table ids

#the row index values of collisions_df supply the numerical portion of the dimension table ids
index_nums_collisions = collisions_df.index.tolist()

#helper for generating the id column of a dim table
def dim_id_generator(df,denoter,id_col_name,index_list):
    """Add an id column to a dimension table dataframe and return that dataframe.

    df: dim table dataframe; it is modified in place (the id column is added to it).
    denoter: letter portion of the id, placed in front of the number.
    id_col_name: name of the id column (converted to str).
    index_list: index numbers, one per row of df; each id uses the index number + 1.
    """
    df[str(id_col_name)] = [f"{denoter}{position + 1}" for position in index_list]
    return df

In [18]:
#each dim table is an explicit .copy() of the selected columns: later cells add an id column
#and reorder columns in place, and doing that on a plain column selection of collisions_df
#triggers pandas' SettingWithCopyWarning

#create date dim table
dim_crashdate = collisions_df[["date_time","year", "month", "week", "day", "period"]].copy()

#create location dim table
dim_crashlocation = collisions_df[["borough", "zip_code"]].copy()

#create contributing factor dim table
dim_contributingfactor = collisions_df[["contributing_factor_vehicle1", "contributing_factor_vehicle2",
                                       "contributing_factor_vehicle3",
                                       "vehicle_type_code1", "vehicle_type_code2","vehicle_type_code3"]].copy()
#create casualties dim table
dim_casualties = collisions_df[["persons_killed","persons_injured",
                                "pedestrians_killed","pedestrians_injured",
                                "cyclists_killed", "cyclists_injured",
                                "motorists_killed", "motorists_injured"]].copy()

In [19]:
#attach an id column to every collisions dim table (dim_id_generator modifies each table in place)
for dim_frame, id_prefix, id_column in [
    (dim_casualties, "CA", "dim_casualties_id"),
    (dim_contributingfactor, "CF", "dim_contributingfactor_id"),
    (dim_crashdate, "CD", "dim_crashdate_id"),
    (dim_crashlocation, "CL", "dim_crashlocation_id"),
]:
    dim_id_generator(dim_frame, id_prefix, id_column, index_nums_collisions)

In [20]:
#collect the collisions dim table dataframes in one list
#NOTE(review): this list is not referenced by any other cell visible here — confirm it is still needed
dim_tables_list = [dim_crashdate,dim_crashlocation,dim_contributingfactor, dim_casualties]

#helper for moving the id column of a dim table dataframe to the front
def column_arranger(dim_table, id_col_name):
    """Move the column named ``id_col_name`` to position 0 and return the dataframe.

    dim_table: dim table dataframe; it is modified in place.
    id_col_name: column header name of the id column.
    """
    dim_table.insert(loc=0, column=id_col_name, value=dim_table.pop(id_col_name))
    return dim_table

In [21]:
#move the id column to the front of every collisions dim table (tables are modified in place)
for dim_frame, id_column in [
    (dim_crashdate, "dim_crashdate_id"),
    (dim_crashlocation, "dim_crashlocation_id"),
    (dim_contributingfactor, "dim_contributingfactor_id"),
    (dim_casualties, "dim_casualties_id"),
]:
    column_arranger(dim_frame, id_column)


<h1 align="center">Extract & Transform for 311 Traffic Lights Complaints</h1>

## Extracting - 311 Traffic Lights Complaints Data

In [22]:
#connection settings for the NYC Open Data (Socrata) API
domain = "data.cityofnewyork.us"
complaints311_id = "erm2-nwe9"
#read the app token from the SOCRATA_APP_TOKEN environment variable so the secret
#does not have to be stored in the notebook; the original placeholder is kept as
#the fallback so the cell behaves as before when the variable is not set
token = os.environ.get("SOCRATA_APP_TOKEN", "TOKEN")

#set up a connection using the credentials
client_complaints311 = Socrata(domain,token)

In [23]:
#query: pull all records created from the start of 2019 onward where the complaint was about street/traffic lights
#the two complaint types are grouped in parentheses: AND binds tighter than OR, so without them
#every "Traffic Light Condition" record would be returned regardless of its created_date
#created_date carries a time of day, so the cutoff is written as >= midnight Jan 1, 2019
#(a bare > '2018-12-31' would also match complaints created during Dec 31, 2018)
trafficlights311_query = """
    select unique_key, complaint_type, descriptor, address_type,
        borough, incident_zip,
        created_date
    where created_date >= '2019-01-01T00:00:00'
    and (complaint_type = 'Street Light Condition'
        or complaint_type = 'Traffic Light Condition')
    limit 100000000
"""

In [24]:
#pull the traffic lights 311 complaints from the API
#the query defined above is passed along with the request, so the column selection and filtering come from that query
pull_trafficlights311_data = client_complaints311.get(complaints311_id, query = trafficlights311_query)

In [25]:
#create dataframe of the trafficlights 311 data pulled from API
#(NaN values are not replaced here; that is done in the transforming step)
trlights311_df = pd.DataFrame(pull_trafficlights311_data)

## Transforming - 311 Traffic Lights Complaints Data

In [27]:
#swap every missing (NaN) value in the 311 dataframe for the label "Unspecified"
trlights311_df = trlights311_df.fillna("Unspecified")

In [30]:
#convert created_date col to datetime
#infer_datetime_format is no longer passed: pandas deprecated the argument in 2.0, where
#format inference became the default, so it now only produces a deprecation warning
trlights311_df["created_date"] = pd.to_datetime(trlights311_df["created_date"])

In [31]:
#derive the year, month, week and day columns from the created_date column
complaint_moment = trlights311_df["created_date"].dt

trlights311_df["year"] = complaint_moment.year

#month is stored as its abbreviated name (looked up in calendar.month_abbr) rather than its number
trlights311_df["month"] = complaint_moment.month.apply(lambda month_number: calendar.month_abbr[month_number])

#isocalendar() yields week as UInt32; cast to int because the UInt32 dtype causes issues with pyarrow
#NOTE(review): this is the ISO week number, so dates near New Year can belong to a week of the adjacent ISO year while "year" stays the calendar year — confirm this is intended
trlights311_df["week"] = complaint_moment.isocalendar().week.astype(int)

trlights311_df["day"] = complaint_moment.day

#create the period column by running period_converter on every value of the created_date column
trlights311_df["period"] = trlights311_df["created_date"].apply(period_converter)

In [32]:
#rename incident_zip -> zip_code and created_date -> complaint_date
trlights311_df = trlights311_df.rename(
    columns={"incident_zip": "zip_code", "created_date": "complaint_date"}
)

### Creating Dimension Tables (311 Traffic Light Complaints)

In [34]:
#each dim table is an explicit .copy() of the selected columns: later cells add an id column
#and reorder columns in place, and doing that on a plain column selection of trlights311_df
#triggers pandas' SettingWithCopyWarning

#create complaint dim table
dim_311complaint = trlights311_df[["complaint_type","descriptor"]].copy()

#create complaint location dim table
dim_311location = trlights311_df[["borough","zip_code"]].copy()

#create complaint date dim table
dim_311date = trlights311_df[["complaint_date","year","month","week","day","period"]].copy()

In [35]:
#the row index values of trlights311_df supply the numerical portion of the dimension table ids
index_nums_trlights311 = trlights311_df.index.tolist()

#attach an id column to every 311 dim table using dim_id_generator (tables are modified in place)
#argument order: dim table dataframe, letter portion of the id, name of the id col, list of index numbers
for dim_frame, id_prefix, id_column in [
    (dim_311complaint, "TC", "dim_311complaint_id"),
    (dim_311location, "TL", "dim_311location_id"),
    (dim_311date, "TD", "dim_311date_id"),
]:
    dim_id_generator(dim_frame, id_prefix, id_column, index_nums_trlights311)

In [36]:
#move the id column to the front of every 311 dim table using column_arranger (tables are modified in place)
#argument order: dim table dataframe, column header name of the id col
for dim_frame, id_column in [
    (dim_311complaint, "dim_311complaint_id"),
    (dim_311date, "dim_311date_id"),
    (dim_311location, "dim_311location_id"),
]:
    column_arranger(dim_frame, id_column)


<h1 align="center">Loading Collisions/Crashes and 311 Traffic Lights Complaints to Google BigQuery</h1>

In [37]:
#instantiate a bigquery client for the cis4400-assignments project
#NOTE(review): this client is not referenced by the other cells visible here (the upload goes through df.to_gbq) — confirm it is still needed
client = bigquery.Client(project='cis4400-assignments')

In [38]:
#function for porting dim_tables to GBQ
def port_table_to_gbq(table_name, df,
                      dataset="cis4400_finalproject",
                      project_id="cis4400-assignments",
                      if_exists="replace"):
    """Upload a dataframe to a Google BigQuery table through ``df.to_gbq``.

    table_name: name of the destination table (str), without the dataset prefix.
    df: dim table dataframe to upload.
    dataset: dataset that holds the table; defaults to the project dataset used by this notebook.
    project_id: Google Cloud project id; defaults to the project used by this notebook.
    if_exists: forwarded to ``to_gbq``; the default "replace" overwrites an existing table.

    Returns whatever ``df.to_gbq`` returns (pandas documents this as None).
    """
    destination_table_path = f"{dataset}.{table_name}"
    return df.to_gbq(destination_table = destination_table_path,
                     project_id = project_id,
                     if_exists = if_exists)

In [39]:
#dictionary mapping each BigQuery table name to the dim table dataframe that is loaded into it
dim_tables_container = dict(
    dim_311complaint=dim_311complaint,
    dim_311location=dim_311location,
    dim_311date=dim_311date,
    dim_crashdate=dim_crashdate,
    dim_crashlocation=dim_crashlocation,
    dim_contributingfactor=dim_contributingfactor,
    dim_casualties=dim_casualties,
)

#upload every dim table to GBQ, one port_table_to_gbq call per table
for tablename in dim_tables_container:
    dimtable = dim_tables_container[tablename]
    port_table_to_gbq(tablename, dimtable)

100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]
