In [1]:
import pandas as pd
import numpy as np
import rtree
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark import sql
import traceback

from datetime import datetime

OBJECTIVES:

In this challenge, we would like to gather statistics on the number of parking violations (tickets) per street
segment in NYC over the past 5 years. In particular, for each street segment in NYC, we would like to have the
following:
1. The total number of parking violations for each year from 2015 to 2019.
2. The rate that the total number of violations change over the years using Ordinary Least Squares.

The street address is provided through the House Number; Street Name; and Violation County field.
For the parking violations data set, the Issue Date field should be used to determine which year a violationbelongs to.

In [4]:
streets = "nyc_cscl.csv"
violations = "nyc_parking_violations_2015_sample.csv"

In [3]:
sc = SparkContext()

In [122]:
sqlContext = sql.SQLContext(sc)

In [145]:
def get_county_code(county):
    # Possible values: NY, (Q, QUEENS, QNS), ST, BK, BX / BX, K, NY, Q, R
    # Boro codes: 1 = MN, 2 = BX, 3 = BK, 4 = QN, 5 = SI
    if county == 'NY':
        return 1
    if county == 'BX':
        return 2
    if county == 'K' or county == 'BK':
        return 3
    if county.startswith('Q'):
        return 4
    if county == 'R' or county == 'ST':
        return 5
    return -1

def process_violations(pid, records):
    import csv
    years = range(2015,2020)
    if pid == 0:
        next(records)
        
    reader = csv.reader(records)
    for row in reader:
        try:
            house_num = row[23].strip()
            street_name = row[24].strip().upper()
            county_code = get_county_code(row[21].strip())
            date_issued = datetime.strptime(row[4].strip(), '%m/%d/%Y')
            
            if date_issued.year in years:
#                 if street_name.startswith('REMSEN'):
                yield ((street_name, county_code, house_num, date_issued.year), 1)
        except:
            print(row)
            print(traceback.format_exc())

def process_streets(pid,records):
    import csv

    if pid == 0:
        next(records)
    
    reader = csv.reader(records)
    for row in reader: 
        physical_id = row[0]
        street_name = row[28].strip().upper()
        boro_code = int(row[13] )
        odd_lo = row[2]
        odd_hi = row[3]
        even_lo = row[4]
        even_hi = row[5]
#         if street_name.startswith('REMSEN ST'):
        yield (street_name, boro_code, odd_lo, odd_hi, even_lo, even_hi , physical_id)


In [146]:
violations_rdd = sc.textFile(violations).mapPartitionsWithIndex(process_violations) \
            .reduceByKey(lambda x,y: x+y) \

violations_rdd.take(5)

[(('REMSEN ST', 3, 'S', 2015), 1),
 (('W 172ND ST', 2, '115', 2015), 1),
 (('W 58TH ST', 1, '350', 2015), 1),
 (('135TH ST', 4, '82-46', 2015), 2),
 (('KING ST', 1, '68', 2015), 1),
 (('BRONXDALE AVE', 2, '1836', 2015), 1),
 (('JAMAICA AVE', 4, '214-47', 2015), 1),
 (('ARTHUR AVE', 2, '2311', 2015), 1),
 (('26TH ST', 4, '22-60', 2015), 1),
 (('HAMPTON AVE', 3, 'N', 2015), 1)]

In [148]:
streets_rdd = sc.textFile(streets).mapPartitionsWithIndex(process_streets) \

streets_rdd.take(5)

[('MITSUBISHI WILD WETLAND TRL', 2, '', '', '', '', '164809'),
 ('28 AVE', 4, '215-001', '215-027', '215-000', '215-026', '6110'),
 ('SCHERMERHORN ST', 3, '317', '399', '316', '360', '145494'),
 ('ARLINGTON AVE', 2, '5631', '5699', '5602', '5698', '61140'),
 ('QUEENS BLVD', 4, '120-011', '120-011', '0', '0', '12438')]

In [149]:
rdd = streets_rdd.join(violations_rdd)
rdd.take(5)

[]

In [156]:
violations_df = sqlContext.read.format("csv") \
  .option("delimiter",",") \
  .option("header", "true") \
  .option("inferSchema", "true") \
  .load(violations)

violations_df = violations_df.select("Violation County", "House Number", "Street Name", "Issue Date")

Unnamed: 0,Violation County,House Number,Street Name,Issue Date
0,NY,158,8th Ave,02/07/2015
1,NY,10,E 29th St,04/29/2015
2,NY,46,William St,10/06/2014
3,K,S,Remsen St,03/24/2015
4,Q,W,161st St,08/16/2014


In [159]:
violations_df.withColumn('BOROCODE', get_county_code(violations_df['Violation County']))

violations_simp = pd.DataFrame(violations_df.head(5), columns=violations_df.columns)
violations_simp

ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.

In [140]:
streets_df = sqlContext.read.format("csv") \
  .option("delimiter",",") \
  .option("header", "true") \
  .option("inferSchema", "true") \
  .load(streets)

streets_df = streets_df.select("PHYSICALID","BOROCODE", "FULL_STREE", "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN")


In [144]:
streets_simp = pd.DataFrame(streets_df.head(3), columns=streets_df.columns)
streets_simp

Unnamed: 0,PHYSICALID,BOROCODE,FULL_STREE,L_LOW_HN,L_HIGH_HN,R_LOW_HN,R_HIGH_HN
0,164809,2,MITSUBISHI WILD WETLAND TRL,,,,
1,6110,4,28 AVE,215-001,215-027,215-000,215-026
2,145494,3,SCHERMERHORN ST,317,399,316,360


In [None]:
df.withColumn('result', example_udf(df.address1, df.address2))
