In [1]:
import pandas as pd
import numpy as np
import traceback

from datetime import datetime

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col, udf, array, count


OBJECTIVES:

In this challenge, we would like to gather statistics on the number of parking violations (tickets) per street
segment in NYC over the past 5 years. In particular, for each street segment in NYC, we would like to have the
following:
1. The total number of parking violations for each year from 2015 to 2019.
2. The rate that the total number of violations change over the years using Ordinary Least Squares.

The street address is provided through the House Number; Street Name; and Violation County field.
For the parking violations data set, the Issue Date field should be used to determine which year a violationbelongs to.

In [2]:
sc = SparkContext()
spark = SparkSession(sc)

In [3]:
streets = "nyc_cscl.csv"
violations = "nyc_parking_violation/*.csv"

In [4]:
# streets = "hdfs:///tmp/bdm/nyc_cscl.csv"
# violations = "hdfs:///tmp/bdm/nyc_parking_violations/"

In [5]:
def to_upper(string):
    if string is None:
        return None
    return string.strip().upper()

def get_county_code(county):
    if county is not None:
        # Boro codes: 1 = MN, 2 = BX, 3 = BK, 4 = QN, 5 = SI
        if county.startswith("M") or county.startswith("N"):
            return 1
        if county in ['BRONX', 'BX', 'PBX']:
            return 2
        if county in ['BK', 'K', 'KING', 'KINGS']:
            return 3
        if county.startswith('Q'):
            return 4
        if county == 'R' or county == 'ST':
            return 5
    return -1

def get_year(string): 
    data_val = datetime.strptime(string.strip(), '%m/%d/%Y')    
    return data_val.year

def get_house_number(house_val):
    if house_val is None:
        return None
    if type(house_val) is int:
        return house_val
    elems = house_val.split("-")
    new_val = "".join(elems)
    if new_val.isdigit():
        return int(new_val)
    else:
        return None
    
def get_street_number(street_val):
    if street_val is None:
        return 0
    if type(street_val) is int:
        return street_val
    elems = street_val.split("-")
    new_val = "".join(elems)
    if new_val.isdigit():
        return int(new_val)
    else:
        return 0


get_street_number_udf = udf(get_street_number)
get_house_number_udf = udf(get_house_number)
get_county_code_udf = udf(get_county_code)
get_year_udf = udf(get_year)
to_upper_udf = udf(to_upper)

In [6]:
violations_df = spark.read.csv(violations,header=True, inferSchema=True)

violations_df = violations_df.select("Violation County", "House Number", "Street Name", "Issue Date")

violations_df = violations_df.filter((violations_df['Violation County'].isNotNull()) 
                                     & (violations_df['House Number'].isNotNull()) 
                                     & (violations_df['Street Name'].isNotNull()) 
                                     & (violations_df['Issue Date'].isNotNull())
                                    )

violations_df = violations_df.withColumn('Violation County', get_county_code_udf(violations_df['Violation County']))
violations_df = violations_df.withColumn('House Number', get_street_number_udf(violations_df['House Number']))
violations_df = violations_df.withColumn('Street Name', to_upper_udf(violations_df['Street Name']))
violations_df = violations_df.withColumn('Issue Date', get_year_udf(violations_df['Issue Date']))

violations_df = violations_df.withColumnRenamed("Violation County","COUNTY")
violations_df = violations_df.withColumnRenamed("House Number","HOUSENUM")
violations_df = violations_df.withColumnRenamed("Street Name","STREETNAME")
violations_df = violations_df.withColumnRenamed("Issue Date","YEAR")

violations_df = violations_df.where(violations_df.YEAR.isin(list(range(2015,2020))))

In [7]:
streets_df = spark.read.csv(streets,header=True, inferSchema=True)

streets_df = streets_df.select("PHYSICALID","BOROCODE", "FULL_STREE", "ST_LABEL","L_LOW_HN", "L_HIGH_HN", 
                               "R_LOW_HN", "R_HIGH_HN")

streets_df = streets_df.withColumn('FULL_STREE', to_upper_udf(streets_df['FULL_STREE']))
streets_df = streets_df.withColumn('ST_LABEL',   to_upper_udf(streets_df['ST_LABEL']))
streets_df = streets_df.withColumn('L_LOW_HN',  get_street_number_udf(streets_df['L_LOW_HN']))
streets_df = streets_df.withColumn('L_HIGH_HN', get_street_number_udf(streets_df['L_HIGH_HN']))
streets_df = streets_df.withColumn('R_LOW_HN',  get_street_number_udf(streets_df['R_LOW_HN']))
streets_df = streets_df.withColumn('R_HIGH_HN', get_street_number_udf(streets_df['R_HIGH_HN']))

streets_df = streets_df.withColumnRenamed("L_LOW_HN","OddLo")
streets_df = streets_df.withColumnRenamed("L_HIGH_HN","OddHi")
streets_df = streets_df.withColumnRenamed("R_LOW_HN","EvenLo")
streets_df = streets_df.withColumnRenamed("R_HIGH_HN","EvenHi")

In [8]:
violations_simp = pd.DataFrame(violations_df.head(5), columns=violations_df.columns)
violations_simp

Unnamed: 0,COUNTY,HOUSENUM,STREETNAME,YEAR
0,1,158,8TH AVE,2015
1,1,10,E 29TH ST,2015
2,3,0,REMSEN ST,2015
3,2,115,W 172ND ST,2015
4,1,350,W 58TH ST,2015


In [9]:
streets_simp = pd.DataFrame(streets_df.head(5), columns=streets_df.columns)
streets_simp

Unnamed: 0,PHYSICALID,BOROCODE,FULL_STREE,ST_LABEL,OddLo,OddHi,EvenLo,EvenHi
0,164809,2,MITSUBISHI WILD WETLAND TRL,MITSUBISHI WILD WETLAND TRL,0,0,0,0
1,6110,4,28 AVE,28 AV,215001,215027,215000,215026
2,145494,3,SCHERMERHORN ST,SCHERMERHORN ST,317,399,316,360
3,61140,2,ARLINGTON AVE,ARLINGTON AV,5631,5699,5602,5698
4,12438,4,QUEENS BLVD,QUEENS BLVD,120011,120011,0,0


In [10]:
streets_df = streets_df.alias('s')
violations_df = violations_df.alias('v')

In [11]:
merged_df = (
    streets_df.join(
        violations_df,
        ((col("s.BOROCODE") == col("v.County")) &
        (
            (col("s.FULL_STREE") == col("v.STREETNAME")) | 
            (col("s.ST_LABEL") == col("v.STREETNAME"))
        ) &
        (
            ((col("v.HOUSENUM") % 2 == 0)  & (col("v.HOUSENUM") >= col("s.EvenLo")) & (col("v.HOUSENUM") <= col("s.EvenHi"))) |  
            ((col("v.HOUSENUM") % 2 == 1)  & (col("v.HOUSENUM") >= col("s.OddLo"))  & (col("v.HOUSENUM") <= col("s.OddHi")))
        )
    ), how='left')
).select(col("s.PHYSICALID"),col("v.YEAR"))


In [12]:
merged_df = merged_df.alias('m')
merged_df = merged_df.groupBy("m.PHYSICALID", "m.YEAR").agg(count("*").alias("YEAR_COUNT"))

In [13]:
merged_df.createOrReplaceTempView("merged_results")

In [14]:
summaries = spark.sql(
    "select m.PHYSICALID, " +
    "MAX(CASE WHEN (YEAR = 2015) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2015, " +
    "MAX(CASE WHEN (YEAR = 2016) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2016, " +
    "MAX(CASE WHEN (YEAR = 2017) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2017, " +
    "MAX(CASE WHEN (YEAR = 2018) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2018, " +
    "MAX(CASE WHEN (YEAR = 2019) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2019  " +
    "from merged_results m  " +
    "group by m.PHYSICALID " +
    "order by m.PHYSICALID "
)

In [15]:
def getOLS(values):
    import statsmodels.api as sm
    X = sm.add_constant(np.arange(len(values)))
    fit = sm.OLS(values, X).fit()
    coef = fit.params[0]
    return float(coef)

getOLS_udf = udf(getOLS)

summaries = summaries.withColumn('OLS_COEF', 
                getOLS_udf(array('COUNT_2015', 'COUNT_2016', 'COUNT_2017', 'COUNT_2018', 'COUNT_2019')))


In [16]:
summaries.show()

+----------+----------+----------+----------+----------+----------+--------+
|PHYSICALID|COUNT_2015|COUNT_2016|COUNT_2017|COUNT_2018|COUNT_2019|OLS_COEF|
+----------+----------+----------+----------+----------+----------+--------+
|         3|         0|         0|         0|         0|         0|     0.0|
|         5|         0|         0|         0|         0|         0|     0.0|
|         6|         0|         0|         0|         0|         0|     0.0|
|         8|         0|         0|         0|         0|         0|     0.0|
|        14|         0|         0|         0|         0|         0|     0.0|
|        23|         0|         0|         0|         0|         0|     0.0|
|        24|         0|         0|         0|         0|         0|     0.0|
|        25|         0|         0|         0|         0|         0|     0.0|
|        29|         5|         0|         0|         0|         0|     3.0|
|        30|         5|         0|         0|         0|         0|     3.0|

In [17]:
# summaries.write.csv('TODO', header=False)