In [1]:
import pandas as pd
import numpy as np
import string
import traceback
import time

from datetime import datetime
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, array, count
from pyspark.sql.functions import broadcast,coalesce, lit

In [2]:
sc = SparkContext()
spark = SparkSession(sc)

In [57]:
streets = "nyc_cscl.csv"
violations = "nyc_parking_violation/*.csv"

#streets = "hdfs:///tmp/bdm/nyc_cscl.csv"
#violations = "hdfs:///tmp/bdm/nyc_parking_violation/*.csv"

In [160]:
def to_upper(string):
    if string is not None and string != "":
        return string.strip().upper()
    return None

def get_county_code(county):
    if county is not None:
        # Boro codes: 1 = MN, 2 = BX, 3 = BK, 4 = QN, 5 = SI
        if county.startswith("M") or county.startswith("N"):
            return 1
        if county in ['BRONX', 'BX', 'PBX']:
            return 2
        if county in ['BK', 'K', 'KING', 'KINGS']:
            return 3
        if county.startswith('Q'):
            return 4
        if county == 'R' or county == 'ST':
            return 5
    return None

def get_year(date_string):
    if date_string is not None:
        data_val = datetime.strptime(date_string.strip(), '%m/%d/%Y')  
        if data_val.year in list(range(2015,2020)):
            return data_val.year
    return None

def get_street_number(street_val_raw, default=None):
    if street_val_raw is not None:
        if type(street_val_raw) is int:
            return street_val
        
        street_val = street_val_raw.strip(string.ascii_letters)
        elems = street_val.split("-")  
        if len(elems) == 1 and elems[0].isdigit():
            return int(elems[0])
        elif len(elems) == 2 and elems[0].isdigit() and elems[1].isdigit():
            new_val = elems[0] + "{:03d}".format(int(elems[1]))
            if new_val.isdigit():
                return int(new_val)          
        else:
            new_val = "".join(elems)
            if new_val.isdigit():
                return int(new_val)
    return default

def as_digit(val):
    if val:
        return int(val)
    return val

def getOLS(values):
    import statsmodels.api as sm
    X = sm.add_constant([1,2,3,4,5])
    fit = sm.OLS(values, X).fit()
    coef = fit.params[1]
    return float(coef)

In [161]:
def get_violations_df(violations_file, spark):
    get_county_code_udf = udf(get_county_code)
    get_street_number_udf = udf(get_street_number)
    get_year_udf = udf(get_year)
    to_upper_udf = udf(to_upper)
    
    violations_df = spark.read.csv(violations_file, header=True, inferSchema=True)

    violations_df = violations_df.select("Violation County", "House Number", "Street Name", "Issue Date")

    violations_df = violations_df.withColumnRenamed("Violation County","COUNTY")
    violations_df = violations_df.withColumnRenamed("House Number","HOUSENUM")
    violations_df = violations_df.withColumnRenamed("Street Name","STREETNAME")
    violations_df = violations_df.withColumnRenamed("Issue Date","YEAR")
    
    violations_df = violations_df.withColumn('COUNTY', get_county_code_udf(violations_df['COUNTY']))
    violations_df = violations_df.withColumn('HOUSENUM', get_street_number_udf(violations_df['HOUSENUM'], lit(None)))
    violations_df = violations_df.withColumn('STREETNAME', to_upper_udf(violations_df['STREETNAME']))
    violations_df = violations_df.withColumn('YEAR', get_year_udf(violations_df['YEAR']))

    violations_df = violations_df.filter((violations_df['COUNTY'].isNotNull()) 
                                         & (violations_df['HOUSENUM'].isNotNull()) 
                                         & (violations_df['STREETNAME'].isNotNull()) 
                                         & (violations_df['YEAR'].isNotNull())
                                        )
    
    violations_df = violations_df.withColumn("COUNTY", violations_df["COUNTY"].cast("integer"))
    violations_df = violations_df.withColumn("HOUSENUM", violations_df["HOUSENUM"].cast("integer"))
    violations_df = violations_df.withColumn("YEAR", violations_df["YEAR"].cast("integer"))

    violations_df = violations_df.repartition(5,'COUNTY')
    violations_df = violations_df.alias('v')
    return violations_df

In [176]:
def get_streets_df(streets_file, spark):
    get_street_number_udf = udf(get_street_number)
    to_upper_udf = udf(to_upper)
    as_digit_udf = udf(as_digit)
    
    streets_df = spark.read.csv(streets_file, header=True, inferSchema=True)

    streets_df = streets_df.select("PHYSICALID","BOROCODE", "FULL_STREE", "ST_LABEL","L_LOW_HN", "L_HIGH_HN", 
                                   "R_LOW_HN", "R_HIGH_HN")

    streets_df = streets_df.withColumnRenamed("L_LOW_HN","OddLo")
    streets_df = streets_df.withColumnRenamed("L_HIGH_HN","OddHi")
    streets_df = streets_df.withColumnRenamed("R_LOW_HN","EvenLo")
    streets_df = streets_df.withColumnRenamed("R_HIGH_HN","EvenHi")
    
    streets_df = streets_df.filter((streets_df['BOROCODE'].isNotNull()) 
                                   & (streets_df['PHYSICALID'].isNotNull()))
    
    streets_df = streets_df.withColumn('BOROCODE', as_digit_udf(streets_df['BOROCODE']))
    streets_df = streets_df.withColumn('FULL_STREE', to_upper_udf(streets_df['FULL_STREE']))
    streets_df = streets_df.withColumn('ST_LABEL',   to_upper_udf(streets_df['ST_LABEL']))
    streets_df = streets_df.withColumn('OddLo', get_street_number_udf(streets_df['OddLo'], lit(0)))
    streets_df = streets_df.withColumn('OddHi', get_street_number_udf(streets_df['OddHi'], lit(0)))
    streets_df = streets_df.withColumn('EvenLo', get_street_number_udf(streets_df['EvenLo'], lit(0)))
    streets_df = streets_df.withColumn('EvenHi', get_street_number_udf(streets_df['EvenHi'], lit(0)))
    
    streets_df = streets_df.withColumn("BOROCODE", streets_df["BOROCODE"].cast("integer"))
    streets_df = streets_df.withColumn("OddLo", streets_df["OddLo"].cast("integer"))
    streets_df = streets_df.withColumn("OddHi", streets_df["OddHi"].cast("integer"))
    streets_df = streets_df.withColumn("EvenLo", streets_df["EvenLo"].cast("integer"))
    streets_df = streets_df.withColumn("EvenHi", streets_df["EvenHi"].cast("integer"))
    
    streets_df = streets_df.repartition(5, 'BOROCODE')
    streets_df = streets_df.alias('s')
    return streets_df

In [177]:
violations_df = get_violations_df(violations, spark)
streets_df = get_streets_df(streets, spark)

In [178]:
violations_df.count() # 5195, 5104, 4313

4347

In [179]:
streets_df.count()

644780

In [180]:
def mapper(row):
    if row['FULL_STREE'] == row['ST_LABEL']:
        yield ( 
                (row['BOROCODE'], row["FULL_STREE"] ), 
                [( row['EvenLo'],row['EvenHi'],row['OddLo'],row['OddHi'], row['PHYSICALID'] )] 
              ) 
    else:
        yield ( 
                (row['BOROCODE'], row["FULL_STREE"]), 
                [( row['EvenLo'],row['EvenHi'],row['OddLo'],row['OddHi'] ,row['PHYSICALID'] )] 
              ) 
        yield ( 
                (row['BOROCODE'], row["ST_LABEL"]), 
                [( row['EvenLo'],row['EvenHi'],row['OddLo'],row['OddHi'], row['PHYSICALID'] ) ]
              ) 

streets_dict = streets_df.rdd.flatMap(mapper).reduceByKey(lambda x,y: x+y).collectAsMap()
streets_dict_bc = sc.broadcast(streets_dict)
print(streets_df.count())

644780


In [181]:
def get_val(borocode, street, housenum):
    val = streets_dict_bc.value.get( (borocode, street) )
    if val:
        for item in val:
            if housenum % 2 == 0:
                if item[0] <= housenum and housenum <= item[1]:
                    return item[4]
            else:
                if item[2] <= housenum and housenum <= item[3]:
                    return item[4]     
    return None

In [182]:


get_val_udf = udf(get_val)

matched_violations = violations_df.withColumn('PHYSICALID', get_val_udf(violations_df['v.COUNTY'], 
                                                                        violations_df['v.STREETNAME'], 
                                                                        violations_df['v.HOUSENUM']
                                                          ))

unmatched_violations = matched_violations.filter( ~matched_violations['PHYSICALID'].isNotNull() )
print("Violations Unmatched:", unmatched_violations.count())
matched_violations = matched_violations.filter( matched_violations['PHYSICALID'].isNotNull() )
print("Violations Matched:", matched_violations.count())

matched_violations = matched_violations.withColumn("PHYSICALID", matched_violations["PHYSICALID"].cast("integer"))
matched_violations = matched_violations.orderBy("PHYSICALID")
# matched_violations.show(10)
# unmatched_violations.show() 302

Violations Unmatched: 298
Violations Matched: 4049


In [12]:
matched_violations = matched_violations.groupBy("PHYSICALID", "YEAR").agg(count("*").alias("YEAR_COUNT"))
matched_violations.show(10)

+----------+----+----------+
|PHYSICALID|YEAR|YEAR_COUNT|
+----------+----+----------+
|        47|2015|         2|
|        56|2015|         1|
|        61|2015|         2|
|        99|2015|         1|
|       121|2015|         1|
|       128|2015|         3|
|       135|2015|         4|
|       140|2015|         1|
|       141|2015|         1|
|       158|2015|         7|
+----------+----+----------+
only showing top 10 rows



In [13]:
matched_violations.createOrReplaceTempView("matched_violations")

In [14]:
summaries = spark.sql(
    "select PHYSICALID, " +
    "MAX(CASE WHEN (YEAR = 2015) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2015, " +
    "MAX(CASE WHEN (YEAR = 2016) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2016, " +
    "MAX(CASE WHEN (YEAR = 2017) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2017, " +
    "MAX(CASE WHEN (YEAR = 2018) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2018, " +
    "MAX(CASE WHEN (YEAR = 2019) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2019  " +
    "from matched_violations " +
    "group by PHYSICALID " +
    "order by PHYSICALID "
)
summaries.show(10)

+----------+----------+----------+----------+----------+----------+
|PHYSICALID|COUNT_2015|COUNT_2016|COUNT_2017|COUNT_2018|COUNT_2019|
+----------+----------+----------+----------+----------+----------+
|        47|         2|         0|         0|         0|         0|
|        56|         1|         0|         0|         0|         0|
|        61|         2|         0|         0|         0|         0|
|        99|         1|         0|         0|         0|         0|
|       121|         1|         0|         0|         0|         0|
|       128|         3|         0|         0|         0|         0|
|       135|         4|         0|         0|         0|         0|
|       140|         1|         0|         0|         0|         0|
|       141|         1|         0|         0|         0|         0|
|       158|         7|         0|         0|         0|         0|
+----------+----------+----------+----------+----------+----------+
only showing top 10 rows



In [15]:
getOLS_udf = udf(getOLS)
summaries = summaries.withColumn('OLS_COEF', 
                getOLS_udf(array('COUNT_2015', 'COUNT_2016', 'COUNT_2017', 'COUNT_2018', 'COUNT_2019')))


In [16]:
streets_df = streets_df.select(col("s.PHYSICALID")) \
                    .join(summaries, "PHYSICALID", how='left') \
                    .distinct() \
                    .orderBy("PHYSICALID") \

streets_df = streets_df.withColumn("COUNT_2015",coalesce("COUNT_2015", lit(0))) 
streets_df = streets_df.withColumn("COUNT_2016",coalesce("COUNT_2016", lit(0))) 
streets_df = streets_df.withColumn("COUNT_2017",coalesce("COUNT_2017", lit(0))) 
streets_df = streets_df.withColumn("COUNT_2018",coalesce("COUNT_2018", lit(0))) 
streets_df = streets_df.withColumn("COUNT_2019",coalesce("COUNT_2019", lit(0))) 
streets_df = streets_df.withColumn("OLS_COEF",  coalesce("OLS_COEF", lit(0.0))) 

In [17]:
start_time = time.time()
streets_df.show(30)
print("--- %s seconds ---" % (time.time() - start_time))

+----------+----------+----------+----------+----------+----------+--------------------+
|PHYSICALID|COUNT_2015|COUNT_2016|COUNT_2017|COUNT_2018|COUNT_2019|            OLS_COEF|
+----------+----------+----------+----------+----------+----------+--------------------+
|         3|         0|         0|         0|         0|         0|                 0.0|
|         5|         0|         0|         0|         0|         0|                 0.0|
|         6|         0|         0|         0|         0|         0|                 0.0|
|         8|         0|         0|         0|         0|         0|                 0.0|
|        14|         0|         0|         0|         0|         0|                 0.0|
|        23|         0|         0|         0|         0|         0|                 0.0|
|        24|         0|         0|         0|         0|         0|                 0.0|
|        25|         0|         0|         0|         0|         0|                 0.0|
|        29|         

In [20]:

# streets_df.write.csv('TODO', header=False)

In [172]:
# 3|      86|    WYCOFF AVE|
#  3|    9252|       5TH AVE|2015|      null
s = pd.read_csv(streets)
unmatched_violations.filter(col("STREETNAME").between(1,1)).show()

+------+--------+-------------+----+----------+
|COUNTY|HOUSENUM|   STREETNAME|YEAR|PHYSICALID|
+------+--------+-------------+----+----------+
|     1|     191|     AVENUE A|2015|      null|
|     1|    1176|      5TH AVE|2015|      null|
|     1|     390|    E 23RD ST|2015|      null|
|     1|     390|    E 23RD ST|2015|      null|
|     1| 1345049|AMSTERDAM AVE|2015|      null|
|     1|      33| WHITEHALL ST|2015|      null|
|     1|  103105|      MOTT ST|2015|      null|
|     1|      15|    E 21ST ST|2015|      null|
|     1|     440|    BROOME ST|2015|      null|
|     1|     535|    LENOX AVE|2015|      null|
|     1|     500|    E 73RD ST|2015|      null|
|     1|    3856|      10THAVE|2015|      null|
|     1|    4009|     BROADWAY|2015|      null|
|     1|    1410|     BROADWAY|2015|      null|
|     1|    7009|    E 32ND ST|2015|      null|
|     1|   13015|    W 27TH ST|2015|      null|
|     1|     635|      E 18 ST|2015|      null|
|     1|      41|    MONROE ST|2015|    

In [175]:
s[( s['L_LOW_HN'].notnull()) 
  & (s['BOROCODE'] == 1) 
#   & (s['L_LOW_HN'].str.startswith("5") | s['R_LOW_HN'].str.startswith("5"))
  & (s['FULL_STREE'].str.startswith("LEX", na=False))
 ][['PHYSICALID','L_LOW_HN', 'L_HIGH_HN', 'R_LOW_HN',
       'R_HIGH_HN', 'ST_LABEL', 'BOROCODE', 'FULL_STREE'] ].sort_values(by=['L_LOW_HN'])



Unnamed: 0,PHYSICALID,L_LOW_HN,L_HIGH_HN,R_LOW_HN,R_HIGH_HN,ST_LABEL,BOROCODE,FULL_STREE
320373,3458,10,18,11,17,LEXINGTON AV,1,LEX
469546,3458,10,18,11,17,LEXINGTON AV,1,LEX AVENUE
322564,3458,10,18,11,17,LEXINGTON AV,1,LEXINGTON
456062,3458,10,18,11,17,LEXINGTON AV,1,LEXINGTON AVE
157553,3458,10,18,11,17,LEXINGTON AV,1,LEX AV
...,...,...,...,...,...,...,...,...
255075,3384,976,998,985,999,LEXINGTON AV,1,LEXINGTON
600389,3384,976,998,985,999,LEXINGTON AV,1,LEXINGTON AVE
389871,3384,976,998,985,999,LEXINGTON AV,1,LEX AVENUE
24449,3384,976,998,985,999,LEXINGTON AV,1,LEX AVE


In [156]:
print(get_val(1, "LEXINGTON AVE", 575))
housenum = 575
for item in streets_dict.get((1, "LEXINGTON AVE")):
    print(item, item[2] <= housenum and housenum <= item[3])

None
(2105, 2125, 2108, 2118, 3438) False
(415, 435, 418, 436, 3356) False
(1709, 1733, 1714, 1730, 3420) False
(2105, 2125, 2108, 2118, 3438) False
(1195, 1209, 1190, 1208, 3394) False
(375, 391, 374, 392, 3354) False
(767, 781, 764, 782, 3373) False
(723, 741, 722, 740, 3371) False
(1585, 1615, 1590, 1614, 3414) False
(179, 199, 180, 196, 3449) False
(1551, 1565, 1550, 1568, 3412) False
(803, 821, 804, 822, 3375) False
(637, 655, 636, 654, 3367) False
(923, 941, 926, 942, 3381) False
(459, 475, 460, 474, 3358) False
(677, 699, 678, 698, 3369) False
(393, 413, 394, 416, 3355) False
(889, 899, 890, 898, 3379) False
(2105, 2125, 2108, 2118, 3438) False
(1019, 1031, 1024, 1034, 3386) False
(61, 77, 60, 78, 3455) False
(1425, 1445, 1420, 1444, 3406) False
(1525, 1549, 1528, 1548, 3411) False
(823, 839, 824, 842, 3376) False
(1, 9, 2, 8, 3459) False
(315, 333, 314, 332, 3442) False
(2105, 2125, 2108, 2118, 3438) False
(1125, 1139, 1120, 1140, 3391) False
(867, 887, 868, 888, 3378) False
(3

In [173]:
s.head()

Unnamed: 0,PHYSICALID,the_geom,L_LOW_HN,L_HIGH_HN,R_LOW_HN,R_HIGH_HN,L_ZIP,R_ZIP,L_BLKFC_ID,R_BLKFC_ID,...,PRE_MODIFI,PRE_DIRECT,PRE_TYPE,POST_TYPE,POST_DIREC,POST_MODIF,FULL_STREE,ST_NAME,BIKE_TRAFD,SHAPE_Leng
0,164809,MULTILINESTRING ((-73.87861544017795 40.861915...,,,,,10458.0,10458.0,0,0,...,,,,TRL,,,MITSUBISHI WILD WETLAND TRL,MITSUBISHI WILD WETLAND,,1026.077523
1,6110,MULTILINESTRING ((-73.7729030190404 40.7778042...,215-001,215-027,215-000,215-026,11360.0,11360.0,112261166,112262650,...,,,,AVE,,,28 AVE,28,,258.85974
2,145494,MULTILINESTRING ((-73.98181677514282 40.687329...,317,399,316,360,11217.0,11217.0,1922603730,1922612977,...,,,,ST,,,SCHERMERHORN ST,SCHERMERHORN,TW,609.424375
3,61140,MULTILINESTRING ((-73.90711253281893 40.905186...,5631,5699,5602,5698,10471.0,10471.0,1522604870,1522601877,...,,,,AVE,,,ARLINGTON AVE,ARLINGTON,,454.932922
4,12438,MULTILINESTRING ((-73.8300230194527 40.7140059...,120-011,120-011,0,0,11415.0,11415.0,92261717,92269521,...,,,,BLVD,,,QUEENS BLVD,QUEENS,,47.399228
