In [1]:
import pandas as pd
import numpy as np
import string
import traceback
import time

from datetime import datetime
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, array, count, split
from pyspark.sql.functions import broadcast,coalesce, lit

In [2]:
sc = SparkContext()
spark = SparkSession(sc)

In [3]:
streets = "nyc_cscl.csv"
violations = "nyc_parking_violation/*.csv"

#streets = "hdfs:///tmp/bdm/nyc_cscl.csv"
#violations = "hdfs:///tmp/bdm/nyc_parking_violation/*.csv"

In [4]:
def to_upper(raw_str):
    import string
    if raw_str is not None and raw_str != "":
        clean_str = raw_str.strip(string.punctuation)
        return clean_str.strip().upper()
    return None

def get_county_code(county):
    if county is not None:
        # Boro codes: 1 = MN, 2 = BX, 3 = BK, 4 = QN, 5 = SI
        if county.startswith("M") or county.startswith("N"):
            return 1
        if county in ['BRONX', 'BX', 'PBX']:
            return 2
        if county in ['BK', 'K', 'KING', 'KINGS']:
            return 3
        if county.startswith('Q'):
            return 4
        if county.startswith('R') or county.startswith('ST'):
            return 5
    return None

def get_year(date_string):
    if date_string is not None:
        data_val = datetime.strptime(date_string.strip(), '%m/%d/%Y')  
        if data_val.year in list(range(2015,2020)):
            return data_val.year
    return None

def get_street_number(street_val_raw, default=None):
    if street_val_raw is not None:
        if type(street_val_raw) is int:
            return street_val_raw
        
        street_val = street_val_raw.strip(string.ascii_letters)
        elems = street_val.split("-")  
        
        if len(elems) == 1 and elems[0].isdigit():
            return int(elems[0])
        
        elif len(elems) == 2 and elems[0].isdigit() and elems[1].isdigit():
            new_val = elems[0] + "{:01d}".format(int(elems[1]))
            if new_val.isdigit():
                return int(new_val)   
        
        elif len(elems) == 3 and elems[0].isdigit() and elems[2].isdigit():
            new_val = elems[0] + "{:01d}".format(int(elems[2]))
            if new_val.isdigit():
                return int(new_val) 
        else:
            new_val = "".join(elems)
            if new_val.isdigit():
                return int(new_val)
    return default

def as_digit(val):
    if val:
        return int(val)
    return val

def getOLS(values):
    import statsmodels.api as sm
    X = sm.add_constant([1,2,3,4,5])
    fit = sm.OLS(values, X).fit()
    coef = fit.params[1]
    return float(coef)


In [5]:
def get_violations_df(violations_file, spark):
    get_county_code_udf = udf(get_county_code)
    get_street_number_udf = udf(get_street_number)
    get_year_udf = udf(get_year)
    to_upper_udf = udf(to_upper)
    
    violations_df = spark.read.csv(violations_file, header=True, inferSchema=False)

    violations_df = violations_df.select("Violation County", "House Number", "Street Name", "Issue Date")

    violations_df = violations_df.withColumnRenamed("Violation County","COUNTY")
    violations_df = violations_df.withColumnRenamed("House Number","HOUSENUM")
    violations_df = violations_df.withColumnRenamed("Street Name","STREETNAME")
    violations_df = violations_df.withColumnRenamed("Issue Date","YEAR")
    
    violations_df = violations_df.withColumn('HOUSENUM_RAW', col("HOUSENUM"))

    split_col = split(violations_df['HOUSENUM'], '-')
    violations_df = violations_df.withColumn('NUM0', split_col.getItem(0))
    violations_df = violations_df.withColumn('NUM1', split_col.getItem(1))
    violations_df = violations_df.withColumn('NUM0', get_street_number_udf(violations_df['NUM0'],lit(0)))
    violations_df = violations_df.withColumn('NUM1', get_street_number_udf(violations_df['NUM1'],lit(0)))

    
    violations_df = violations_df.withColumn('COUNTY', get_county_code_udf(violations_df['COUNTY']))
    
    violations_df = violations_df.withColumn('HOUSENUM', get_street_number_udf(violations_df['HOUSENUM']))
    
    violations_df = violations_df.withColumn('STREETNAME', to_upper_udf(violations_df['STREETNAME']))
    violations_df = violations_df.withColumn('YEAR', get_year_udf(violations_df['YEAR']))

    violations_df = violations_df.filter((violations_df['COUNTY'].isNotNull()) 
                                         & (violations_df['HOUSENUM'].isNotNull()) 
                                         & (violations_df['STREETNAME'].isNotNull()) 
                                         & (violations_df['YEAR'].isNotNull())
                                        )
    
    violations_df = violations_df.withColumn("COUNTY", violations_df["COUNTY"].cast("integer"))
    violations_df = violations_df.withColumn("HOUSENUM", violations_df["HOUSENUM"].cast("integer"))
    violations_df = violations_df.withColumn("YEAR", violations_df["YEAR"].cast("integer"))
    
    violations_df = violations_df.withColumn('NUM0', violations_df["NUM0"].cast("integer"))
    violations_df = violations_df.withColumn('NUM1', violations_df["NUM1"].cast("integer"))

    violations_df = violations_df.repartition(5,'COUNTY')
    violations_df = violations_df.alias('v')
    return violations_df

In [6]:
def get_streets_df(streets_file, spark):
    get_street_number_udf = udf(get_street_number)
    to_upper_udf = udf(to_upper)
    as_digit_udf = udf(as_digit)
    
    streets_df = spark.read.csv(streets_file, header=True, inferSchema=False)

    streets_df = streets_df.select("PHYSICALID","BOROCODE", "FULL_STREE", "ST_LABEL","L_LOW_HN", "L_HIGH_HN", 
                                   "R_LOW_HN", "R_HIGH_HN")

    streets_df = streets_df.withColumnRenamed("L_LOW_HN","OddLo")
    streets_df = streets_df.withColumnRenamed("L_HIGH_HN","OddHi")
    streets_df = streets_df.withColumnRenamed("R_LOW_HN","EvenLo")
    streets_df = streets_df.withColumnRenamed("R_HIGH_HN","EvenHi")
    
    streets_df = streets_df.filter((streets_df['BOROCODE'].isNotNull()) 
                                   & (streets_df['PHYSICALID'].isNotNull())
                                  )
    
    streets_df = streets_df.withColumn('BOROCODE', as_digit_udf(streets_df['BOROCODE']))
    streets_df = streets_df.withColumn('FULL_STREE', to_upper_udf(streets_df['FULL_STREE']))
    streets_df = streets_df.withColumn('ST_LABEL',   to_upper_udf(streets_df['ST_LABEL']))

    streets_df = streets_df.withColumn("BOROCODE", streets_df["BOROCODE"].cast("integer"))

    streets_df = streets_df.repartition(5, 'BOROCODE')
    streets_df = streets_df.alias('s')
    return streets_df

In [7]:
violations_df = get_violations_df(violations, spark)
streets_df = get_streets_df(streets, spark)

In [8]:
violations_df.show(20)

+------+--------+------------+----+------------+----+----+
|COUNTY|HOUSENUM|  STREETNAME|YEAR|HOUSENUM_RAW|NUM0|NUM1|
+------+--------+------------+----+------------+----+----+
|     3|      12|  GRAHAM AVE|2015|          12|  12|   0|
|     3|     860|    PARK AVE|2015|         860| 860|   0|
|     3|    1269|     53RD ST|2015|        1269|1269|   0|
|     3|     140|     6TH AVE|2015|         140| 140|   0|
|     3|    1017|      FOSTER|2015|        1017|1017|   0|
|     3|     688|     5TH AVE|2015|         688| 688|   0|
|     3|     342|PROSPECT AVE|2015|         342| 342|   0|
|     5|     186|  POTTER AVE|2015|         186| 186|   0|
|     3|    4906|    13TH AVE|2015|        4906|4906|   0|
|     3|     475|  DRIGGS AVE|2015|         475| 475|   0|
|     3|    1452|     73RD ST|2015|        1452|1452|   0|
|     3|     951| EASTERN PKY|2015|         951| 951|   0|
|     3|    1096|ATLANTIC AVE|2015|        1096|1096|   0|
|     3|    7612|     5TH AVE|2015|        7612|7612|   

In [9]:
streets_df.show(10)

+----------+--------+----------------+----------------+-----+-----+------+------+
|PHYSICALID|BOROCODE|      FULL_STREE|        ST_LABEL|OddLo|OddHi|EvenLo|EvenHi|
+----------+--------+----------------+----------------+-----+-----+------+------+
|    145494|       3| SCHERMERHORN ST| SCHERMERHORN ST|  317|  399|   316|   360|
|     15432|       5|   ARTHUR KIL RD|  ARTHUR KILL RD|  555|  555|   528|   554|
|     95944|       5|        TIDES LN|        TIDES LA|   15|   15|     0|     0|
|     94775|       5|CYPRESS CREST LN|CYPRESS CREST LA| null| null|  null|  null|
|     47311|       3|     WALWORTH ST|     WALWORTH ST|   67|  131|    72|   134|
|     42619|       3|           58 ST|           58 ST|  701|  799|   700|   798|
|     94303|       5|    PRESIDENT ST|    PRESIDENT ST|   43|   69|    44|    60|
|     43450|       3|           I AVE|            AV I|  601|  699|   600|   698|
|     96186|       5|       SILVER CT|       SILVER CT|    1|   57|     2|    60|
|     39179|    

In [10]:
def process_num(str_num, default=None):
    if str_num is not None:

        str_num_clean = str_num.strip(string.ascii_letters)
        elems = str_num_clean.split("-")  
        
        if len(elems) == 1 and elems[0].isdigit():
            return (0, int(elems[0]))
        
        elif len(elems) == 2 and elems[0].isdigit() and elems[1].isdigit():
             return (int(elems[0]), int(elems[1]))
        
        elif len(elems) == 3 and elems[0].isdigit() and elems[2].isdigit():
             return (int(elems[0]), int(elems[2]))
        else:
            new_val = "".join(elems)
            if new_val.isdigit():
                return (0, int(new_val))
    return (0,0)

def mapper_2(row):
    evenLo = process_num(row['EvenLo'])
    evenHi = process_num(row['EvenHi'])
    oddLo = process_num(row['OddLo'])
    oddHi = process_num(row['OddHi'])
    
    if row['FULL_STREE'] == row['ST_LABEL']:
        yield ( 
                (row['BOROCODE'], row["FULL_STREE"] ), 
                [( evenLo, evenHi, oddLo, oddHi, row['PHYSICALID'] )] 
              ) 
    else:
        yield ( 
                (row['BOROCODE'], row["FULL_STREE"]), 
                [( evenLo, evenHi, oddLo, oddHi, row['PHYSICALID'] )] 
              ) 
        yield ( 
                (row['BOROCODE'], row["ST_LABEL"]), 
                [( evenLo, evenHi, oddLo, oddHi, row['PHYSICALID'] ) ]
              ) 
    

def mapper(row):    
    if row['FULL_STREE'] == row['ST_LABEL']:
        yield ( 
                (row['BOROCODE'], row["FULL_STREE"] ), 
                [( row['EvenLo'],row['EvenHi'],row['OddLo'],row['OddHi'], row['PHYSICALID'] )] 
              ) 
    else:
        yield ( 
                (row['BOROCODE'], row["FULL_STREE"]), 
                [( row['EvenLo'],row['EvenHi'],row['OddLo'],row['OddHi'] ,row['PHYSICALID'] )] 
              ) 
        yield ( 
                (row['BOROCODE'], row["ST_LABEL"]), 
                [( row['EvenLo'],row['EvenHi'],row['OddLo'],row['OddHi'], row['PHYSICALID'] ) ]
              ) 

streets_dict = streets_df.rdd.flatMap(mapper_2).reduceByKey(lambda x,y: x+y).collectAsMap()
streets_dict_bc = sc.broadcast(streets_dict)
list(streets_dict.values())[:10]

[[((0, 0), (0, 0), (0, 0), (0, 0), '94775'),
  ((0, 0), (0, 0), (0, 0), (0, 0), '94774'),
  ((0, 0), (0, 0), (0, 0), (0, 0), '94778'),
  ((0, 0), (0, 0), (0, 0), (0, 0), '94776'),
  ((0, 0), (0, 0), (0, 0), (0, 0), '94777'),
  ((0, 0), (0, 0), (0, 0), (0, 0), '94779')],
 [((0, 72), (0, 134), (0, 67), (0, 131), '47311'),
  ((0, 136), (0, 182), (0, 133), (0, 177), '47312'),
  ((0, 184), (0, 250), (0, 179), (0, 243), '47313'),
  ((0, 2), (0, 70), (0, 1), (0, 65), '47310'),
  ((0, 72), (0, 134), (0, 67), (0, 131), '47311'),
  ((0, 72), (0, 134), (0, 67), (0, 131), '47311'),
  ((0, 184), (0, 250), (0, 179), (0, 243), '47313'),
  ((0, 136), (0, 182), (0, 133), (0, 177), '47312'),
  ((0, 2), (0, 70), (0, 1), (0, 65), '47310'),
  ((0, 184), (0, 250), (0, 179), (0, 243), '47313'),
  ((0, 136), (0, 182), (0, 133), (0, 177), '47312'),
  ((0, 2), (0, 70), (0, 1), (0, 65), '47310')],
 [((0, 314), (0, 336), (0, 313), (0, 335), '39179'),
  ((0, 3094), (0, 3116), (0, 3093), (0, 3117), '44266'),
  ((0,

In [11]:
def search_candidates(candidates, housenum):
    for item in candidates:
        if housenum % 2 == 0:
            if item[0] <= housenum and housenum <= item[1]:
                return item[4]
        else:
            if item[2] <= housenum and housenum <= item[3]:
                return item[4]  
    return None
    
def get_val(borocode, street, housenum, housenum_raw=None):
    res = None
    candidates = streets_dict_bc.value.get( (borocode, street) )
    
    if candidates:
        res = search_candidates(candidates, housenum)
        if res is None and housenum_raw and type(housenum_raw) == str and "-" in housenum_raw:
            elems = housenum_raw.split("-")
            if len(elems) == 2 and elems[0].isdigit() and elems[1].isdigit():
                res = search_candidates(candidates, int(elems[0]+elems[1]))
                if res is None:
                    res = search_candidates(candidates, int(elems[1]))

    return res

def search_candidates_2(candidates, housenum):
    for item in candidates:
        if housenum[1] % 2 == 0:
            if item[0] <= housenum and housenum <= item[1]:
                return item[4]
        else:
            if item[2] <= housenum and housenum <= item[3]:
                return item[4]  
    return None
    
def get_val_2(borocode, street, num0, num1):
    res = None
    housenum = (num0, num1)
    if num0 != 0 and num1 == 0:
        housenum = (num1, num0)
    candidates = streets_dict_bc.value.get( (borocode, street) )
    if candidates:
        res = search_candidates_2(candidates, housenum)
        if res is None and num0 > 1000 and num1 == 0:
            housenum = (int(num0/100), num0%100)
            res = search_candidates_2(candidates, housenum)
        if res is None and num0 != 0 and num1 != 0:
            housenum = (0, (num0*100)+num1) 
            res = search_candidates_2(candidates, housenum)
        if res is None and num0 != 0 and num1 != 0:
            housenum = (0, num1)
            res = search_candidates_2(candidates, housenum)        
    return res


In [12]:
get_val_udf = udf(get_val_2)

matched_violations = violations_df.withColumn('PHYSICALID', 
                                              get_val_udf(violations_df['v.COUNTY'], violations_df['v.STREETNAME'], 
                                                                        violations_df['v.NUM0'], violations_df['v.NUM1']))

unmatched_violations = matched_violations.filter( ~matched_violations['PHYSICALID'].isNotNull() )
print("Violations Unmatched:", unmatched_violations.count())
matched_violations = matched_violations.filter( matched_violations['PHYSICALID'].isNotNull() )
print("Violations Matched:", matched_violations.count())

matched_violations = matched_violations.withColumn("PHYSICALID", matched_violations["PHYSICALID"].cast("integer"))
matched_violations = matched_violations.orderBy("PHYSICALID")

Violations Unmatched: 155
Violations Matched: 4192


In [13]:
matched_violations = matched_violations.groupBy("PHYSICALID", "YEAR").agg(count("*").alias("YEAR_COUNT"))
matched_violations.show(10)

+----------+----+----------+
|PHYSICALID|YEAR|YEAR_COUNT|
+----------+----+----------+
|        50|2015|         2|
|        62|2015|         2|
|        86|2015|         1|
|       121|2015|         1|
|       125|2015|         2|
|       131|2015|         1|
|       132|2015|         1|
|       139|2015|         1|
|       140|2015|         1|
|       145|2015|         1|
+----------+----+----------+
only showing top 10 rows



In [14]:
matched_violations.createOrReplaceTempView("matched_violations")

In [15]:
summaries = spark.sql(
    "select PHYSICALID, " +
    "MAX(CASE WHEN (YEAR = 2015) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2015, " +
    "MAX(CASE WHEN (YEAR = 2016) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2016, " +
    "MAX(CASE WHEN (YEAR = 2017) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2017, " +
    "MAX(CASE WHEN (YEAR = 2018) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2018, " +
    "MAX(CASE WHEN (YEAR = 2019) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2019  " +
    "from matched_violations " +
    "group by PHYSICALID " +
    "order by PHYSICALID "
)
summaries.show(10)

+----------+----------+----------+----------+----------+----------+
|PHYSICALID|COUNT_2015|COUNT_2016|COUNT_2017|COUNT_2018|COUNT_2019|
+----------+----------+----------+----------+----------+----------+
|        50|         2|         0|         0|         0|         0|
|        62|         2|         0|         0|         0|         0|
|        86|         1|         0|         0|         0|         0|
|       121|         1|         0|         0|         0|         0|
|       125|         2|         0|         0|         0|         0|
|       131|         1|         0|         0|         0|         0|
|       132|         1|         0|         0|         0|         0|
|       139|         1|         0|         0|         0|         0|
|       140|         1|         0|         0|         0|         0|
|       145|         1|         0|         0|         0|         0|
+----------+----------+----------+----------+----------+----------+
only showing top 10 rows



In [16]:
getOLS_udf = udf(getOLS)
summaries = summaries.withColumn('OLS_COEF', 
                getOLS_udf(array('COUNT_2015', 'COUNT_2016', 'COUNT_2017', 'COUNT_2018', 'COUNT_2019')))

In [17]:
streets_df = streets_df.select(col("s.PHYSICALID")) \
                    .join(summaries, "PHYSICALID", how='left') \
                    .distinct() \
                    .orderBy("PHYSICALID") \

streets_df = streets_df.withColumn("COUNT_2015",coalesce("COUNT_2015", lit(0))) 
streets_df = streets_df.withColumn("COUNT_2016",coalesce("COUNT_2016", lit(0))) 
streets_df = streets_df.withColumn("COUNT_2017",coalesce("COUNT_2017", lit(0))) 
streets_df = streets_df.withColumn("COUNT_2018",coalesce("COUNT_2018", lit(0))) 
streets_df = streets_df.withColumn("COUNT_2019",coalesce("COUNT_2019", lit(0))) 
streets_df = streets_df.withColumn("OLS_COEF",  coalesce("OLS_COEF", lit(0.0))) 

In [18]:
start_time = time.time()
streets_df.show(10)
print("--- %s seconds ---" % (time.time() - start_time))

+----------+----------+----------+----------+----------+----------+--------+
|PHYSICALID|COUNT_2015|COUNT_2016|COUNT_2017|COUNT_2018|COUNT_2019|OLS_COEF|
+----------+----------+----------+----------+----------+----------+--------+
|       100|         0|         0|         0|         0|         0|     0.0|
|     10000|         0|         0|         0|         0|         0|     0.0|
|    100000|         0|         0|         0|         0|         0|     0.0|
|    100001|         0|         0|         0|         0|         0|     0.0|
|    100002|         0|         0|         0|         0|         0|     0.0|
|    100003|         0|         0|         0|         0|         0|     0.0|
|    100004|         0|         0|         0|         0|         0|     0.0|
|    100005|         0|         0|         0|         0|         0|     0.0|
|    100006|         0|         0|         0|         0|         0|     0.0|
|    100007|         0|         0|         0|         0|         0|     0.0|

In [None]:
# streets_df.write.csv('TODO', header=False)