In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min, max
from utils.constants import Paths

In [None]:
# Create (or reuse) the notebook's Spark session, requesting 2g for both the
# executor and driver memory settings.
spark = (
    SparkSession.builder
    .appName("CSV Worker")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [3]:
def split_311_df(df, keyword, column=None):
    """Return the rows of a 311 DataFrame that mention ``keyword``.

    Args:
        df: Spark DataFrame of 311 complaints. It is expected to contain the
            column named by ``column`` or, when ``column`` is omitted, both
            "Descriptor" and "Complaint Type".
        keyword: Text to look for. It is matched as a literal substring, so
            ``%`` and ``_`` inside it carry no wildcard meaning.
        column: Optional name of the single column to search. When falsy,
            a row is kept if either "Descriptor" or "Complaint Type"
            contains the keyword.

    Returns:
        The filtered DataFrame with the "Descriptor" and "Complaint Type"
        columns dropped.
    """
    # `contains` performs a plain substring test; building a LIKE pattern from
    # the keyword would let `%` / `_` in the keyword act as SQL wildcards.
    if column:
        matches = col(column).contains(keyword)
    else:
        matches = (
            col("Descriptor").contains(keyword)
            | col("Complaint Type").contains(keyword)
        )
    # Both text columns are removed from the result regardless of which one
    # was used for matching.
    return df.filter(matches).drop("Descriptor", "Complaint Type")

In [10]:
# Load the base 311 dataset from the raw-data parquet directory.
base_311_df = spark.read.parquet(
    str(Paths.RAW_DATA_PARQUET / "user/base_311.parquet")
)

In order to aggregate the 311 dataset with the various government action datasets, we need to split it so that each subset can be matched with its corresponding dataset. We need the following subsets:
1. 311 pothole complaints (aggregated with pothole work orders dataset)
2. 311 parking complaints (aggregated with parking violations dataset)
3. 311 vacant lot complaints (aggregated with vacant lot cleanings dataset)

In [69]:
# Pothole complaints: match the keyword against the "Descriptor" column only.
pothole_311_df = split_311_df(
    base_311_df,
    keyword="Pothole",
    column="Descriptor",
)

In [5]:
# Parking complaints: no column given, so both "Descriptor" and
# "Complaint Type" are searched for the keyword.
parking_311_df = split_311_df(base_311_df, keyword="Parking")

23/12/03 19:36:58 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [11]:
# Vacant lot complaints: match the keyword against "Complaint Type" only.
vacant_311_df = split_311_df(
    base_311_df,
    keyword="Vacant Lot",
    column="Complaint Type",
)

This is just for brevity. In reality, you'll need to free at least base_311_df, if not the other two, before writing, because this is too much to fit into memory all at once.

In [15]:
# Subset names and their DataFrames, kept in matching order so they can be
# paired positionally.
names = [
    "pothole",
    "parking",
    "vacant",
]
dfs = [
    pothole_311_df,
    parking_311_df,
    vacant_311_df,
]

In [None]:
# Write every 311 subset to its own parquet location, named after the subset.
for name, df in zip(names, dfs):
    df.write.parquet(
        str(Paths.RAW_DATA_PARQUET / f"user/{name}_311.parquet")
    )

In [14]:
# Shut down the Spark session used by this notebook.
spark.stop()