In [1]:
# Locate the local Spark installation and put pyspark on sys.path.
# This has to run before the `import pyspark` cell that follows.
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the Spark session for this notebook.
# "local[*]" runs Spark in-process with one worker thread per logical core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [6]:
from pyspark.sql import types

In [7]:
# Explicit schema for the FHVHV trip CSV: (column name, Spark type) pairs,
# listed in the order the StructType should declare them.
fhvhv_fields = [
    ('dispatching_base_num', types.StringType()),
    ('pickup_datetime', types.TimestampType()),
    ('dropoff_datetime', types.TimestampType()),
    ('PULocationID', types.IntegerType()),
    ('DOLocationID', types.IntegerType()),
    ('SR_Flag', types.StringType()),
    ('Affiliated_base_number', types.StringType()),
]

# Every column is declared nullable (third StructField argument is True).
schema = types.StructType([
    types.StructField(field_name, field_type, True)
    for field_name, field_type in fhvhv_fields
])

In [8]:
# Load the gzipped June 2021 trip CSV, treating the first row as a header
# and applying the explicit schema instead of inferring column types.
df = spark.read.csv(
    'data/fhvhv/fhvhv_tripdata_2021-06.csv.gz',
    schema=schema,
    header=True,
)

In [9]:
# Redistribute the rows across 12 partitions so the parquet write below
# produces 12 part files (a single .gz input is not splittable on read).
df = df.repartition(numPartitions=12)

In [10]:
# Persist the repartitioned trips as parquet.
# The default save mode ("errorifexists") makes this cell raise on every
# re-run once the directory exists; "overwrite" keeps the notebook
# re-runnable from a fresh kernel (Restart & Run All).
# NOTE: run this cell while `df` still refers to the CSV-backed frame;
# Spark refuses to overwrite a path that the same frame is reading from.
df.write.mode('overwrite').parquet('fhvhv/pq/2021/06/')

In [12]:
# Re-load the trips from the parquet copy; later cells query this frame.
df = spark.read.format('parquet').load('fhvhv/pq/2021/06/')

In [13]:
from pyspark.sql import functions as F

In [14]:
# Count the trips whose pickup timestamp falls on 2021-06-15.
pickup_day = F.to_date(df.pickup_datetime)  # drop the time-of-day part
df.filter(pickup_day == '2021-06-15').count()

452470

In [15]:
# Longest trip in the dataset, expressed in hours.
# Casting a timestamp to long yields epoch seconds, so the difference is
# the trip length in seconds.
duration_seconds = df.dropoff_datetime.cast('long') - df.pickup_datetime.cast('long')

# 3600 seconds per hour; round to two decimals for display.
longest_hours = F.round((F.max('trip_durance') / 3600), 2).alias('max_durance')

df.withColumn('trip_durance', duration_seconds).select(longest_hours).show()

+-----------+
|max_durance|
+-----------+
|      66.88|
+-----------+



In [16]:
# Zone lookup table; joined to the trips on LocationID in the next cell.
df_zones = spark.read.format('parquet').load('zones/')

In [17]:
# Five most frequent pickup zones.
# Attach the zone record to each trip via its pickup location id.
trips_with_zone = df.join(
    df_zones,
    on=df.PULocationID == df_zones.LocationID,
    how='inner',
)

# Number of trips per zone name, busiest first.
pickups_per_zone = trips_with_zone.groupBy('Zone').count()
pickups_per_zone.orderBy(F.col('count').desc()).limit(5).show()

+-------------------+------+
|               Zone| count|
+-------------------+------+
|Crown Heights North|231279|
|       East Village|221244|
|        JFK Airport|188867|
|     Bushwick South|187929|
|      East New York|186780|
+-------------------+------+

