In [12]:
from pyspark.sql.functions import col
%reload_ext autoreload
%autoreload 2
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql import DataFrame
import pyspark.sql.types as t
import pyspark.sql.types as f
from workers.Aliases import *
from workers import FareWorker, TripWorker

In [3]:
def init_spark() -> SparkSession:
    """Return a SparkSession on the "local" master with app name "Python".

    The builder is configured with a default-constructed ``SparkConf`` and
    finished with ``getOrCreate()``, so the caller receives whatever session
    that call yields.
    """
    builder = SparkSession.builder.master("local").appName("Python")
    builder = builder.config(conf=SparkConf())
    return builder.getOrCreate()

In [5]:
def create_df_example(spark: SparkSession) -> DataFrame:
    """Build a small demo DataFrame with three (Name, Age) rows.

    Args:
        spark: Session used to create the DataFrame.

    Returns:
        A DataFrame with a nullable string column "Name" and a nullable
        integer column "Age".
    """
    fields = [
        t.StructField("Name", t.StringType(), True),
        t.StructField("Age", t.IntegerType(), True),
    ]
    rows = [("Alice", 20), ("Bob", 30), ("Charlie", 50)]
    return spark.createDataFrame(rows, t.StructType(fields))

In [13]:
# Obtain the Spark session through init_spark() (which ends in getOrCreate()).
# The bare `spark` expression on the last line makes the notebook display the
# session object as the cell's output.
spark = init_spark()
spark

In [16]:
# Build the trip-data worker on the shared Spark session; the following cells
# read its DataFrame through `tripWorker.df`.
tripWorker = TripWorker.TripWorker(spark)

In [17]:
# Preview the first 10 rows of the trip DataFrame.
tripWorker.df.show(n=10)

+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+
|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|    pickup_datetime|   dropoff_datetime|passenger_count|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|
+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+
|89D227B655E5C82AE...|BA96DE419E711691B...|      CMT|        1|                 N|2013-01-01 15:11:48|2013-01-01 15:18:10|              4|              382|          1.0|      -73.978165|      40.757977|        -73.98984|        40.75117|
|0BD7C8F5BA12B88E0...|9FD8F69F0804BDB55...| 

In [9]:
# Print the column names, types and nullability of the trip DataFrame.
tripWorker.df.printSchema()

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)


In [10]:
# Worker-level schema helper. Its recorded output is identical to the
# tripWorker.df.printSchema() output in the preceding cell, so it is presumably
# a thin wrapper around that call — confirm in the TripWorker module.
tripWorker.schema()

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)


In [18]:
# Summary statistics for every numeric column of the trip DataFrame.
# The loop variable is called `field` rather than `f` so that it does not
# shadow the module alias `f` bound in the imports cell.
num_cols = [
    field.name
    for field in tripWorker.df.schema.fields
    if isinstance(field.dataType, t.NumericType)
]
tripWorker.df.describe(num_cols).show()

+-------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+
|summary|         rate_code|   passenger_count|trip_time_in_secs|     trip_distance|  pickup_longitude|   pickup_latitude| dropoff_longitude| dropoff_latitude|
+-------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+
|  count|         171899205|         171899205|        171899205|         171899205|         171899205|         171899205|         171899205|        171899205|
|   mean|1.0270787523421065|1.7116061647870913|757.7813952775407|2.9229047508589625|-72.55375877214989|39.808676497132545|-72.55916906300426|39.81256532636291|
| stddev|0.3005756867875085|1.3771786361954093|560.5441541100786| 20.95747950872376|10.967169280209191| 8.655816513663162|10.938848778547408|8.685174083633768|
|    min|                 0|            

In [27]:
# Distinct values of the rate_code column (at most 1000 rows printed).
(tripWorker.df
 .select(rate_code)
 .distinct()
 .show(1000))

+---------+
|rate_code|
+---------+
|        1|
|        6|
|        3|
|        5|
|        4|
|        8|
|        2|
|        0|
|      210|
|       28|
|        7|
|        9|
|       65|
|      128|
|       10|
|       13|
|       15|
|       79|
|      221|
|       17|
|      208|
|      206|
|       77|
|      239|
|      200|
|       16|
+---------+


In [28]:
# Distinct values of the passenger_count column (at most 1000 rows printed).
(tripWorker.df
 .select(passenger_count)
 .distinct()
 .show(1000))

+---------------+
|passenger_count|
+---------------+
|              1|
|              6|
|              3|
|              5|
|              4|
|              2|
|              0|
|            208|
|              9|
|            255|
|              7|
|              8|
|            129|
+---------------+


In [28]:
# List the distinct trip durations, expressed in whole minutes, that exceed
# 10 minutes, in ascending order.
# `trip_time_in_secs` is a column-name constant (presumably star-imported from
# workers.Aliases — confirm).
# NOTE(review): show(10000) allows up to 10000 rows in the cell output.
(tripWorker.df
 # seconds / 60, cast to int, exposed under the name "trip_time_in_min"
 .select((col(trip_time_in_secs) / 60).cast("int").alias("trip_time_in_min"))
 # keep only durations above 10 minutes
 .filter(col('trip_time_in_min') > 10)
 .distinct()
 .orderBy('trip_time_in_min')
 .show(10000))

+----------------+
|trip_time_in_min|
+----------------+
|              11|
|              12|
|              13|
|              14|
|              15|
|              16|
|              17|
|              18|
|              19|
|              20|
|              21|
|              22|
|              23|
|              24|
|              25|
|              26|
|              27|
|              28|
|              29|
|              30|
|              31|
|              32|
|              33|
|              34|
|              35|
|              36|
|              37|
|              38|
|              39|
|              40|
|              41|
|              42|
|              43|
|              44|
|              45|
|              46|
|              47|
|              48|
|              49|
|              50|
|              51|
|              52|
|              53|
|              54|
|              55|
|              56|
|              57|
|              58|
|              59|
|           

In [22]:
# Distinct trip_distance values whose value divided by 1000 exceeds 1, shown
# next to the scaled value and ordered by it.
# NOTE(review): the derived column is labelled "in km", but dividing by 1000 is
# only a km conversion if trip_distance is stored in metres. The describe()
# output earlier in the notebook reports a mean trip_distance of about 2.92,
# which suggests a much larger unit (the NYC TLC data dictionary documents
# miles) — confirm the unit, then either convert with the proper factor or
# rename the column. As written, the filter effectively selects rows with
# trip_distance > 1000.
(tripWorker.df
 .select(trip_distance, (col(trip_distance) / 1000.0).alias(trip_distance + ' in km'))
 .filter(col(trip_distance + ' in km') > 1)
 .distinct().orderBy(trip_distance + ' in km').show(10000))

+-------------+-------------------+
|trip_distance|trip_distance in km|
+-------------+-------------------+
|       1000.5|             1.0005|
|       1005.0|              1.005|
|       1040.0|               1.04|
|       1047.4| 1.0474000244140624|
|       1232.0|              1.232|
|       1300.0|                1.3|
|       1300.5|             1.3005|
|       1313.1| 1.3130999755859376|
|       1320.6| 1.3205999755859374|
|       1350.0|               1.35|
|       1440.0|               1.44|
|       1483.1| 1.4830999755859375|
|       1485.2|  1.485199951171875|
|       1500.0|                1.5|
|       1566.6| 1.5665999755859374|
|       1574.6| 1.5745999755859375|
|       1600.0|                1.6|
|       1650.0|               1.65|
|       1800.0|                1.8|
|       2001.9| 2.0019000244140623|
|       2139.5|             2.1395|
|       2500.5|             2.5005|
|       2800.5|             2.8005|
|       2840.0|               2.84|
|       3008.3|  3.008300048

In [19]:
# Build the fare-data worker on the same Spark session; the following cells
# read its DataFrame through `fareWorker.df`.
fareWorker = FareWorker.FareWorker(spark)

In [20]:
# Preview the leading rows of the fare DataFrame.
fareWorker.df.show()

+--------------------+--------------------+---------+-------------------+------------+-----------+---------+-------+----------+------------+------------+
|           medallion|        hack_license|vendor_id|    pickup_datetime|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+--------------------+--------------------+---------+-------------------+------------+-----------+---------+-------+----------+------------+------------+
|89D227B655E5C82AE...|BA96DE419E711691B...|      CMT|2013-01-01 15:11:48|         CSH|        6.5|      0.0|    0.5|       0.0|         0.0|         7.0|
|0BD7C8F5BA12B88E0...|9FD8F69F0804BDB55...|      CMT|2013-01-06 00:18:35|         CSH|        6.0|      0.5|    0.5|       0.0|         0.0|         7.0|
|0BD7C8F5BA12B88E0...|9FD8F69F0804BDB55...|      CMT|2013-01-05 18:49:41|         CSH|        5.5|      1.0|    0.5|       0.0|         0.0|         7.0|
|DFD2202EE08F7A8DC...|51EE87E3205C985EF...|      CMT|2013-01-07 23:54:15|   

In [None]:
# Summary statistics for every numeric column of the fare DataFrame.
# The loop variable is called `field` rather than `f` so that it does not
# shadow the module alias `f` bound in the imports cell.
num_cols = [
    field.name
    for field in fareWorker.df.schema.fields
    if isinstance(field.dataType, t.NumericType)
]
fareWorker.df.describe(num_cols).show()