In [1]:
import os
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.2
      /_/

Using Python version 2.7.15 (default, Oct  2 2018 11:42:04)
SparkSession available as 'spark'.


## Extract the column names

In [2]:
# datafile = "green_tripdata_2018-06.csv" # file with locationid
datafile = "green_tripdata_2013-12.csv" # file with long/lat
with open(datafile) as f:
    col_names = f.readline()
col_names = col_names.split(",")
col_names

['VendorID',
 'lpep_pickup_datetime',
 'Lpep_dropoff_datetime',
 'Store_and_fwd_flag',
 'RateCodeID',
 'Pickup_longitude',
 'Pickup_latitude',
 'Dropoff_longitude',
 'Dropoff_latitude',
 'Passenger_count',
 'Trip_distance',
 'Fare_amount',
 'Extra',
 'MTA_tax',
 'Tip_amount',
 'Tolls_amount',
 'Ehail_fee',
 'Total_amount',
 'Payment_type',
 'Trip_type \n']

## Filter out the useful columns from dataset

In [3]:
column_map = {}
useful_columns = ["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "pickup_datetime","pickup_latitude", "pickup_longitude","dropoff_datetime","dolocationid", "pulocationid", "fare_amount", "trip_distance", "passenger_count"]

for i, c in enumerate(col_names):
    c = c.lower()
    for u in useful_columns:
        if c in u or u in c:
            column_map[u] = i
column_map

{'dropoff_datetime': 2,
 'dropoff_latitude': 8,
 'dropoff_longitude': 7,
 'fare_amount': 11,
 'passenger_count': 9,
 'pickup_datetime': 1,
 'pickup_latitude': 6,
 'pickup_longitude': 5,
 'trip_distance': 10}

## Create a dataframes from rdds. Cast them to appropriate type. Drop null valued rows

In [4]:
from pyspark.sql import Row
from pyspark.sql.types import FloatType, IntegerType
from pyspark.sql.functions import col

rdd1 = sc.textFile(datafile).map(lambda line: line.split(",")).filter(lambda line: len(line)>1)

if 'dolocationid' in column_map:
    df = rdd1.map(lambda line: Row(pulocationid=line[column_map['pulocationid']], 
                              dolocationid=line[column_map['dolocationid']], 
                              pickup_datetime=line[column_map['pickup_datetime']], 
                              dropoff_datetime=line[column_map['dropoff_datetime']],
                              trip_distance=line[column_map['trip_distance']], 
                              fare_amount=line[column_map['fare_amount']], 
                              passenger_count=line[column_map['passenger_count']])).toDF()
    df = df.withColumn("dolocationid", df["dolocationid"].cast(IntegerType()))
    df = df.withColumn("fare_amount",df["fare_amount"].cast(FloatType()))
    df = df.withColumn("passenger_count", df["passenger_count"].cast(IntegerType()))
    df = df.withColumn("pulocationid", df["pulocationid"].cast(IntegerType()))
    df = df.withColumn("trip_distance", df["trip_distance"].cast(FloatType()))
    
    df = df.filter(col("dolocationid").isNotNull() & col("dropoff_datetime").isNotNull() & \
                   col("fare_amount").isNotNull() & col("passenger_count").isNotNull() & \
                   col("pulocationid").isNotNull() & col("trip_distance").isNotNull() & col("pickup_datetime").isNotNull())

else:
    df = rdd1.map(lambda line: Row(pickup_longitude=line[column_map['pickup_longitude']], 
                                  pickup_latitude=line[column_map['pickup_latitude']], 
                                  dropoff_longitude=line[column_map['dropoff_longitude']], 
                                  dropoff_latitude=line[column_map['dropoff_latitude']], 
                                  pickup_datetime=line[column_map['pickup_datetime']], 
                                  dropoff_datetime=line[column_map['dropoff_datetime']], 
                                  trip_distance=line[column_map['trip_distance']], 
                                  fare_amount=line[column_map['fare_amount']], 
                                  passenger_count=line[column_map['passenger_count']])).toDF()
    df = df.withColumn("dropoff_longitude", df["dropoff_longitude"].cast(FloatType()))
    df = df.withColumn("dropoff_latitude", df["dropoff_latitude"].cast(FloatType()))
    df = df.withColumn("fare_amount",df["fare_amount"].cast(FloatType()))
    df = df.withColumn("passenger_count", df["passenger_count"].cast(IntegerType()))
    df = df.withColumn("pickup_longitude", df["pickup_longitude"].cast(FloatType()))
    df = df.withColumn("pickup_latitude", df["pickup_latitude"].cast(FloatType()))
    df = df.withColumn("trip_distance", df["trip_distance"].cast(FloatType()))
    df = df.filter(col("dropoff_longitude").isNotNull() & col("dropoff_datetime").isNotNull() & \
                   col("fare_amount").isNotNull() & col("passenger_count").isNotNull() & \
                   col("dropoff_longitude").isNotNull() & col("trip_distance").isNotNull() & \
                   col("pickup_latitude").isNotNull() & col("pickup_longitude").isNotNull() & \
                   col("pickup_datetime").isNotNull())

df.show()


+-------------------+----------------+-----------------+-----------+---------------+-------------------+---------------+----------------+-------------+
|   dropoff_datetime|dropoff_latitude|dropoff_longitude|fare_amount|passenger_count|    pickup_datetime|pickup_latitude|pickup_longitude|trip_distance|
+-------------------+----------------+-----------------+-----------+---------------+-------------------+---------------+----------------+-------------+
|2013-12-01 20:44:23|       40.742355|        -73.95726|       13.0|              1|2013-12-01 00:00:00|            0.0|             0.0|          4.0|
|2013-12-01 02:53:23|             0.0|              0.0|       12.0|              1|2013-12-01 00:00:00|            0.0|             0.0|         3.17|
|2013-12-01 02:17:35|       40.729755|        -73.99153|       13.0|              1|2013-12-01 00:00:00|            0.0|             0.0|         3.61|
|2013-12-01 12:57:31|        40.75857|        -73.76845|       19.0|              1|2013

## Get the pickup and dropoff zipcode for the trip

In [7]:
if 'dolocationid' in column_map:
    import csv
    from itertools import chain
    from pyspark.sql.functions import create_map, lit
    
    mapFile = 'locationid_zipcode_map.csv'
    reader = csv.reader(open(mapFile, 'r'))
    zipcodeMap = {}
    reader.next()
    for row in reader:
        try:
            zipcodeMap[int(row[0])] = int(row[1])
        except ValueError:
            continue

    mapping_expr = create_map([lit(x) for x in chain(*zipcodeMap.items())])
    df = df.withColumn("dropoff_zipcode", mapping_expr[df["dolocationid"]])
    df = df.withColumn("pickup_zipcode", mapping_expr[df["pulocationid"]])
    df = df.drop("dolocationid", "pulocationid")
else:
    from uszipcode import SearchEngine
    from uszipcode import Zipcode
    from pyspark.sql.functions import udf, array
    import numpy as np

    search = SearchEngine(simple_zipcode=True)
    def get_zip_code(para):
        try:
            result = search.by_coordinates(para[0], para[1], radius=5, returns=1)
            if result:
                return result[0].zipcode
            else:
                return np.nan
        except ValueError as e:
            print(para)
            raise ValueError(e)
    
    
    df = df.withColumn("dropoff_zipcode", udf_get_zip_code([df["dropoff_latitude"], df["dropoff_longitude"]]))
#     df = df.withColumn("pickup_zipcode", udf_get_zip_code(array(df["pickup_latitude"], df["pickup_longitude"])))
    df = df.drop("dropoff_latitude", "dropoff_longitude", "pickup_latitude", "pickup_longitude")

Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/2.3.2/libexec/python/pyspark/cloudpickle.py", line 235, in dump
    return Pickler.dump(self, obj)
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 224, in dump
    self.save(obj)
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 554, in save_tuple
    save(element)
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/local/Cellar/apache-spark/2.3.2/libexec/python/pyspark/cloudpickle.py", line 372, in save_function
    self.save_function_tuple(obj)
  File

PicklingError: Could not serialize object: TypeError: can't pickle wrapper_descriptor objects

In [None]:
df.show()