In [1]:
import os

# Run PySpark's interactive-shell bootstrap script inside this kernel. Its
# banner reports that a SparkSession is made available as `spark`; later cells
# also use `sc`, which is presumably defined by the same script.
# NOTE: `execfile` only exists on Python 2.
pyspark_shell_script = os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')
execfile(pyspark_shell_script)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.2
      /_/

Using Python version 2.7.15 (default, Oct  2 2018 11:42:04)
SparkSession available as 'spark'.


## Extract the column names

In [20]:
datafile = "green_tripdata_2018-06.csv"

# Only the header line is needed here, so read a single line instead of the
# whole file.
with open(datafile) as f:
    first_line = f.readline()

# Split the header into column names. strip() removes the trailing newline so
# the last name is clean. A plain comma split matches how the data rows are
# split later in the notebook.
col_names = first_line.strip().split(",")
col_names

[u'VendorID',
 u'lpep_pickup_datetime',
 u'lpep_dropoff_datetime',
 u'store_and_fwd_flag',
 u'RatecodeID',
 u'PULocationID',
 u'DOLocationID',
 u'passenger_count',
 u'trip_distance',
 u'fare_amount',
 u'extra',
 u'mta_tax',
 u'tip_amount',
 u'tolls_amount',
 u'ehail_fee',
 u'improvement_surcharge',
 u'total_amount',
 u'payment_type',
 u'trip_type']

## Map the useful columns to their positions in the dataset

In [21]:
# column_map: logical field name -> index of the matching column in the CSV header.
column_map = {}

# Logical fields to look for in the header. Each name is listed once.
useful_columns = ["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",
                  "pickup_datetime", "dropoff_datetime", "dolocationid", "pulocationid",
                  "fare_amount", "trip_distance", "passenger_count"]

for index, header_name in enumerate(col_names):
    header_name = header_name.strip().lower()
    if not header_name:
        # An empty string is a substring of every wanted name; without this
        # guard a blank header field would overwrite every mapping.
        continue
    for wanted in useful_columns:
        # Substring match in either direction so prefixed header names
        # (e.g. 'lpep_pickup_datetime') still map to 'pickup_datetime'.
        # If several header columns match, the last one wins.
        if header_name in wanted or wanted in header_name:
            column_map[wanted] = index
column_map

{'dolocationid': 6,
 'dropoff_datetime': 2,
 'fare_amount': 9,
 'passenger_count': 7,
 'pickup_datetime': 1,
 'pulocationid': 5,
 'trip_distance': 8}

## Create a DataFrame from the RDD and cast its columns to appropriate types

In [27]:
from pyspark.sql import Row
from pyspark.sql.types import FloatType, TimestampType

# Pick the location fields based on what the header matched: location ids when
# 'dolocationid' was found, otherwise longitude/latitude pairs.
if 'dolocationid' in column_map:
    location_columns = ['dolocationid', 'pulocationid']
else:
    location_columns = ['dropoff_longitude', 'dropoff_latitude',
                        'pickup_longitude', 'pickup_latitude']

datetime_columns = ['pickup_datetime', 'dropoff_datetime']
numeric_columns = location_columns + ['fare_amount', 'passenger_count', 'trip_distance']

# Resolve (field name, csv index) pairs once on the driver. A missing field
# raises KeyError here rather than inside a Spark task, and the closure below
# only has to ship this small list.
selected_fields = [(name, column_map[name]) for name in datetime_columns + numeric_columns]

# sc.textFile returns every line of the file, including the header. Drop it so
# the column titles do not end up as a bogus data row.
raw_lines = sc.textFile(datafile)
header_line = raw_lines.first()

rdd1 = (raw_lines
        .filter(lambda line: line != header_line)
        .map(lambda line: line.split(","))
        .filter(lambda fields: len(fields) > 1))  # skip blank / single-field lines

# One Row per record holding only the selected fields; values are still strings.
df = rdd1.map(lambda fields: Row(**{name: fields[idx] for name, idx in selected_fields})).toDF()

# Numeric fields are cast to float, as before.
for name in numeric_columns:
    df = df.withColumn(name, df[name].cast(FloatType()))

# Datetime strings cannot be parsed as floats (the cast yields null), so these
# columns are cast to timestamps instead.
for name in datetime_columns:
    df = df.withColumn(name, df[name].cast(TimestampType()))

