In [1]:
from pyspark.sql import SparkSession

# Local Spark session: "local[8]" master and a 16g driver-memory setting.
n_local_threads = 8
driver_memory_gb = 16

spark = (
    SparkSession.builder
    .master("local[{}]".format(n_local_threads))
    .config("spark.driver.memory", "{}g".format(driver_memory_gb))
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

In [2]:
from pyspark.sql.types import *


# Location of the raw rides CSV (absolute path on the author's machine).
data_path = '/home/navid/data/rides_2019-01-01_2019-07-01.csv'

# Explicit schema for the CSV: four float coordinates, the creation
# timestamp and the price. Every column is nullable.
ride_schema = (
    StructType()
    .add("originLat", FloatType(), True)
    .add("originLon", FloatType(), True)
    .add("destinationLat", FloatType(), True)
    .add("destinationLon", FloatType(), True)
    .add("createdAt", TimestampType(), True)
    .add("price", FloatType(), True)
)

# The first line of the file is treated as a header row.
ride_df_raw = spark.read.csv(data_path, schema=ride_schema, header=True)


In [3]:
# Filter the raw rides by creation time and by pickup location.

# Time window: start_date inclusive, end_date exclusive.
# NOTE(review): the input file is named rides_2019-01-01_2019-07-01.csv while
# this window lies in 2018 -- confirm the file really covers these dates,
# otherwise the filtered frame will be empty.
start_date = '2018-09-01'
end_date = '2018-11-01'

# Number of grid cells along the latitude / longitude axis.
lat_parts = 15
lon_parts = 24

# Bounding box of accepted pickup coordinates (both ends inclusive).
min_lat, max_lat = 35.64246, 35.809052
min_lon, max_lon = 51.168630, 51.56770

created_at = ride_df_raw["createdAt"]
origin_lat = ride_df_raw["originLat"]
origin_lon = ride_df_raw["originLon"]

in_time_window = (created_at >= start_date) & (created_at < end_date)
in_bounding_box = (
    origin_lat.between(min_lat, max_lat)
    & origin_lon.between(min_lon, max_lon)
)

ride_df_filtered = ride_df_raw.filter(in_time_window & in_bounding_box)

# Mark the filtered frame for caching; later cells reuse it.
ride_df_filtered.persist()

DataFrame[originLat: float, originLon: float, destinationLat: float, destinationLon: float, createdAt: timestamp, price: float]

In [4]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta

def truncate(dt, interval_minutes=5):
    """Floor a datetime to the start of its ``interval_minutes``-wide bucket.

    The minute component is rounded down to the nearest multiple of
    ``interval_minutes`` within the hour; seconds and microseconds are zeroed.

    :param dt: ``datetime`` to floor, or ``None``.
    :param interval_minutes: bucket width in minutes; must be a positive
        divisor of 60 so that buckets line up with hour boundaries.
        Defaults to 5.
    :return: the floored ``datetime``, or ``None`` when ``dt`` is ``None``.
    :raises ValueError: if ``interval_minutes`` is not a positive divisor of 60.
    """
    # The function is wrapped as a Spark UDF over a nullable timestamp column,
    # so a null value must pass through instead of raising AttributeError.
    if dt is None:
        return None

    if interval_minutes <= 0 or 60 % interval_minutes != 0:
        raise ValueError(
            "interval_minutes must be a positive divisor of 60, got {!r}".format(
                interval_minutes
            )
        )

    return dt - timedelta(
        minutes=dt.minute % interval_minutes,
        seconds=dt.second,
        microseconds=dt.microsecond,
    )

# Wrap the Python bucketing function as a Spark UDF returning a timestamp.
trunc_udf = F.udf(truncate, TimestampType())

# timeId: start of the time bucket the ride was created in.
ride_df = ride_df_filtered.withColumn('timeId', trunc_udf(ride_df_filtered['createdAt']))

# rowId / colId: position of the pickup inside the bounding box, scaled to
# grid units and cast to an integer cell index.
row_idx = ((ride_df['originLat'] - min_lat) / (max_lat - min_lat) * lat_parts).cast(IntegerType())
col_idx = ((ride_df['originLon'] - min_lon) / (max_lon - min_lon) * lon_parts).cast(IntegerType())

# The bounding-box filter is inclusive at max_lat / max_lon, so a pickup that
# sits exactly on the upper edge would get index lat_parts / lon_parts, one
# past the last valid cell. Clamp such values into the last cell; null
# indices fall through `otherwise` unchanged.
ride_df = ride_df.withColumn(
    'rowId', F.when(row_idx >= lat_parts, lat_parts - 1).otherwise(row_idx)
)
ride_df = ride_df.withColumn(
    'colId', F.when(col_idx >= lon_parts, lon_parts - 1).otherwise(col_idx)
)

# Keep only the three derived columns.
ride_df = ride_df.drop('originLat', 'originLon', 'createdAt', 'price', 'destinationLat', 'destinationLon')

ride_df.persist()

DataFrame[timeId: timestamp, rowId: int, colId: int]

In [5]:
from pyspark.sql.functions import col

def generate_series(start, stop, interval):
    """Build a single-column DataFrame of evenly spaced timestamps.

    :param start: lower bound, inclusive (anything Spark can cast to timestamp)
    :param stop: upper bound, exclusive (anything Spark can cast to timestamp)
    :param interval: step between consecutive timestamps, in seconds
    :return: DataFrame with one timestamp column named ``timeId``
    """
    # Let Spark convert both bounds to timestamps and then to long values,
    # so the conversion follows the session's own casting rules.
    bounds_df = spark.createDataFrame([(start, stop)], ("start", "stop"))
    epoch_columns = [
        col(name).cast("timestamp").cast("long") for name in ("start", "stop")
    ]
    start_epoch, stop_epoch = bounds_df.select(epoch_columns).first()

    # One row per step, cast back from the long value to a timestamp.
    steps_df = spark.range(start_epoch, stop_epoch, interval)
    return steps_df.select(col("id").cast("timestamp").alias("timeId"))

# Every bucket start in [start_date, end_date): the 300-second step matches
# the 5-minute buckets that truncate() assigns to rides.
complete_date_range = generate_series(start_date, end_date, 300)

In [None]:
# Print a preview of the derived (timeId, rowId, colId) rows.
ride_df.show()

In [None]:
# Buckets in which no ride was created are absent from ride_df, so the number
# of observed timeIds can be smaller than the size of the complete range.
distinct_time_ids_df = ride_df.select('timeId').distinct()
timeIds = distinct_time_ids_df.collect()

n_observed = len(timeIds)
n_expected = complete_date_range.count()
print('timeIds: {}, complete_timeIds: {}'.format(n_observed, n_expected))

In [None]:

# Pull every ride row to the driver as a list of Row objects, ordered by
# timeId. The accumulation loop in the next cell relies on this ordering to
# detect bucket changes. Note that this materialises the whole filtered
# dataset in driver memory.
rides_list = ride_df.sort('timeId').collect()


In [None]:
import numpy as np


# Build one (lat_parts x lon_parts) request-count grid per time bucket.
# rides_list is sorted by timeId, so a change in timeId marks the end of the
# current bucket.
# NOTE: buckets without any ride produce no grid at all, so consecutive
# entries of ride_windows are not guaranteed to be one bucket apart.
last_time_id = None
reqs = np.zeros((lat_parts, lon_parts))
ride_windows = []

for idx, ride in enumerate(rides_list):

    if last_time_id != ride.timeId:
        # A new bucket starts: store the finished grid (if any) and reset.
        if last_time_id is not None:
            ride_windows.append(reqs)
        reqs = np.zeros((lat_parts, lon_parts))
        last_time_id = ride.timeId

    reqs[ride.rowId, ride.colId] += 1

    if idx % 100000 == 0:
        print("{}/{} data processed".format(idx, len(rides_list)))

# The loop only stores a grid when the *next* bucket begins, so the grid of
# the last bucket would otherwise be dropped. Flush it here (skipped when
# rides_list is empty and no bucket was ever opened).
if last_time_id is not None:
    ride_windows.append(reqs)
    
    




    

In [None]:
# Stack the per-bucket grids into one array of shape
# (n_windows, lat_parts, lon_parts). The explicit reshape keeps the result
# three-dimensional even when no window was produced (np.array([]) alone
# would have shape (0,)), and leaves a non-empty stack unchanged.
ride_windows = np.array(ride_windows).reshape(-1, lat_parts, lon_parts)

print("shape of result dataset: {}".format(ride_windows.shape))

In [None]:
import pickle

# Persist the stacked grids with pickle, next to the notebook.
output_path = './conv_lstm_sequence_data3.pc'
with open(output_path, 'wb') as sink:
    pickle.dump(ride_windows, sink)