In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[{}]".format(8)) \
    .config("spark.driver.memory", "{}g".format(8)) \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
from pyspark.sql.types import *


data_path = '/home/navid/data/rides_2019-01-01_2019-07-01.csv'

ride_schema = StructType([
    StructField("createdAt", TimestampType(), True),
    StructField("latitude", FloatType(), True),
    StructField("longitude", FloatType(), True)])

    
ride_df_raw = spark.read.csv(data_path, schema=ride_schema, header=True)


In [3]:
# ride_df_raw.show()

In [4]:
#filter data

start_date = '2019-04-01'
end_date = '2019-07-01'

lat_parts = 20
lon_parts = 30

min_lat, min_lon = 35.64246, 51.168630
max_lat, max_lon = 35.809052, 51.56770

ride_df_filtered = ride_df_raw.filter(ride_df_raw["createdAt"] >= start_date).filter(ride_df_raw["createdAt"] < end_date)
ride_df_filtered = ride_df_filtered.filter(ride_df_filtered['latitude'] >= min_lat)\
                                   .filter(ride_df_filtered['longitude'] >= min_lon)\
                                   .filter(ride_df_filtered['latitude'] <= max_lat)\
                                   .filter(ride_df_filtered['longitude'] <= max_lon)
    
ride_df_filtered.persist()

DataFrame[createdAt: timestamp, latitude: float, longitude: float]

In [5]:
# ride_df_filtered.count()

In [6]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta

def truncate(dt):
    return dt - timedelta(
        minutes=dt.minute % 5,
        seconds=dt.second,
        microseconds=dt.microsecond
    )

trunc_udf = F.udf(truncate, TimestampType())
ride_df = ride_df_filtered.withColumn('timeId', trunc_udf(ride_df_filtered['createdAt']))
ride_df = ride_df.withColumn('rowId', ((ride_df['latitude']-min_lat)/(max_lat-min_lat)*lat_parts).cast(IntegerType()))
ride_df = ride_df.withColumn('colId', ((ride_df['longitude']-min_lon)/(max_lon-min_lon)*lon_parts).cast(IntegerType()))

# ride_df = ride_df.drop('originLat', 'originLon', 'createdAt', 'price', 'destinationLat', 'destinationLon')

ride_df.persist()

DataFrame[createdAt: timestamp, latitude: float, longitude: float, timeId: timestamp, rowId: int, colId: int]

In [7]:
from pyspark.sql.functions import col

def generate_series(start, stop, interval):
    """
    :param start  - lower bound, inclusive
    :param stop   - upper bound, exclusive
    :interval int - increment interval in seconds
    """
    start, stop = spark.createDataFrame(
        [(start, stop)], ("start", "stop")
    ).select(
        [col(c).cast("timestamp").cast("long") for c in ("start", "stop")
    ]).first()

    return spark.range(start, stop, interval).select(
        col("id").cast("timestamp").alias("timeId")
    )

complete_date_range = generate_series(start_date, end_date, 300)

In [8]:
# ride_df.show()

In [9]:
## NOTE THAT some time intervals may not have data
"""
timeIds = ride_df.select('timeId').distinct().collect()
print('timeIds: {}, complete_timeIds: {}'.format(len(timeIds), complete_date_range.count()))    
"""

"\ntimeIds = ride_df.select('timeId').distinct().collect()\nprint('timeIds: {}, complete_timeIds: {}'.format(len(timeIds), complete_date_range.count()))    \n"

In [10]:

rides_list = ride_df.sort('timeId').collect()


In [11]:
import numpy as np


last_time_id = None
reqs = np.zeros((lat_parts, lon_parts))
ride_windows = []

for idx, ride in enumerate(rides_list):
    
    if last_time_id != ride.timeId:
        if last_time_id is not None:
            ride_windows.append(reqs)
        reqs = np.zeros((lat_parts, lon_parts))
        last_time_id = ride.timeId
    
    reqs[ride.rowId][ride.colId] += 1
    
    if idx % 100000 == 0:
        print("{}/{} data processed".format(idx, len(rides_list)))
    
    




    

0/17489093 data processed
100000/17489093 data processed
200000/17489093 data processed
300000/17489093 data processed
400000/17489093 data processed
500000/17489093 data processed
600000/17489093 data processed
700000/17489093 data processed
800000/17489093 data processed
900000/17489093 data processed
1000000/17489093 data processed
1100000/17489093 data processed
1200000/17489093 data processed
1300000/17489093 data processed
1400000/17489093 data processed
1500000/17489093 data processed
1600000/17489093 data processed
1700000/17489093 data processed
1800000/17489093 data processed
1900000/17489093 data processed
2000000/17489093 data processed
2100000/17489093 data processed
2200000/17489093 data processed
2300000/17489093 data processed
2400000/17489093 data processed
2500000/17489093 data processed
2600000/17489093 data processed
2700000/17489093 data processed
2800000/17489093 data processed
2900000/17489093 data processed
3000000/17489093 data processed
3100000/17489093 data p

In [12]:
ride_windows = np.array(ride_windows)

print("shape of reult dataset: {}".format(ride_windows.shape))

shape of reult dataset: (26129, 20, 30)


In [13]:
import pickle

with open('./conv_lstm_sequence_data6.pc', 'wb') as f:
    pickle.dump(ride_windows, f)