# Big Data Management Project 2:
## DESB GRAND CHALLENGE 2015

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, regexp_extract, col, count, udf, window, date_format, lit, expr, current_timestamp, median, coalesce
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType, FloatType
from pyspark.sql.streaming import DataStreamWriter

from IPython.display import display, Markdown
import pandas as pd
import math
import time
from datetime import datetime

In [3]:
spark = SparkSession.builder \
    .appName('BDM_Project2') \
    .getOrCreate()

### Creating a 1GB dataset from the original

Rename the resulting file to sorted_data_sample and move original file elsewhere

In [14]:
#df = spark.read.csv("input/sorted_data.csv", header=True, inferSchema=True)

#df_5m = df.limit(5000000)
#df_5m.write.csv("input", header=True, mode="overwrite")

### Query 0
Data Cleansing and Setup

In [3]:
start_time = time.time()  

# Defining the schema for faster reading of data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True)
])

# Creating a single dataframe of all the trip_data files
taxi_df_og = (
    spark.readStream
    .option("maxFilesPerTrigger", 1)
    .option("header", False)
    .schema(schema)
    .csv("input")
)

# Removing the trips with 0 passengers
# Transforming the data 
taxi_df = taxi_df_og.filter(
    (regexp_extract(col("medallion"), r"^[a-fA-F0-9]{32}$", 0) != "") &
    (regexp_extract(col("hack_license"), r"^[a-fA-F0-9]{32}$", 0) != "") &
    (col("pickup_datetime").isNotNull()) &
    (col("dropoff_datetime").isNotNull()) &               
    (col("trip_distance") > 0) &                    
    (col("fare_amount") > 0) &
    (col("tip_amount") >= 0)
)

# Drop remaining null values 
taxi_df = taxi_df.select("*") \
    .dropna()

# Start the streaming query with trigger(once=True) to process data once and stop
query = (
    taxi_df.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "output/preprocessed_data")
    .option("checkpointLocation", "output/checkpoint")
    .trigger(once=True)  
    .start()
)

query.awaitTermination()

print("Execution time", time.time() - start_time)

Execution time 3.4203860759735107


### Grid Cells for Query 1

In [4]:
start_lat = 41.474937
start_long = -74.913585
cell_size = 0.044 # 500m to degrees for latitude (and longitude)

def grid_cells_q1(point_long, point_lat):

    # calculating the point values for latitude and longitude
    long = math.floor((point_long - start_long) / cell_size) + 1
    lat = math.floor((start_lat - point_lat) / cell_size) + 1
    
    # Ensure the cell is within valid grid bounds (300x300)
    if not (1 <= long <= 300 and 1 <= lat <= 300):
        return None 
    
    return float(f"{long}.{lat}") # Convert to X.X format

get_grid = udf(grid_cells_q1, FloatType())

### Query 1 part 1
Frequent Routes

In [5]:
# Reading in the stream
taxi_df = (spark.readStream
  .option("maxFilesPerTrigger", 1)
  .schema(schema)
  .parquet("output/preprocessed_data")
)

# Query 1.1: create a query to find the top most frequent routes during the last 30 minutes (Show only the 10 most frequent routes)
# The output query results must be: start_cell, end_cell, Number of Rides
# Aid from ChatGPT was used for the following code

# Adding start_cell and end_cell columns
taxi_df_q1 = taxi_df.withColumn("start_cell", get_grid(taxi_df.pickup_longitude, taxi_df.pickup_latitude))\
    .withColumn("end_cell", get_grid(taxi_df.dropoff_longitude, taxi_df.dropoff_latitude))\
    .filter(
        col("start_cell").isNotNull() & col("end_cell").isNotNull()  # Filter out trips outside of the grid
    )

# Defining a 30-minute tumbling window and aggregating by start_cell and end_cell
top_routes_df = (
    taxi_df_q1
    .withWatermark("dropoff_datetime", "30 minutes") 
    .groupBy(window(col("dropoff_datetime"), "30 minutes"), col("start_cell"), col("end_cell"))
    .agg(count("*").alias("Number of Rides"))
)

# Function for processing the data in batches
def process_batch(df, epoch_id):
    
    df = df.orderBy(col("window.start").desc(), col("Number of Rides").desc())

    # Collect unique time windows
    windows = df.select("window").distinct().orderBy("window.start").collect()
    
    # Finding the top routes for each "last 30 minutes"
    for window_row in windows:
        
        time_window = window_row["window"]
        start_time = time_window.start.strftime("%Y-%m-%d %H:%M:%S")
        end_time = time_window.end.strftime("%Y-%m-%d %H:%M:%S")

        display(Markdown(f"Time Window: {start_time} to {end_time}"))
        # Filter the top 10 rides for the given window
        top_routes = (
            df.filter(col("window.start") == time_window.start)
            .select("start_cell", "end_cell", "Number of Rides")
            .orderBy(col("Number of Rides").desc())
            .limit(10)
            .toPandas()  # Using to Pandas DataFrame to display the results in the notebook
        )

        display(top_routes)

# Streaming query
query: DataStreamWriter = (
    top_routes_df
    .writeStream
    .outputMode("update")
    .foreachBatch(process_batch)
    .start()
)

query.awaitTermination()

Time Window: 2013-01-01 00:00:00 to 2013-01-01 00:30:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,1138
1,22.16,22.16,730
2,21.17,21.17,584
3,21.17,22.17,433
4,22.17,22.16,422
5,22.17,21.17,355
6,22.16,22.17,294
7,21.17,21.18,212
8,21.18,21.18,169
9,21.18,21.17,166


Time Window: 2013-01-01 00:30:00 to 2013-01-01 01:00:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2280
1,22.16,22.16,1338
2,22.17,22.16,1019
3,21.17,21.17,1002
4,21.17,22.17,941
5,22.17,21.17,901
6,22.16,22.17,872
7,21.17,21.18,445
8,22.17,21.18,441
9,21.18,22.17,388


Time Window: 2013-01-01 01:00:00 to 2013-01-01 01:30:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2244
1,22.16,22.16,1396
2,22.17,22.16,1096
3,21.17,22.17,966
4,22.16,22.17,945
5,22.17,21.17,924
6,21.17,21.17,838
7,21.18,22.17,449
8,21.17,22.16,442
9,22.17,21.18,432


Time Window: 2013-01-01 01:30:00 to 2013-01-01 02:00:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2129
1,22.16,22.16,1397
2,22.17,22.16,1074
3,21.17,22.17,942
4,22.16,22.17,884
5,22.17,21.17,777
6,21.17,21.17,611
7,21.18,22.17,433
8,21.17,22.16,360
9,22.17,21.18,359


Time Window: 2013-01-01 02:00:00 to 2013-01-01 02:30:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2069
1,22.16,22.16,1349
2,22.17,22.16,1015
3,22.16,22.17,980
4,21.17,22.17,734
5,22.17,21.17,722
6,21.17,21.17,563
7,21.18,22.17,417
8,22.16,21.17,369
9,22.17,21.18,357


Time Window: 2013-01-01 02:30:00 to 2013-01-01 03:00:00

Unnamed: 0,start_cell,end_cell,Number of Rides
0,22.17,22.17,2068
1,22.16,22.16,1015
2,22.17,22.16,970
3,22.16,22.17,844
4,22.17,21.17,753
5,21.17,22.17,730
6,21.17,21.17,603
7,22.17,21.18,377
8,21.17,22.16,360
9,21.18,22.17,334


Time Window: 2013-01-01 03:00:00 to 2013-01-01 03:30:00

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

ERROR:py4j.clientserver:There was an exception while executing the Python Proxy on the Python Side.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 120, in call
    raise e
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 117, in call
    self.func(DataFrame(jdf, wrapped_session_jdf), batch_id)
  File "/tmp/ipykernel_15579/645468733.py", line 49, in process_batch
    .toPandas()  # Using to Pandas DataFrame to display the results in the notebook
     ^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/sql/pandas/conversion.py", line 202, in toPandas
    rows = self.collect()
           ^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/sql/dataframe.py", line 1263, in collect
    sock_info = self._

In [6]:
query.stop()

### Query 1 part 2

In [7]:
# Query 1.2: update the query with the delay attribute
# Aid from ChatGPT was used for the following code

previous_top_routes = None

def process_batch2(df, epoch_id):

    global previous_top_routes
    
    batch_start_time = datetime.now()
    
    df = df.orderBy(col("window.start").desc(), col("Number of Rides").desc())
    
    # Collect unique time windows
    windows = df.select("window").distinct().orderBy("window.start").collect()
    
    for window_row in windows:
        time_window = window_row["window"]
        start_time = time_window.start.strftime("%Y-%m-%d %H:%M:%S")
        end_time = time_window.end.strftime("%Y-%m-%d %H:%M:%S")
        
        # Measure delay
        delay = (datetime.now() - batch_start_time).total_seconds()

        
        # Retrieve top 10 routes
        top_routes = (
            df.filter(col("window.start") == time_window.start)
            .select("start_cell", "end_cell")
            .orderBy(col("Number of Rides").desc())
            .limit(10)
            .collect()
        )
        
        # Flatten the result into a list for comparing
        current_top_routes = [(r["start_cell"], r["end_cell"]) for r in top_routes]
        
        # Compare with the previous top routes, only update if there have been changes
        if current_top_routes != previous_top_routes:

            previous_top_routes = current_top_routes
            
            # Flatten results into a single row for display
            result = [start_time, end_time]
            
            for i in range(10):
                if i < len(top_routes):
                    result.append(top_routes[i]["start_cell"])
                    result.append(top_routes[i]["end_cell"])
                else:
                    # Filling with NULLs if fewer routes exist
                    result.append(None)
                    result.append(None)
            
            result.append(delay)
            
            # Convert result to a Pandas DataFrame for display
            result_df = pd.DataFrame([result], columns=[
                "pickup_datetime", "dropoff_datetime",
                "start_cell_id_1", "end_cell_id_1", "start_cell_id_2", "end_cell_id_2",
                "start_cell_id_3", "end_cell_id_3", "start_cell_id_4", "end_cell_id_4",
                "start_cell_id_5", "end_cell_id_5", "start_cell_id_6", "end_cell_id_6",
                "start_cell_id_7", "end_cell_id_7", "start_cell_id_8", "end_cell_id_8",
                "start_cell_id_9", "end_cell_id_9", "start_cell_id_10", "end_cell_id_10",
                "delay"
            ])
            
            # Display the result
            display(result_df)

# Streaming query
query: DataStreamWriter = (
    top_routes_df
    .writeStream
    .outputMode("update")
    .foreachBatch(process_batch2)
    .start()
)

query.awaitTermination()

Unnamed: 0,pickup_datetime,dropoff_datetime,start_cell_id_1,end_cell_id_1,start_cell_id_2,end_cell_id_2,start_cell_id_3,end_cell_id_3,start_cell_id_4,end_cell_id_4,...,end_cell_id_6,start_cell_id_7,end_cell_id_7,start_cell_id_8,end_cell_id_8,start_cell_id_9,end_cell_id_9,start_cell_id_10,end_cell_id_10,delay
0,2013-01-01 00:00:00,2013-01-01 00:30:00,22.17,22.17,22.16,22.16,21.17,21.17,21.17,22.17,...,21.17,22.16,22.17,21.17,21.18,21.18,21.18,21.18,21.17,9.44421


Unnamed: 0,pickup_datetime,dropoff_datetime,start_cell_id_1,end_cell_id_1,start_cell_id_2,end_cell_id_2,start_cell_id_3,end_cell_id_3,start_cell_id_4,end_cell_id_4,...,end_cell_id_6,start_cell_id_7,end_cell_id_7,start_cell_id_8,end_cell_id_8,start_cell_id_9,end_cell_id_9,start_cell_id_10,end_cell_id_10,delay
0,2013-01-01 00:30:00,2013-01-01 01:00:00,22.17,22.17,22.16,22.16,22.17,22.16,21.17,21.17,...,21.17,22.16,22.17,21.17,21.18,22.17,21.18,21.18,22.17,12.708077


Unnamed: 0,pickup_datetime,dropoff_datetime,start_cell_id_1,end_cell_id_1,start_cell_id_2,end_cell_id_2,start_cell_id_3,end_cell_id_3,start_cell_id_4,end_cell_id_4,...,end_cell_id_6,start_cell_id_7,end_cell_id_7,start_cell_id_8,end_cell_id_8,start_cell_id_9,end_cell_id_9,start_cell_id_10,end_cell_id_10,delay
0,2013-01-01 01:00:00,2013-01-01 01:30:00,22.17,22.17,22.16,22.16,22.17,22.16,21.17,22.17,...,21.17,21.17,21.17,21.18,22.17,21.17,22.16,22.17,21.18,15.792805


Unnamed: 0,pickup_datetime,dropoff_datetime,start_cell_id_1,end_cell_id_1,start_cell_id_2,end_cell_id_2,start_cell_id_3,end_cell_id_3,start_cell_id_4,end_cell_id_4,...,end_cell_id_6,start_cell_id_7,end_cell_id_7,start_cell_id_8,end_cell_id_8,start_cell_id_9,end_cell_id_9,start_cell_id_10,end_cell_id_10,delay
0,2013-01-01 02:00:00,2013-01-01 02:30:00,22.17,22.17,22.16,22.16,22.17,22.16,22.16,22.17,...,21.17,21.17,21.17,21.18,22.17,22.16,21.17,22.17,21.18,21.995907


Unnamed: 0,pickup_datetime,dropoff_datetime,start_cell_id_1,end_cell_id_1,start_cell_id_2,end_cell_id_2,start_cell_id_3,end_cell_id_3,start_cell_id_4,end_cell_id_4,...,end_cell_id_6,start_cell_id_7,end_cell_id_7,start_cell_id_8,end_cell_id_8,start_cell_id_9,end_cell_id_9,start_cell_id_10,end_cell_id_10,delay
0,2013-01-01 02:30:00,2013-01-01 03:00:00,22.17,22.17,22.16,22.16,22.17,22.16,22.16,22.17,...,22.17,21.17,21.17,22.17,21.18,21.17,22.16,21.18,22.17,25.578694


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

Unnamed: 0,pickup_datetime,dropoff_datetime,start_cell_id_1,end_cell_id_1,start_cell_id_2,end_cell_id_2,start_cell_id_3,end_cell_id_3,start_cell_id_4,end_cell_id_4,...,end_cell_id_6,start_cell_id_7,end_cell_id_7,start_cell_id_8,end_cell_id_8,start_cell_id_9,end_cell_id_9,start_cell_id_10,end_cell_id_10,delay
0,2013-01-01 03:00:00,2013-01-01 03:30:00,22.17,22.17,22.17,22.16,22.16,22.16,21.17,22.17,...,22.17,21.17,21.17,22.17,21.18,21.18,22.17,22.18,22.17,28.765715


In [8]:
query.stop()

### Grid Cells for Query 2

In [4]:
start_lat = 41.474937
start_long = -74.913585
cell_size = 0.022 # 250m to degrees for latitude (and longitude)

def grid_cells_q2(point_long, point_lat):
    
    long = math.floor((point_long - start_long) / cell_size) + 1
    lat = math.floor((start_lat - point_lat) / cell_size) + 1
    
    # Ensure the cell is within valid grid bounds (600x600)
    if not (1 <= long <= 600 and 1 <= lat <= 600):
        return None 
    
    return float(f"{long}.{lat}") # Convert to X.X format

get_grid2 = udf(grid_cells_q2, FloatType())

### Query 2 part 1
Profitable Areas

In [29]:
# Reading in the stream
taxi_df = (spark.readStream
  .option("maxFilesPerTrigger", 1)
  .schema(schema)
  .parquet("output/preprocessed_data")
)

# Adding start_cell and end_cell columns
taxi_df_q2 = taxi_df.withColumn("start_cell", get_grid2(taxi_df.pickup_longitude, taxi_df.pickup_latitude))\
    .withColumn("end_cell", get_grid2(taxi_df.dropoff_longitude, taxi_df.dropoff_latitude))\
    .filter(
        col("start_cell").isNotNull() & col("end_cell").isNotNull()  # Filter out trips outside of the grid
    )

# Step 1: Calculate median profit for trips starting in the cell (15-minute window)
taxi_profit_df = (
    taxi_df_q2
    .withWatermark("pickup_datetime", "15 minutes")  # Watermark for pickup_datetime (for trip start time)
    .groupBy(window(col("pickup_datetime"), "15 minutes"), col("start_cell"))
    .agg(median(col("fare_amount") + col("tip_amount")).alias("median_profit"))
)

# Step 2: Calculate the number of empty taxis for each 30-minute window
empty_taxis_df = (
    taxi_df_q2
    .withWatermark("dropoff_datetime", "30 minutes")  # Watermark for dropoff_datetime (for empty taxis)
    .groupBy(window(col("dropoff_datetime"), "30 minutes"), col("end_cell"))
    .agg(count("*").alias("empty_taxis"))
)

# Step 3: Join the two DataFrames to calculate profitability, with appropriate watermarking
profitability_df = (
    taxi_profit_df.alias("profit")
    .join(
        empty_taxis_df.alias("empty"),
        (
            col("profit.window.end") == col("empty.window.start")) & 
            (col("profit.start_cell") == col("empty.end_cell")),
        "inner"
    )
    .select(
        col("profit.window.start").alias("pickup_datetime"),
        col("empty.window.end").alias("dropoff_datetime"),
        col("profit.start_cell").alias("profitable_cell_id"),
        col("empty.empty_taxis").alias("empty_taxis_in_cell"),
        col("profit.median_profit").alias("median_profit_in_cell"),
        (col("profit.median_profit") / col("empty.empty_taxis")).alias("profitability_of_cell")
    )
)


# Function to process the data in batches
def process_batch(df, epoch_id):
    pandas_df = df.toPandas().nlargest(10, "profitability_of_cell")
    display(pandas_df)

# Streaming query (using Append mode)
query = (
    profitability_df
    .writeStream
    .outputMode("append")  # Use append mode instead of update
    .foreachBatch(process_batch)
    .start()
)

query.awaitTermination()

Unnamed: 0,pickup_datetime,dropoff_datetime,profitable_cell_id,empty_taxis_in_cell,median_profit_in_cell,profitability_of_cell


Unnamed: 0,pickup_datetime,dropoff_datetime,profitable_cell_id,empty_taxis_in_cell,median_profit_in_cell,profitability_of_cell
964,2013-01-01 03:15:00,2013-01-01 04:00:00,30.4,1,145.0,145.0
4004,2013-01-01 03:45:00,2013-01-01 04:30:00,34.349998,1,120.0,120.0
3367,2013-01-01 05:45:00,2013-01-01 06:30:00,54.389999,1,110.0,110.0
2527,2013-01-02 11:15:00,2013-01-02 12:00:00,51.330002,1,97.2,97.2
2297,2013-01-02 01:15:00,2013-01-02 02:00:00,41.330002,1,80.0,80.0
2174,2013-01-01 01:15:00,2013-01-01 02:00:00,35.299999,1,70.0,70.0
2030,2013-01-01 04:45:00,2013-01-01 05:30:00,43.279999,1,68.0,68.0
3088,2013-01-02 08:15:00,2013-01-02 09:00:00,50.32,3,197.25,65.75
1202,2013-01-02 05:15:00,2013-01-02 06:00:00,50.349998,1,65.0,65.0
1435,2013-01-02 04:15:00,2013-01-02 05:00:00,37.310001,1,65.0,65.0


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

Unnamed: 0,pickup_datetime,dropoff_datetime,profitable_cell_id,empty_taxis_in_cell,median_profit_in_cell,profitability_of_cell
1604,2013-01-03 17:15:00,2013-01-03 18:00:00,39.32,1,135.5,135.5
3325,2013-01-03 18:15:00,2013-01-03 19:00:00,52.369999,1,120.0,120.0
760,2013-01-02 21:15:00,2013-01-02 22:00:00,47.32,1,83.01,83.01
3133,2013-01-03 21:45:00,2013-01-03 22:30:00,46.310001,1,72.5,72.5
2765,2013-01-03 10:15:00,2013-01-03 11:00:00,50.279999,1,68.5,68.5
2498,2013-01-02 22:15:00,2013-01-02 23:00:00,40.330002,1,68.4,68.4
848,2013-01-03 14:15:00,2013-01-03 15:00:00,51.360001,1,67.75,67.75
7,2013-01-04 07:15:00,2013-01-04 08:00:00,50.360001,1,66.32,66.32
57,2013-01-03 07:15:00,2013-01-03 08:00:00,41.330002,1,63.46,63.46
965,2013-01-03 11:15:00,2013-01-03 12:00:00,51.369999,1,63.46,63.46


In [30]:
query.stop() 