In [14]:
import datetime as dt
import numpy as np 
import pandas as pd
import time
import argparse
from pathlib import Path
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

In [15]:
sc = SparkContext()
spark = SparkSession(sc)

In [16]:
input_file = './data/2020-02.csv'
output_path = 'output/'

In [17]:
month_rides = sc.textFile(input_file, use_unicode=True).cache()
print("Number of partitions: ", month_rides.getNumPartitions())

Number of partitions:  2


In [95]:
def get_date_time(date, time):
    #   Format:
    #       28/02/2020,23:58:15,
    #       29/02/2020,0:01:55
    from datetime import datetime    
    datetime_object = datetime.strptime(date+" "+time, '%d/%m/%Y %H:%M:%S')
    return datetime_object

def parse_ride_records(part_id, list_of_records):
    # Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
    # M,44,4357,442,1/2/2020,0:00:38,116,1/2/2020,0:35:17
    if part_id == 0: 
        next(list_of_records) # skipping the header line
    import csv
    reader = csv.reader(list_of_records)
    for row in reader:
        gender = row[0]
        age = int(row[1])
        bike_id = int(row[2])
        station_start = row[3]
        datetime_start = get_date_time(row[4], row[5])
        datetime_end = get_date_time(row[7], row[8])
        station_end = row[6]

        yield (bike_id, gender, age, station_start, station_end, datetime_start, datetime_end)

1. Top 5 stations by most starts
2. Most popular trips based on start station and end station
3. Rider performance based on avg. bike ride time by gender
4. What is the busiest bike in the CDMX in February 2020? 
   a. How many times was it used? 
   b. How many secs was it in use?

In [115]:
def get_rides_rdd(sc, input_file):
    print("Reading input file:", input_file)

    rides_rdd = sc.textFile(input_file, use_unicode=True) \
        .mapPartitionsWithIndex(parse_ride_records).cache()
    
    print("Number of partitions: ", rides_rdd.getNumPartitions())
    return rides_rdd

def get_top_start_stations(num, rides_rdd):
    results = rides_rdd.map(lambda x: (x[3], x[6]-x[5] ) ) \
        .filter(lambda x: x[1].total_seconds() <= 60 * 60 * 2) \
        .map(lambda x: ( x[0], 1 ) ) \
        .reduceByKey(lambda x,y: x+y) \
        .map(lambda x: (x[1], x[0])) \
        .top(num, key=lambda x: x) 
    return results

def get_top_routes(num, rides_rdd): 
    results = rides_rdd.map(lambda x: ( (x[3],x[4]), x[6]-x[5] ) ) \
        .filter(lambda x: x[1].total_seconds() <= 60 * 60 * 2) \
        .map(lambda x: ( x[0], 1 ) ) \
        .reduceByKey(lambda x,y: x + y) \
        .map(lambda x: (x[1], x[0])) \
        .top(num, key=lambda x: x)   
    return results

def get_performance_by_gender(rides_rdd):
    results = rides_rdd.map(lambda x: (x[1], x[6]-x[5] ) ) \
        .filter(lambda x: x[1].total_seconds() <= 60 * 60 * 2) \
        .mapValues(lambda x: (x.total_seconds(), 1) ) \
        .reduceByKey(lambda x, y: ((x[0] + y[0]), x[1] + y[1]) ) \
        .mapValues(lambda x: (x[0] / x[1], x[1]) ) \
        .collect()
    return results   

def get_busy_bees(num, rides_rdd, by_count=False):
    results = rides_rdd.map(lambda x: (x[0], x[6]-x[5] ) ) \
        .filter(lambda x: x[1].total_seconds() <= 60 * 60 * 2) \
        .mapValues(lambda x: (1, x.total_seconds())) \
        .reduceByKey(lambda x, y: ((x[0] + y[0]), x[1] + y[1]) ) \
        .map(lambda x: (x[1], x[0])) 
    
    if by_count: #times bike was ridden
        return results.top(num, key=lambda x: x[0])
    else: #by total time ridden
        return results.top(num, key=lambda x: x[1])   

In [100]:
print("Getting RDD of all rides...", input_file)
rides_rdd = get_rides_rdd(sc, input_file)

Getting RDD of all rides... ./data/2020-02.csv
Reading input file: ./data/2020-02.csv
Number of partitions:  2


In [132]:
start = time.time()
top_stations = get_top_start_stations(5, rides_rdd)
print("Time Elapsed: ", time.time() - start)
print("Top Starting Stations:")
for entry in top_stations:
    print("Start Stations: {:03d}, Trips: {:03d}".format(int(entry[1]), entry[0]))

Time Elapsed:  3.940264940261841
Top Starting Stations:
Start Stations: 001, Trips: 6298
Start Stations: 027, Trips: 6201
Start Stations: 271, Trips: 5262
Start Stations: 064, Trips: 4825
Start Stations: 041, Trips: 4621


In [133]:
start = time.time()
top_routes = get_top_routes(5, rides_rdd)
print("Time Elapsed: ", time.time() - start)
print("Top Bike Routes:")
for entry in top_routes:
    print("From: {:03d}, To: {:03d}, Total Trips: {:03d}".format(int(entry[1][0]), int(entry[1][1]), entry[0]))

Time Elapsed:  5.044939994812012
Top Bike Routes:
From: 033, To: 033, Total Trips: 375
From: 018, To: 001, Total Trips: 319
From: 211, To: 217, Total Trips: 303
From: 449, To: 449, Total Trips: 301
From: 208, To: 206, Total Trips: 297


In [123]:
start = time.time()
gender_perf = get_performance_by_gender(rides_rdd)
print("Time Elapsed: ", time.time() - start)
print("Gender Performance: ")
for entry in gender_perf:
    print("Gender: {}, Count: {}, Avg Ride (Mins): {:.2f}".format(entry[0], entry[1][1], entry[1][0]/60 ))


Time Elapsed:  3.7220823764801025
Gender Performance: 
Gender: M, Count: 509782, Avg Ride (Mins): 13.62
Gender: F, Count: 174808, Avg Ride (Mins): 14.30


In [121]:
start = time.time()
most_used_bikes = get_busy_bees(3, rides_rdd)
print("Time Elapsed: ", time.time() - start)
print("Top Bike Routes By Total Minutes: ")
for entry in most_used_bikes: 
    print("ID: {}, Count: {}, Minutes: {:.2f}".format(entry[1], entry[0][0], entry[0][1]/60 ))

print()

start = time.time()
most_used_bikes = get_busy_bees(3, rides_rdd, True)
print("Time Elapsed: ", time.time() - start)
print("Top Bike Routes By Count: ")
for entry in most_used_bikes: 
    print("ID: {}, Count: {}, Minutes: {:.2f}".format(entry[1], entry[0][0], entry[0][1]/60 ))


Time Elapsed:  3.997042655944824
Top Bike Routes By Total Minutes: 
ID: 15339, Count: 105, Minutes: 2162.63
ID: 15338, Count: 13, Minutes: 316.37
ID: 15337, Count: 67, Minutes: 1409.97

Time Elapsed:  3.7950901985168457
Top Bike Routes By Count: 
ID: 10771, Count: 217, Minutes: 1523.15
ID: 10810, Count: 208, Minutes: 2785.77
ID: 7854, Count: 192, Minutes: 2633.07


In [118]:
print("Done") 

Done
