In [14]:
import datetime as dt
import numpy as np 
import pandas as pd
import time
import argparse
from pathlib import Path
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

# start = time.time()
# print("hello")
# end = time.time()
# print(end - start)

In [15]:
# Reuse the already-running context/session when this cell is executed again:
# constructing SparkContext() a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# builder.getOrCreate() attaches to the active SparkContext created above.
spark = SparkSession.builder.getOrCreate()

In [16]:
# CSV file with the ride records to analyse (path is relative to the
# directory the notebook kernel runs in).
input_file = './data/2020-02.csv'
# Directory intended for results.
# NOTE(review): no cell visible here reads output_path - confirm it is still needed.
output_path = 'output/'

In [17]:
# Raw, unparsed lines of the input file, marked for caching so that repeated
# actions on this RDD do not re-read the file.
month_rides = (
    sc.textFile(input_file, use_unicode=True)
    .cache()
)
print("Number of partitions: ", month_rides.getNumPartitions())

Number of partitions:  2


In [62]:
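# Input CSV layout: one header line, then one record per ride.
# Header:
#   Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
# Sample record (dates are day/month/year):
#   M,44,4357,442,1/2/2020,0:00:38,116,1/2/2020,0:35:17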

def get_date_time(date, time):
    """Combine a date string and a time string into one ``datetime``.

    Args:
        date: Day/month/year text, e.g. ``"28/02/2020"``.
        time: Hour:minute:second text, e.g. ``"23:58:15"`` or ``"0:01:55"``.

    Returns:
        The ``datetime.datetime`` parsed from ``"<date> <time>"`` with the
        format ``%d/%m/%Y %H:%M:%S``.

    Raises:
        ValueError: If the combined text does not match the format.
    """
    # Imported locally, as in the rest of this notebook's parsing helpers.
    from datetime import datetime

    stamp = " ".join((date, time))
    return datetime.strptime(stamp, '%d/%m/%Y %H:%M:%S')


def parse_ride_records(part_id, list_of_records):
    """Parse the raw CSV lines of one partition into ride tuples.

    Written to be passed to ``RDD.mapPartitionsWithIndex``.

    Args:
        part_id: Index of the partition being processed. Partition 0 is
            assumed to begin with the CSV header line, which is discarded.
        list_of_records: Iterator over the partition's raw text lines.

    Yields:
        ``(bike_id, gender, age, station_start, station_end,
        datetime_start, datetime_end)`` where ``bike_id`` and ``age`` are
        ints, the stations and gender are the raw strings from the file, and
        the two datetimes are built by ``get_date_time``.

    Raises:
        ValueError: If a numeric or date/time field cannot be parsed.
        IndexError: If a non-blank row has fewer than nine columns.
    """
    import csv

    if part_id == 0:
        # Skip the header line. The default keeps an empty partition from
        # raising StopIteration, which inside a generator would surface as
        # RuntimeError (PEP 479).
        next(list_of_records, None)

    for row in csv.reader(list_of_records):
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue

        gender = row[0]
        age = int(row[1])
        bike_id = int(row[2])
        station_start = row[3]
        station_end = row[6]
        datetime_start = get_date_time(row[4], row[5])
        datetime_end = get_date_time(row[7], row[8])

        yield (bike_id, gender, age, station_start, station_end, datetime_start, datetime_end)

def parse_start_stations(part_id, list_of_records):
    """Parse the raw CSV lines of one partition into (start station, duration).

    Args:
        part_id: Index of the partition being processed. Partition 0 is
            assumed to begin with the CSV header line, which is discarded.
        list_of_records: Iterator over the partition's raw text lines.

    Yields:
        ``(station_start, duration)`` where ``station_start`` is the raw
        string from column 3 and ``duration`` is the ``timedelta`` between
        the start (columns 4-5) and end (columns 7-8) timestamps as built by
        ``get_date_time``.

    Raises:
        ValueError: If a date/time field cannot be parsed.
        IndexError: If a non-blank row has fewer than nine columns.
    """
    import csv

    if part_id == 0:
        # Skip the header line. The default keeps an empty partition from
        # raising StopIteration, which inside a generator would surface as
        # RuntimeError (PEP 479).
        next(list_of_records, None)

    for row in csv.reader(list_of_records):
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue

        station_start = row[3]
        datetime_start = get_date_time(row[4], row[5])
        datetime_end = get_date_time(row[7], row[8])

        yield (station_start, datetime_end - datetime_start)


1. Top 5 stations by most starts
   a. In the morning (6-10 am)
   b. In the evening (4-8 pm)
2. Trip duration by user type
3. Most popular trips (based on start station and end station)
4. Rider performance by gender and age, based on average trip distance (station to station) and median speed (distance traveled / trip duration)
5. What is the busiest bike in the CDMX in February 2020? How many times was it used? How many minutes was it in use?
6. Find the mean and standard deviation of trip time

In [71]:
def get_rides_rdd(sc, input_file):
    """Load ``input_file`` and return an RDD of parsed ride tuples.

    Args:
        sc: Spark context used to read the text file.
        input_file: Path of the CSV file to read.

    Returns:
        The RDD produced by applying ``parse_ride_records`` to every
        partition of the file, marked for caching.
    """
    print("Reading input file:", input_file)

    raw_lines = sc.textFile(input_file, use_unicode=True)
    # The parser receives the partition index so it can drop the header line.
    parsed_rides = raw_lines.mapPartitionsWithIndex(parse_ride_records)
    parsed_rides = parsed_rides.cache()

    partition_count = parsed_rides.getNumPartitions()
    print("Number of partitions: ", partition_count)
    return parsed_rides

def get_top_stations(num, rides_rdd, max_duration_seconds=60 * 60 * 2):
    """Return the ``num`` start stations with the most rides.

    Args:
        num: How many stations to return.
        rides_rdd: RDD of ride tuples laid out as yielded by
            ``parse_ride_records``: index 3 is the start station, indexes 5
            and 6 are the start and end datetimes.
        max_duration_seconds: Rides lasting longer than this many seconds
            are ignored. Defaults to two hours, the cutoff that used to be
            hard-coded.

    Returns:
        A list of ``(ride_count, station)`` tuples, largest first. Ties on
        the count are ordered by the station value, also descending, because
        the tuples are compared as a whole.
    """
    def within_limit(ride):
        return (ride[6] - ride[5]).total_seconds() <= max_duration_seconds

    # Parenthesised chain instead of backslash continuations: the previous
    # version ended in a dangling "\" that only parsed because the following
    # line happened to be blank.
    return (
        rides_rdd
        .filter(within_limit)
        .map(lambda ride: (ride[3], 1))
        .reduceByKey(lambda left, right: left + right)
        # Put the count first so top() ranks by it.
        .map(lambda pair: (pair[1], pair[0]))
        .top(num)
    )

In [72]:
# Time the whole pipeline: load and parse the rides, then rank start stations.
start = time.time()

rides_rdd = get_rides_rdd(sc=sc, input_file=input_file)

top_stations = get_top_stations(num=5, rides_rdd=rides_rdd)
print(top_stations)


# Elapsed wall-clock seconds since `start`.
print(time.time() - start)
print("Done")

Reading input file: ./data/2020-02.csv
Number of partitions:  2
Saving...
[(6298, '1'), (6201, '27'), (5262, '271'), (4825, '64'), (4621, '41')]
Done
