<a href="https://colab.research.google.com/github/onmax/bike-forecasting/blob/main/src/preprocessing/dataset-creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Getting the dataset
!git clone "https://github.com/onmax/bike-forecasting"
%cd bike-forecasting/src/preprocessing

Cloning into 'bike-forecasting'...
remote: Enumerating objects: 45, done.[K
remote: Total 45 (delta 0), reused 0 (delta 0), pack-reused 45[K
Unpacking objects: 100% (45/45), done.
Checking out files: 100% (35/35), done.
/content/bike-forecasting


In [1]:
import pandas as pd
import os
import pickle


# files got from https://divvy-tripdata.s3.amazonaws.com/index.html. Data folder should look like this
'''
├── data
│   ├── files
│   │   ├── 2014
│   │   │   └── ...
│   │   ├── 2015
│   │   │   └── ...
│   │   ├── 2016
│   │   │   └── ...
│   │   ├── 2017
│   │   │   └── ...
│   │   ├── 2018
│   │   │   └── ...
│   │   └── 2019
│   │       └── ...
│   └── raw
│       ├── Divvy_Stations_Trips_2014_Q1Q2.zip
│       ├── Divvy_Stations_Trips_2014_Q3Q4.zip
│       ├── Divvy_Trips_2015-Q1Q2.zip
│       ├── Divvy_Trips_2015_Q3Q4.zip
│       ├── Divvy_Trips_2016_Q1Q2.zip
│       ├── Divvy_Trips_2016_Q3Q4.zip
│       ├── Divvy_Trips_2017_Q1Q2.zip
│       ├── Divvy_Trips_2017_Q3Q4.zip
│       ├── Divvy_Trips_2018_Q1.zip
│       ├── Divvy_Trips_2018_Q2.zip
│       ├── Divvy_Trips_2018_Q3.zip
│       ├── Divvy_Trips_2018_Q4.zip
│       ├── Divvy_Trips_2019_Q1.zip
│       ├── Divvy_Trips_2019_Q2.zip
│       ├── Divvy_Trips_2019_Q3.zip
│       └── Divvy_Trips_2019_Q4.zip
└── src 
    └── ...
'''

'\n├── data\n│\xa0\xa0 ├── files\n│\xa0\xa0 │\xa0\xa0 ├── 2014\n│\xa0\xa0 │\xa0\xa0 │   └── ...\n│\xa0\xa0 │\xa0\xa0 ├── 2015\n│\xa0\xa0 │\xa0\xa0 │   └── ...\n│\xa0\xa0 │\xa0\xa0 ├── 2016\n│\xa0\xa0 │\xa0\xa0 │   └── ...\n│\xa0\xa0 │\xa0\xa0 ├── 2017\n│\xa0\xa0 │\xa0\xa0 │   └── ...\n│\xa0\xa0 │\xa0\xa0 ├── 2018\n│\xa0\xa0 │\xa0\xa0 │   └── ...\n│\xa0\xa0 │\xa0\xa0 └── 2019\n│\xa0\xa0 │\xa0\xa0     └── ...\n│\xa0\xa0 └── raw\n│\xa0\xa0     ├── Divvy_Stations_Trips_2014_Q1Q2.zip\n│\xa0\xa0     ├── Divvy_Stations_Trips_2014_Q3Q4.zip\n│\xa0\xa0     ├── Divvy_Trips_2015-Q1Q2.zip\n│\xa0\xa0     ├── Divvy_Trips_2015_Q3Q4.zip\n│\xa0\xa0     ├── Divvy_Trips_2016_Q1Q2.zip\n│\xa0\xa0     ├── Divvy_Trips_2016_Q3Q4.zip\n│\xa0\xa0     ├── Divvy_Trips_2017_Q1Q2.zip\n│\xa0\xa0     ├── Divvy_Trips_2017_Q3Q4.zip\n│\xa0\xa0     ├── Divvy_Trips_2018_Q1.zip\n│\xa0\xa0     ├── Divvy_Trips_2018_Q2.zip\n│\xa0\xa0     ├── Divvy_Trips_2018_Q3.zip\n│\xa0\xa0     ├── Divvy_Trips_2018_Q4.zip\n│\xa0\xa0     ├── D

In [9]:
def path():
    current_path = os.getcwd()
    return f"{current_path}/../../data/files"

def data_path(year):
    return f"{path()}/{year}"

def get_pickles(years):
    current_path = path()
    paths = []
    for year in years:
        pickle_path = f"{current_path}/trips-{year}.pickle"
        paths.append(pickle_path)
    return paths

def merge_csv(inputs, output, year, usecols=["starttime", "from_station_id", "from_station_name"]):
    df = pd.DataFrame()
    for input in inputs:
        print(f"Reading {input}")
        df_temp = pd.read_csv(input, usecols=usecols)
        df = pd.concat([df, df_temp], join='outer')
    output_file = f"{output}/trips-{year}.pickle"
    print(f"Writing {output_file}")
    df.to_pickle(output_file)

def merge_years(inputs, output):
    df = pd.DataFrame()
    for input in inputs:
        print(f"Reading {input}")
        df_temp = pd.read_pickle(input)
        df = pd.concat([df, df_temp], join='outer')
    output_file = f"{output}/trips.pickle"
    print(f"Writing {output_file}")
    df.to_pickle(output_file)

def split(source, dest_folder, write_size):
    # Make a destination folder if it doesn't exist yet
    if not os.path.exists(dest_folder):
        os.mkdir(dest_folder)
    else:
        # Otherwise clean out all files in the destination folder
        for file in os.listdir(dest_folder):
            os.remove(os.path.join(dest_folder, file))
    partnum = 0
    
    with open(source, 'rb') as input:
        while True:
            chunk = input.read(write_size)
            if not chunk:
                # End the loop if we have hit EOF
                break
            partnum += 1
            # Create a new file name
            with open(f"{dest_folder}/trips-part-{partnum}", 'wb') as fd:
                fd.write(chunk)

def join(output_file, parts):
  with open(output_file, 'wb') as output:
    for part in parts:
        with open(part, 'rb') as input_file:
          print(f"Putting together {part}")
          output.write(input_file.read())

In [10]:
year = "2014"
csv_folder = data_path(year)
csv_names = ["Divvy_Trips_2014_Q1Q2.csv", "Divvy_Trips_2014-Q3-07.csv", "Divvy_Trips_2014-Q3-0809.csv", "Divvy_Trips_2014-Q4.csv"]
csv_paths = [f"{csv_folder}/{f}" for f in csv_names]
merge_csv(csv_paths, csv_folder, year)

Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2014/Divvy_Trips_2014_Q1Q2.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2014/Divvy_Trips_2014-Q3-07.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2014/Divvy_Trips_2014-Q3-0809.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2014/Divvy_Trips_2014-Q4.csv
Writing /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2014/trips-2014.pickle


In [23]:
year = "2014"
split(source=f'{path()}/trips-{year}.pickle', write_size=49500000, dest_folder=f'{path()}/parts/{year}')

In [11]:
year = "2015"
csv_folder = data_path(year)
csv_names = ["Divvy_Trips_2015-Q1.csv", "Divvy_Trips_2015-Q2.csv", "Divvy_Trips_2015_07.csv", "Divvy_Trips_2015_08.csv", "Divvy_Trips_2015_09.csv", "Divvy_Trips_2015_Q4.csv"]
csv_paths = [f"{csv_folder}/{f}" for f in csv_names]
merge_csv(csv_paths, csv_folder, year)

Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2015/Divvy_Trips_2015-Q1.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2015/Divvy_Trips_2015-Q2.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2015/Divvy_Trips_2015_07.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2015/Divvy_Trips_2015_08.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2015/Divvy_Trips_2015_09.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2015/Divvy_Trips_2015_Q4.csv
Writing /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2015/trips-2015.pickle


In [25]:
year = "2015"
split(source=f'{path()}/trips-{year}.pickle', write_size=49500000, dest_folder=f'{path()}/parts/{year}')

In [12]:
year = "2016"
csv_folder = data_path(year)
csv_names = ["Divvy_Trips_2016_Q1.csv", "Divvy_Trips_2016_04.csv", "Divvy_Trips_2016_05.csv", "Divvy_Trips_2016_06.csv","Divvy_Trips_2016_Q3.csv", "Divvy_Trips_2016_Q4.csv"]
csv_paths = [f"{csv_folder}/{f}" for f in csv_names]
merge_csv(csv_paths, csv_folder, year)

Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2016/Divvy_Trips_2016_Q1.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2016/Divvy_Trips_2016_04.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2016/Divvy_Trips_2016_05.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2016/Divvy_Trips_2016_06.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2016/Divvy_Trips_2016_Q3.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2016/Divvy_Trips_2016_Q4.csv
Writing /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2016/trips-2016.pickle


In [27]:
year = "2016"
split(source=f'{path()}/trips-{year}.pickle', write_size=49500000, dest_folder=f'{path()}/parts/{year}')

In [17]:
year = "2017"
csv_folder = data_path(year)
csv_names = ["Divvy_Trips_2017_Q1.csv", "Divvy_Trips_2017_Q2.csv", "Divvy_Trips_2017_Q3.csv", "Divvy_Trips_2017_Q4.csv"]
csv_paths = [f"{csv_folder}/{f}" for f in csv_names]
merge_csv(csv_paths, csv_folder, year, usecols=["start_time", "from_station_id", "from_station_name"])

Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2017/Divvy_Trips_2017_Q1.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2017/Divvy_Trips_2017_Q2.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2017/Divvy_Trips_2017_Q3.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2017/Divvy_Trips_2017_Q4.csv
Writing /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2017/trips-2017.pickle


In [28]:
year = "2017"
split(source=f'{path()}/trips-{year}.pickle', write_size=49500000, dest_folder=f'{path()}/parts/{year}')

In [19]:
year = "2018"
csv_folder = data_path(year)
csv_names = ["Divvy_Trips_2018_Q1.csv", "Divvy_Trips_2018_Q2.csv", "Divvy_Trips_2018_Q3.csv", "Divvy_Trips_2018_Q4.csv"]
csv_paths = [f"{csv_folder}/{f}" for f in csv_names]
merge_csv(csv_paths, csv_folder, year, usecols=["start_time", "from_station_id", "from_station_name"])

Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2018/Divvy_Trips_2018_Q1.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2018/Divvy_Trips_2018_Q2.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2018/Divvy_Trips_2018_Q3.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2018/Divvy_Trips_2018_Q4.csv
Writing /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2018/trips-2018.pickle


In [29]:
year = "2018"
split(source=f'{path()}/trips-{year}.pickle', write_size=49500000, dest_folder=f'{path()}/parts/{year}')

In [21]:
year = "2019"
csv_folder = data_path(year)
csv_names = ["Divvy_Trips_2019_Q1.csv", "Divvy_Trips_2019_Q2.csv", "Divvy_Trips_2019_Q3.csv", "Divvy_Trips_2019_Q4.csv"]
csv_paths = [f"{csv_folder}/{f}" for f in csv_names]
merge_csv(csv_paths, csv_folder, year, usecols=["start_time", "from_station_id", "from_station_name"])

Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2019/Divvy_Trips_2019_Q1.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2019/Divvy_Trips_2019_Q2.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2019/Divvy_Trips_2019_Q3.csv
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2019/Divvy_Trips_2019_Q4.csv
Writing /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/2019/trips-2019.pickle


In [30]:
year = "2019"
split(source=f'{path()}/trips-{year}.pickle', write_size=49500000, dest_folder=f'{path()}/parts/{year}')

In [22]:
# Merge all pickles
years = list(range(2014, 2020))
pickles = get_pickles(years)
merge_years(pickles, path())

Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/trips-2014.pickle
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/trips-2015.pickle
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/trips-2016.pickle
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/trips-2017.pickle
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/trips-2018.pickle
Reading /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/trips-2019.pickle
Writing /mnt/f/proyectos/bikes/src/preprocessing/../../data/files/trips.pickle


MemoryError: 

In [3]:
split(source=f'{path()}/trips.pickle', write_size=49500000, dest_folder=f'{path()}/parts')

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/f/proyectos/bikes/src/preprocessing/../../data/files/trips.pickle'

In [39]:
parts = [f"{path()}/parts/trips-part-{i}" for i in list(range(2,34))]
join(f'{path()}/trips.pickle', parts)

Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-2
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-3
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-4
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-5
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-6
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-7
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-8
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-9
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-10
Putting together /content/bike-forecasting/src/preprocessing/../../data/files/parts/trips-part-11
Putting together /content/bi

In [5]:
# Group trips by hour
df = pd.read_pickle(f"{path()}/trips.pickle")
df["starttime"] = pd.to_datetime(df["starttime"], format='%Y-%m-%d %H:%M:%S')

INTERVAL = "1H" # It could be also 15Min
df = df.groupby('from_station_id').resample(INTERVAL, on='start_time') \
        .size() \
        .to_frame() \
        .rename(columns={0: "quantity", "starttime": "start_time"}) \
        .reset_index() \
        .set_index("start_time")
df

ValueError: time data usertype doesn't match format specified

In [6]:
df

Unnamed: 0,starttime,from_station_id,from_station_name
"(trip_id, starttime, stoptime, bikeid, tripduration, from_station_id, from_station_name, to_station_id, to_station_name)",usertype,gender,birthyear
"(2355134, 6/30/2014 23:57, 7/1/2014 0:07, 2006, 604, 131, Lincoln Ave & Belmont Ave, 303, Broadway & Cornelia Ave)",Subscriber,Male,1988
"(2355133, 6/30/2014 23:56, 7/1/2014 0:00, 2217, 263, 282, Halsted St & Maxwell St, 22, May St & Taylor St)",Subscriber,Male,1992
"(2355130, 6/30/2014 23:33, 6/30/2014 23:35, 2798, 126, 327, Sheffield Ave & Webster Ave, 225, Halsted St & Dickens Ave)",Subscriber,Male,1993
"(2355129, 6/30/2014 23:26, 7/1/2014 0:24, 173, 3481, 134, Peoria St & Jackson Blvd, 194, State St & Wacker Dr)",Subscriber,Female,1988
...,...,...,...
"(25962900, 2019-12-31 23:56:13, 2020-01-01 00:15:45, 2196, 1,172.0, 112, Green St & Randolph St, 225, Halsted St & Dickens Ave)",Subscriber,Male,1981
"(25962901, 2019-12-31 23:56:34, 2020-01-01 00:22:08, 4877, 1,533.0, 90, Millennium Park, 90, Millennium Park)",Subscriber,Male,1992
"(25962902, 2019-12-31 23:57:05, 2020-01-01 00:05:46, 863, 520.0, 623, Michigan Ave & 8th St, 52, Michigan Ave & Lake St)",Subscriber,Male,1967
"(25962903, 2019-12-31 23:57:11, 2020-01-01 00:05:45, 2637, 514.0, 623, Michigan Ave & 8th St, 52, Michigan Ave & Lake St)",Subscriber,Female,1970


In [33]:
f"{path()}/trips.pickle"

'/content/bike-forecasting/src/preprocessing/../../data/files/trips.pickle'

In [None]:
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,starttime,from_station_id,from_station_name
trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
2355134,6/30/2014 23:57,7/1/2014 0:07,2006,604,131,Lincoln Ave & Belmont Ave,303,Broadway & Cornelia Ave,Subscriber,Male,1988
2355133,6/30/2014 23:56,7/1/2014 0:00,2217,263,282,Halsted St & Maxwell St,22,May St & Taylor St,Subscriber,Male,1992
2355130,6/30/2014 23:33,6/30/2014 23:35,2798,126,327,Sheffield Ave & Webster Ave,225,Halsted St & Dickens Ave,Subscriber,Male,1993
2355129,6/30/2014 23:26,7/1/2014 0:24,173,3481,134,Peoria St & Jackson Blvd,194,State St & Wacker Dr,Subscriber,Female,1988
...,...,...,...,...,...,...,...,...,...,...,...
25962900,2019-12-31 23:56:13,2020-01-01 00:15:45,2196,1172.0,112,Green St & Randolph St,225,Halsted St & Dickens Ave,Subscriber,Male,1981
25962901,2019-12-31 23:56:34,2020-01-01 00:22:08,4877,1533.0,90,Millennium Park,90,Millennium Park,Subscriber,Male,1992
25962902,2019-12-31 23:57:05,2020-01-01 00:05:46,863,520.0,623,Michigan Ave & 8th St,52,Michigan Ave & Lake St,Subscriber,Male,1967
25962903,2019-12-31 23:57:11,2020-01-01 00:05:45,2637,514.0,623,Michigan Ave & 8th St,52,Michigan Ave & Lake St,Subscriber,Female,1970
