In [1]:
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
gc.enable()
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import re

from useful_functions import *

warnings.filterwarnings('ignore')

# Group data into stations and days

In [2]:
# set global variable 
YEAR = 2013

In [3]:
%%time
# Load data
data = pd.read_feather(f'data/Divvy_data_{YEAR}.feather')

CPU times: user 246 ms, sys: 330 ms, total: 576 ms
Wall time: 1.19 s


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 759788 entries, 0 to 759787
Data columns (total 15 columns):
trip_id              759788 non-null int64
bikeid               759788 non-null int64
tripduration         759788 non-null int64
from_station_id      759788 non-null int64
from_station_name    759788 non-null object
to_station_id        759788 non-null int64
to_station_name      759788 non-null object
usertype             759788 non-null object
gender               403046 non-null object
birthday             402909 non-null float64
year                 759788 non-null int64
day                  759788 non-null int64
month                759788 non-null int64
hour                 759788 non-null int64
Age                  0 non-null float64
dtypes: float64(2), int64(9), object(4)
memory usage: 87.0+ MB


In [5]:
%%time
# For inward/outward data: Group data --> flatten data --> rename columns

# Inward trips collection
data_in = data[['from_station_id', 'month', 'day', 'trip_id']].groupby(['from_station_id', 'month', 'day']).count().reset_index()
data_in.rename(columns={'trip_id': 'total_in', 'from_station_id': 'station_id'}, inplace=True)

# Outward trips collection
data_out = data[['to_station_id', 'month', 'day', 'trip_id']].groupby(['to_station_id', 'month', 'day']).count().reset_index()
data_out.rename(columns={'trip_id': 'total_out', 'to_station_id': 'station_id'}, inplace=True)

# Merge data together
data_station = data_in.merge(data_out, on=['station_id', 'month', 'day'], how='outer').fillna(0)

# Add year
data_station['year'] = int(YEAR)

# clean up
del data_in
del data_out
gc.collect()

CPU times: user 183 ms, sys: 67.2 ms, total: 251 ms
Wall time: 256 ms


In [6]:
data_station.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15403 entries, 0 to 15402
Data columns (total 6 columns):
station_id    15403 non-null int64
month         15403 non-null int64
day           15403 non-null int64
total_in      15403 non-null float64
total_out     15403 non-null float64
year          15403 non-null int64
dtypes: float64(2), int64(4)
memory usage: 842.4 KB


# Combine with station data 
## Load station data 

In [7]:
%%time
# load data from feather
df_station_only = pd.read_feather('data/final_station_data.feather')

CPU times: user 4.45 ms, sys: 2.43 ms, total: 6.88 ms
Wall time: 8.22 ms


In [8]:
df_station_only.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 12 columns):
index            618 non-null int64
id               618 non-null int64
lon_ave          618 non-null float64
lat_ave          618 non-null float64
dp_max           618 non-null float64
dp_min           618 non-null float64
online_month     618 non-null float64
online_day       618 non-null float64
online_year      618 non-null float64
city_Chicago     618 non-null uint8
city_Evanston    618 non-null uint8
city_Oak_Park    618 non-null uint8
dtypes: float64(7), int64(2), uint8(3)
memory usage: 45.3 KB


## Zero-padding for days without any trip record

In [9]:
# Zero padding 
import itertools

record_lst = []
for m in range(1, 13):
    for s, d in itertools.product(data_station.station_id.unique(), range(1, days_in_month(YEAR, m)+1)):
        record_lst.append((s, m, d))

zero_pad = pd.DataFrame(record_lst, columns=['station_id', 'month', 'day'])

In [10]:
zero_pad = zero_pad.merge(df_station_only, left_on='station_id', right_on='id', how='left').drop('id', axis=1)

In [11]:
zero_pad.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109500 entries, 0 to 109499
Data columns (total 14 columns):
station_id       109500 non-null int64
month            109500 non-null int64
day              109500 non-null int64
index            109500 non-null int64
lon_ave          109500 non-null float64
lat_ave          109500 non-null float64
dp_max           109500 non-null float64
dp_min           109500 non-null float64
online_month     109500 non-null float64
online_day       109500 non-null float64
online_year      109500 non-null float64
city_Chicago     109500 non-null uint8
city_Evanston    109500 non-null uint8
city_Oak_Park    109500 non-null uint8
dtypes: float64(7), int64(4), uint8(3)
memory usage: 10.3 MB


## Merge zero-padding data into station-trips dataset

In [12]:
# Join zero_pad info into data_station
data_station = data_station.merge(zero_pad, on=['station_id', 'month', 'day'], how='right')

## Fill nan data after adding zero-padding

In [13]:
# Fill year
data_station['year'].fillna(YEAR, inplace=True)

# Fill inward and outward number of trips
data_station['total_in'].fillna(0, inplace=True)
data_station['total_out'].fillna(0, inplace=True)

## Add days_online feature and drop rows with negative days online

In [14]:
data_station.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109500 entries, 0 to 109499
Data columns (total 17 columns):
station_id       109500 non-null int64
month            109500 non-null int64
day              109500 non-null int64
total_in         109500 non-null float64
total_out        109500 non-null float64
year             109500 non-null float64
index            109500 non-null int64
lon_ave          109500 non-null float64
lat_ave          109500 non-null float64
dp_max           109500 non-null float64
dp_min           109500 non-null float64
online_month     109500 non-null float64
online_day       109500 non-null float64
online_year      109500 non-null float64
city_Chicago     109500 non-null uint8
city_Evanston    109500 non-null uint8
city_Oak_Park    109500 non-null uint8
dtypes: float64(10), int64(4), uint8(3)
memory usage: 12.8 MB


In [15]:
# Clean up data without station information
data_station = data_station.dropna()

# Drop useless column
data_station = data_station.drop('index', axis=1)

In [16]:
%%time
from datetime import date

def get_days(row):
    d0 = date(int(row.year), int(row.month), int(row.day))
    d1 = date(int(row.online_year), int(row.online_month), int(row.online_day))
    return (d0-d1).days

data_station['days_online'] = data_station.apply(lambda x: get_days(x), axis=1)

CPU times: user 8.93 s, sys: 109 ms, total: 9.04 s
Wall time: 9.4 s


In [17]:
## Drop rows with negative days_online
data_station = data_station[data_station.days_online >= 0].copy()

In [18]:
## Remove columns won't be needed
data_station.drop(['online_day', 'online_month', 'online_year'], axis=1, inplace=True)

## Add day of week

In [19]:
%%time
## Add day_of_week for each datetime
import calendar

def _get_dayofweek(row):
    return calendar.weekday(int(row.year), int(row.month), int(row.day))

data_station['dayofweek'] = data_station.apply(lambda x: _get_dayofweek(x), axis=1)

CPU times: user 1.76 s, sys: 16.7 ms, total: 1.77 s
Wall time: 1.86 s


## Add weather data

In [20]:
%%time
## Load weather data
df_weather = pd.read_csv(f'weather_data/weather_{YEAR}_chicago.csv')

CPU times: user 10 ms, sys: 3.91 ms, total: 13.9 ms
Wall time: 15 ms


In [21]:
data_station = data_station.merge(df_weather, on=['month', 'day', 'year'], how='left')

In [22]:
data_station.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44036 entries, 0 to 44035
Data columns (total 57 columns):
station_id                     44036 non-null int64
month                          44036 non-null int64
day                            44036 non-null int64
total_in                       44036 non-null float64
total_out                      44036 non-null float64
year                           44036 non-null float64
lon_ave                        44036 non-null float64
lat_ave                        44036 non-null float64
dp_max                         44036 non-null float64
dp_min                         44036 non-null float64
city_Chicago                   44036 non-null uint8
city_Evanston                  44036 non-null uint8
city_Oak_Park                  44036 non-null uint8
days_online                    44036 non-null int64
dayofweek                      44036 non-null int64
apparentTemperatureHigh        44036 non-null float64
apparentTemperatureHighTime    44036 non-nu

In [23]:
%%time
## Save data
data_station.to_feather(f'data/Final_Divvy_data_{YEAR}.feather')
print(f'Data saved to feather file!')

Data saved to feather file!
CPU times: user 22.6 ms, sys: 11.2 ms, total: 33.9 ms
Wall time: 22.9 ms


## To-dos
- [X] Add number of days being on
- [ ] Add weather data
- [X] Add station data 
    - number of docks
    - lat
    - long
- [ ] **Add network adj**
- [X] Add day of week
