In [1]:
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
gc.enable()
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import re

from useful_functions import *

warnings.filterwarnings('ignore')

# Group data into stations and days

In [46]:
# set global variable 
YEAR = 2014

In [47]:
%%time
# Load data
data = pd.read_feather(f'data/Divvy_data_{YEAR}.feather')

CPU times: user 720 ms, sys: 758 ms, total: 1.48 s
Wall time: 2.83 s


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2454634 entries, 0 to 2454633
Data columns (total 14 columns):
trip_id              int64
bikeid               int64
tripduration         int64
from_station_id      int64
from_station_name    object
to_station_id        int64
to_station_name      object
usertype             object
gender               object
year                 int64
day                  int64
month                int64
hour                 int64
Age                  float64
dtypes: float64(1), int64(9), object(4)
memory usage: 262.2+ MB


In [49]:
%%time
# For inward/outward data: Group data --> flatten data --> rename columns

# Inward trips collection
data_in = data[['from_station_id', 'month', 'day', 'trip_id']].groupby(['from_station_id', 'month', 'day']).count().reset_index()
data_in.rename(columns={'trip_id': 'total_in', 'from_station_id': 'station_id'}, inplace=True)

# Outward trips collection
data_out = data[['to_station_id', 'month', 'day', 'trip_id']].groupby(['to_station_id', 'month', 'day']).count().reset_index()
data_out.rename(columns={'trip_id': 'total_out', 'to_station_id': 'station_id'}, inplace=True)

# Merge data together
data_station = data_in.merge(data_out, on=['station_id', 'month', 'day'], how='outer').fillna(0)

# Add year
data_station['year'] = int(YEAR)

# clean up
del data_in
del data_out
gc.collect()

CPU times: user 739 ms, sys: 247 ms, total: 986 ms
Wall time: 685 ms


In [50]:
data_station.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102216 entries, 0 to 102215
Data columns (total 6 columns):
station_id    102216 non-null int64
month         102216 non-null int64
day           102216 non-null int64
total_in      102216 non-null float64
total_out     102216 non-null float64
year          102216 non-null int64
dtypes: float64(2), int64(4)
memory usage: 5.5 MB


# Combine with station data 
## Load station data 

In [51]:
%%time
# load data from feather
df_station_only = pd.read_feather('data/final_station_data.feather')

CPU times: user 4.51 ms, sys: 987 µs, total: 5.5 ms
Wall time: 6.6 ms


In [52]:
df_station_only.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 12 columns):
index            618 non-null int64
id               618 non-null int64
lon_ave          618 non-null float64
lat_ave          618 non-null float64
dp_max           618 non-null float64
dp_min           618 non-null float64
online_month     618 non-null float64
online_day       618 non-null float64
online_year      618 non-null float64
city_Chicago     618 non-null uint8
city_Evanston    618 non-null uint8
city_Oak_Park    618 non-null uint8
dtypes: float64(7), int64(2), uint8(3)
memory usage: 45.3 KB


## Zero-padding for days without any trip record

In [53]:
# Zero padding 
import itertools

record_lst = []
for m in range(1, 13):
    for s, d in itertools.product(data_station.station_id.unique(), range(1, days_in_month(YEAR, m)+1)):
        record_lst.append((s, m, d))

zero_pad = pd.DataFrame(record_lst, columns=['station_id', 'month', 'day'])

In [54]:
zero_pad = zero_pad.merge(df_station_only, left_on='station_id', right_on='id', how='left').drop('id', axis=1)

In [55]:
zero_pad.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109500 entries, 0 to 109499
Data columns (total 14 columns):
station_id       109500 non-null int64
month            109500 non-null int64
day              109500 non-null int64
index            109500 non-null int64
lon_ave          109500 non-null float64
lat_ave          109500 non-null float64
dp_max           109500 non-null float64
dp_min           109500 non-null float64
online_month     109500 non-null float64
online_day       109500 non-null float64
online_year      109500 non-null float64
city_Chicago     109500 non-null uint8
city_Evanston    109500 non-null uint8
city_Oak_Park    109500 non-null uint8
dtypes: float64(7), int64(4), uint8(3)
memory usage: 10.3 MB


## Merge zero-padding data into station-trips dataset

In [56]:
# Join zero_pad info into data_station
data_station = data_station.merge(zero_pad, on=['station_id', 'month', 'day'], how='right')

## Fill nan data after adding zero-padding

In [57]:
# Fill year
data_station['year'].fillna(YEAR, inplace=True)

# Fill inward and outward number of trips
data_station['total_in'].fillna(0, inplace=True)
data_station['total_out'].fillna(0, inplace=True)

## Add days_online feature and drop rows with negative days online

In [58]:
data_station.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109500 entries, 0 to 109499
Data columns (total 17 columns):
station_id       109500 non-null int64
month            109500 non-null int64
day              109500 non-null int64
total_in         109500 non-null float64
total_out        109500 non-null float64
year             109500 non-null float64
index            109500 non-null int64
lon_ave          109500 non-null float64
lat_ave          109500 non-null float64
dp_max           109500 non-null float64
dp_min           109500 non-null float64
online_month     109500 non-null float64
online_day       109500 non-null float64
online_year      109500 non-null float64
city_Chicago     109500 non-null uint8
city_Evanston    109500 non-null uint8
city_Oak_Park    109500 non-null uint8
dtypes: float64(10), int64(4), uint8(3)
memory usage: 12.8 MB


In [59]:
# Clean up data without station information
data_station = data_station.dropna()

# Drop useless column
data_station = data_station.drop('index', axis=1)

In [60]:
%%time
from datetime import date

def get_days(row):
    d0 = date(int(row.year), int(row.month), int(row.day))
    d1 = date(int(row.online_year), int(row.online_month), int(row.online_day))
    return (d0-d1).days

data_station['days_online'] = data_station.apply(lambda x: get_days(x), axis=1)

CPU times: user 7.22 s, sys: 51.4 ms, total: 7.27 s
Wall time: 7.17 s


In [61]:
## Drop rows with negative days_online
data_station = data_station[data_station.days_online >= 0].copy()

In [62]:
## Remove columns won't be needed
data_station.drop(['online_day', 'online_month', 'online_year'], axis=1, inplace=True)

## Add day of week

In [63]:
%%time
## Add day_of_week for each datetime
import calendar

def _get_dayofweek(row):
    return calendar.weekday(int(row.year), int(row.month), int(row.day))

data_station['dayofweek'] = data_station.apply(lambda x: _get_dayofweek(x), axis=1)

CPU times: user 5.24 s, sys: 58.3 ms, total: 5.3 s
Wall time: 5.4 s


## Add weather data

In [64]:
%%time
## Load weather data
df_weather = pd.read_csv(f'weather_data/weather_{YEAR}_chicago.csv')

CPU times: user 11.2 ms, sys: 3.21 ms, total: 14.4 ms
Wall time: 14 ms


In [65]:
data_station = data_station.merge(df_weather, on=['month', 'day', 'year'], how='left')

In [66]:
data_station.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109500 entries, 0 to 109499
Data columns (total 57 columns):
station_id                     109500 non-null int64
month                          109500 non-null int64
day                            109500 non-null int64
total_in                       109500 non-null float64
total_out                      109500 non-null float64
year                           109500 non-null float64
lon_ave                        109500 non-null float64
lat_ave                        109500 non-null float64
dp_max                         109500 non-null float64
dp_min                         109500 non-null float64
city_Chicago                   109500 non-null uint8
city_Evanston                  109500 non-null uint8
city_Oak_Park                  109500 non-null uint8
days_online                    109500 non-null int64
dayofweek                      109500 non-null int64
apparentTemperatureHigh        109500 non-null float64
apparentTemperatureHighTi

In [67]:
%%time
## Save data
data_station.to_feather(f'data/Final_Divvy_data_{YEAR}.feather')
print(f'Data saved to feather file!')

Data saved to feather file!
CPU times: user 74.1 ms, sys: 26.4 ms, total: 101 ms
Wall time: 49 ms


## To-dos
- [X] Add number of days being on
- [ ] Add weather data
- [X] Add station data 
    - number of docks
    - lat
    - long
- [ ] **Add network adj**
- [X] Add day of week
