In [44]:
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
gc.enable()
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import re

from useful_functions import *

warnings.filterwarnings('ignore')

# Group data into stations and days

In [45]:
# set global variable 
YEAR = 2016

In [46]:
%%time
# Load data
data = pd.read_feather(f'data/Divvy_data_{YEAR}.feather')

CPU times: user 1.21 s, sys: 1.42 s, total: 2.63 s
Wall time: 4.87 s


In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3595483 entries, 0 to 3595482
Data columns (total 14 columns):
trip_id              int64
bikeid               int64
tripduration         int64
from_station_id      int64
from_station_name    object
to_station_id        int64
to_station_name      object
usertype             object
gender               object
year                 int64
day                  int64
month                int64
hour                 int64
Age                  float64
dtypes: float64(1), int64(9), object(4)
memory usage: 384.0+ MB


In [48]:
%%time
# For inward/outward data: Group data --> flatten data --> rename columns

# Inward trips collection
data_in = data[['from_station_id', 'month', 'day', 'trip_id']].groupby(['from_station_id', 'month', 'day']).count().reset_index()
data_in.rename(columns={'trip_id': 'total_in', 'from_station_id': 'station_id'}, inplace=True)

# Outward trips collection
data_out = data[['to_station_id', 'month', 'day', 'trip_id']].groupby(['to_station_id', 'month', 'day']).count().reset_index()
data_out.rename(columns={'trip_id': 'total_out', 'to_station_id': 'station_id'}, inplace=True)

# Merge data together
data_station = data_in.merge(data_out, on=['station_id', 'month', 'day'], how='outer').fillna(0)

# Add year
data_station['year'] = int(YEAR)

# clean up
del data_in
del data_out
gc.collect()

CPU times: user 1.2 s, sys: 519 ms, total: 1.72 s
Wall time: 1.51 s


In [49]:
data_station.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167337 entries, 0 to 167336
Data columns (total 6 columns):
station_id    167337 non-null int64
month         167337 non-null int64
day           167337 non-null int64
total_in      167337 non-null float64
total_out     167337 non-null float64
year          167337 non-null int64
dtypes: float64(2), int64(4)
memory usage: 8.9 MB


# Combine with station data 
## Load station data and parse online date

In [50]:
def _get_online_time(string, tp):
    index_dict = {
        'month': 0,
        'day': 1,
        'year': 2
    }

    return int(re.match(r'([0-9]+)/([0-9]+)/([0-9]+) [0-9]+:', string).groups()[index_dict[tp]])

# load data from csv
df_station_only = pd.read_csv('data/Divvy_Stations_2017_Q3Q4.csv', usecols=['id', 'city', 'latitude', 'longitude', 'dpcapacity', 'online_date'])

# parse online_month/day/year
df_station_only['online_month'] = df_station_only.apply(lambda x: _get_online_time(x.online_date, 'month'), axis=1)
df_station_only['online_day'] = df_station_only.apply(lambda x: _get_online_time(x.online_date, 'day'), axis=1)
df_station_only['online_year'] = df_station_only.apply(lambda x: _get_online_time(x.online_date, 'year'), axis=1)

df_station_only

df_station_only.drop('online_date', inplace=True, axis=1)

In [51]:
df_station_only['city'] = df_station_only.apply(lambda x: '_'.join(x.city.strip().split()), axis=1)

## One-hot encoding the city for each station

In [52]:
# Apply OHE using pd.get_dummies()
df_station_only = pd.get_dummies(df_station_only, columns=['city'])

In [53]:
df_station_only.online_year.value_counts()

2013    332
2015    140
2016    105
2017      5
2014      3
Name: online_year, dtype: int64

## Zero-padding for days without any trip record

In [54]:
# Zero padding 
import itertools

record_lst = []
for m in range(1, 13):
    for s, d in itertools.product(data_station.station_id.unique(), range(1, days_in_month(YEAR, m)+1)):
        record_lst.append((s, m, d))

zero_pad = pd.DataFrame(record_lst, columns=['station_id', 'month', 'day'])

In [12]:
zero_pad = zero_pad.merge(df_station_only, left_on='station_id', right_on='id', how='left').drop('id', axis=1)

In [13]:
zero_pad.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212646 entries, 0 to 212645
Data columns (total 12 columns):
station_id       212646 non-null int64
month            212646 non-null int64
day              212646 non-null int64
latitude         212280 non-null float64
longitude        212280 non-null float64
dpcapacity       212280 non-null float64
online_month     212280 non-null float64
online_day       212280 non-null float64
online_year      212280 non-null float64
city_Chicago     212280 non-null float64
city_Evanston    212280 non-null float64
city_Oak_Park    212280 non-null float64
dtypes: float64(9), int64(3)
memory usage: 21.1 MB


In [14]:
data_station.nunique()

station_id    581
month          12
day            31
total_in      509
total_out     525
year            1
dtype: int64

## Merge zero-padding data into station-trips dataset

In [15]:
# Join zero_pad info into data_station
data_station = data_station.merge(zero_pad, on=['station_id', 'month', 'day'], how='right')

## Fill nan data after adding zero-padding

In [16]:
# Fill year
data_station['year'].fillna(YEAR, inplace=True)

# Fill inward and outward number of trips
data_station['total_in'].fillna(0, inplace=True)
data_station['total_out'].fillna(0, inplace=True)

In [43]:
# Number of entries without station information
diff = data_station.station_id.count() - data_station.dpcapacity.count()
print(f'There are {diff} stations without station information.')

There are 366 stations without station information.


## Add days_online feature and drop rows with negative days online

In [42]:
data_station.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212646 entries, 0 to 212645
Data columns (total 15 columns):
station_id       212646 non-null int64
month            212646 non-null int64
day              212646 non-null int64
total_in         212646 non-null float64
total_out        212646 non-null float64
year             212646 non-null float64
latitude         212280 non-null float64
longitude        212280 non-null float64
dpcapacity       212280 non-null float64
online_month     212280 non-null float64
online_day       212280 non-null float64
online_year      212280 non-null float64
city_Chicago     212280 non-null float64
city_Evanston    212280 non-null float64
city_Oak_Park    212280 non-null float64
dtypes: float64(12), int64(3)
memory usage: 26.0 MB


In [18]:
%%time
from datetime import date

def get_days(row):
    d0 = date(int(row.year), int(row.month), int(row.day))
    d1 = date(int(row.online_year), int(row.online_month), int(row.online_day))
    return (d0-d1).days

data_station['days_online'] = data_station.apply(lambda x: get_days(x), axis=1)

ValueError: ('cannot convert float NaN to integer', 'occurred at index 118556')

In [19]:
## Drop rows with negative days_online
data_station = data_station[data_station.days_online >= 0].copy()

AttributeError: 'DataFrame' object has no attribute 'days_online'

In [None]:
## Remove columns won't be needed
data_station.drop(['online_day', 'online_month', 'online_year'], axis=1, inplace=True)

## Add day of week

In [None]:
%%time
## Add day_of_week for each datetime
import calendar

def _get_dayofweek(row):
    return calendar.weekday(int(row.year), int(row.month), int(row.day))

data_station['dayofweek'] = data_station.apply(lambda x: _get_dayofweek(x), axis=1)

## Add weather data

In [None]:
%%time
## Load weather data
df_weather = pd.read_csv(f'weather_data/weather_{YEAR}_chicago.csv')

In [None]:
data_station = data_station.merge(df_weather, on=['month', 'day', 'year'], how='left')

In [None]:
%%time
## Save data
data_station.to_feather(f'data/Final_Divvy_data_{YEAR}.feather')
print(f'Data saved to feather file!')

## To-dos
- [X] Add number of days being on
- [ ] Add weather data
- [X] Add station data 
    - number of docks
    - lat
    - long
- [ ] **Add network adj**
- [X] Add day of week
