In [None]:
from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from ggplot import *

In [None]:
# formatting helpers
def fnone(x):
    """Return ``x`` as text in general ('g') float notation.

    ``x`` is first passed through ``float``, so numeric strings are accepted;
    trailing zeros are dropped (e.g. ``'3.0'`` -> ``'3'``).
    """
    return '{0:g}'.format(float(x))


def fshort(x):
    """Return ``x`` as text with thousands separators and two decimals.

    ``x`` must already be numeric; it is formatted as-is (e.g. ``1234.567``
    -> ``'1,234.57'``).
    """
    return '{:,.2f}'.format(x)

def convert_float(val):
    """Coerce ``val`` to a float, falling back to ``0.0`` when it cannot be parsed.

    Parameters
    ----------
    val : object
        Anything ``float()`` might accept (numbers, numeric strings).

    Returns
    -------
    float
        ``float(val)`` when the conversion succeeds, otherwise ``0.0``.

    Notes
    -----
    The fallback is a float (not the int ``0``) so every return value has the
    same type. ``TypeError`` is handled alongside ``ValueError`` so that
    non-string junk such as ``None`` takes the same fallback as an
    unparseable string instead of aborting a whole ``Series.map`` call.
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        return 0.0

# dictionary reference
# Calendar month number -> season label.
# NOTE(review): the grouping is Nov-Jan / Feb-Apr / May-Jul / Aug-Oct, i.e. one
# month earlier than the usual meteorological seasons -- confirm this is intended.
seasons_dict = {
    1: 'Winter',
    2: 'Spring', 3: 'Spring', 4: 'Spring',
    5: 'Summer', 6: 'Summer', 7: 'Summer',
    8: 'Autumn', 9: 'Autumn', 10: 'Autumn',
    11: 'Winter', 12: 'Winter',
}

# ZIP code -> city name.
city_dict = {
    94107: 'San Francisco',
    94063: 'Redwood City',
    94301: 'Palo Alto',
    94041: 'Mountain View',
    95113: 'San Jose',
}

# header reference
# Replacement column names, assigned positionally to each CSV after loading.
station_headers = [
    'station_id', 'station_name',
    'lat', 'long',
    'dock_count', 'city', 'install_date',
]

status_headers = [
    'station_id',
    'bikes_free', 'docks_free',
    'time',
]

trip_headers = [
    'trip_id', 'trip_time',
    'start_dt', 'start_station', 'start_id',
    'end_dt', 'end_station', 'end_id',
    'bike_num', 'user_type', 'ZIP',
]

weather_headers = [
    'date',
    'max_temp', 'mean_temp', 'min_temp',
    'max_dp', 'mean_dp', 'min_dp',
    'max_hum', 'mean_hum', 'min_hum',
    'max_sea', 'mean_sea', 'min_sea',
    'max_vis', 'mean_vis', 'min_vis',
    'max_wind', 'mean_wind', 'max_gust',
    'rain_inches', 'cloud_cover', 'events',
    'wind_dir_degrees', 'ZIP',
]

In [None]:
# STATION_DATA
# Load the three station snapshots; the last file is the current dataset.
raw_data_1, raw_data_2, raw_data_3 = [
    pd.read_csv(csv_name)
    for csv_name in ('201402_station_data.csv',
                     '201408_station_data.csv',
                     '201508_station_data.csv')
]

# Give every snapshot the shared column names (assigned positionally).
for snapshot in (raw_data_1, raw_data_2, raw_data_3):
    snapshot.columns = station_headers

# merged station datasets
# The two older snapshots are outer-joined first; their shared non-key columns
# get the '_201402' / '_201408' suffixes, so the current snapshot's columns
# keep their plain names in the final outer join.
older_snapshots = pd.merge(raw_data_1, raw_data_2,
                           on='station_id', how='outer',
                           suffixes=('_201402', '_201408'))
station_data = pd.merge(raw_data_3, older_snapshots,
                        on='station_id', how='outer')

# current station dataset, indexed by station_id
station_cur = raw_data_3.set_index('station_id')

In [None]:
# STATUS_DATA
# Load each status snapshot and assign the shared column names positionally.
status_frames = []
for csv_name in ('201402_status_data.csv',
                 '201408_status_data.csv',
                 '201508_status_data.csv'):
    frame = pd.read_csv(csv_name)
    frame.columns = status_headers
    status_frames.append(frame)

# Keep the per-file names bound; note this rebinds raw_data_1..3, which the
# station cell also uses.
raw_data_1, raw_data_2, raw_data_3 = status_frames

# Stack the snapshots row-wise; each frame keeps its own row labels.
status_data = pd.concat(status_frames)

In [None]:
# TRIP_DATA
# NOTE(review): parse_dates=True only asks pandas to try parsing the index, so
# start_dt / end_dt are presumably still plain strings after loading. Left
# unchanged because later cells may rely on the current dtype -- confirm
# whether parse_dates=[2, 5] (the two date columns by position) was intended.
raw_data_1a = pd.read_csv('201402_trip_data.csv', parse_dates=True)
raw_data_1a.columns = trip_headers
raw_data_2a = pd.read_csv('201408_trip_data.csv', parse_dates=True)
raw_data_2a.columns = trip_headers
raw_data_3a = pd.read_csv('201508_trip_data.csv', parse_dates=True)
raw_data_3a.columns = trip_headers

# Stack the three exports row-wise and discard the ZIP column.
trip_data = pd.concat([raw_data_1a, raw_data_2a, raw_data_3a]).drop('ZIP', axis=1)

# station_id -> city lookup taken from the current station table (station_cur
# is indexed by station_id). This replaces `lambda_city`, which is not defined
# in any visible cell and so raised NameError on a fresh kernel.
# NOTE(review): assumes lambda_city performed this same station -> city lookup.
# Station ids missing from station_cur map to NaN.
station_city_dict = station_cur['city'].to_dict()

trip_data['start_city'] = trip_data['start_id'].map(station_city_dict)
trip_data['end_city'] = trip_data['end_id'].map(station_city_dict)

In [None]:
# WEATHER_DATA
# Each export names its date column differently in the raw file, so the column
# to parse is listed next to the file name. Headers are replaced afterwards.
weather_frames = []
for csv_name, raw_date_col in (('201402_weather_data.csv', 'Date'),
                               ('201408_weather_data.csv', 'PDT'),
                               ('201508_weather_data.csv', 'PDT')):
    frame = pd.read_csv(csv_name, parse_dates=[raw_date_col])
    frame.columns = weather_headers
    weather_frames.append(frame)

raw_data_1b, raw_data_2b, raw_data_3b = weather_frames

weather_data = pd.concat(weather_frames)

# Derived columns: season from the calendar month, city from the ZIP code.
weather_data['season'] = weather_data['date'].dt.month.map(seasons_dict)
weather_data['city'] = weather_data['ZIP'].map(city_dict)

# fshort returns text, so the three *_sea columns hold formatted strings
# (two decimals, thousands separators) from here on.
for sea_col in ('max_sea', 'mean_sea', 'min_sea'):
    weather_data[sea_col] = weather_data[sea_col].map(fshort)

# Values float() cannot parse become 0 via convert_float.
weather_data['rain_inches'] = weather_data['rain_inches'].map(convert_float)

# Drop the unused 'events' column and move 'date' from a column to the index.
weather_data = weather_data.drop(['events'], axis=1).set_index('date')