In [101]:
from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from ggplot import *

In [102]:
# formatting lambdas & functions
fnone = lambda x : '{0:g}'.format(float(x))
fshort = lambda x : '{:,.2f}'.format(x)

def convert_float(val):
    try:
        return float(val)
    except ValueError:
        return 0

# dictionary reference
seasons_dict = {
    1: 'Winter',2: 'Spring',3: 'Spring',4: 'Spring',
    5: 'Summer',6: 'Summer',7: 'Summer',8: 'Autumn',
    9: 'Autumn',10: 'Autumn',11: 'Winter',12: 'Winter'}

city_dict = {
    94107: 'San Francisco',
    94063: 'Redwood City',
    94301: 'Palo Alto', 
    94041: 'Mountain View', 
    95113: 'San Jose'}

# header reference
station_headers = ['station_id','station_name','lat','long',
                   'dock_count','city','install_date']

status_headers = ['station_id','bikes_free','docks_free','time']

trip_headers = ['trip_id','trip_time',
                'start_dt','start_station','start_terminal',
                'end_dt','end_station','end_terminal',
                'bike_num','user_type','ZIP']

weather_headers = ['date',
                   'max_temp','mean_temp','min_temp',
                   'max_dp','mean_dp','min_dp',
                   'max_hum','mean_hum','min_hum',
                   'max_sea','mean_sea','min_sea',
                   'max_vis','mean_vis','min_vis',
                   'max_wind','mean_wind','max_gust',
                   'rain_inches','cloud_cover','events',
                   'wind_dir_degrees','ZIP']

In [103]:
# STATION_DATA
raw_data_1 = pd.read_csv('201402_station_data.csv')
raw_data_1.columns = station_headers
raw_data_2 = pd.read_csv('201408_station_data.csv')
raw_data_2.columns = station_headers
raw_data_3 = pd.read_csv('201508_station_data.csv') # current dataset
raw_data_3.columns = station_headers

# merged station datasets
station_data = raw_data_3.merge((raw_data_1.merge(raw_data_2, on='station_id', how='outer', suffixes=('_201402','_201408'))),
                                 on='station_id', how='outer')

# current station dataset
station_cur = raw_data_3.set_index('station_id',drop=True)

In [4]:
# STATUS_DATA
raw_data_1 = pd.read_csv('201402_status_data.csv')
raw_data_1.columns = status_headers
raw_data_2 = pd.read_csv('201408_status_data.csv')
raw_data_2.columns = status_headers
raw_data_3 = pd.read_csv('201508_status_data.csv')
raw_data_3.columns = status_headers

status_data = pd.concat([raw_data_1, raw_data_2, raw_data_3])

In [5]:
# TRIP_DATA
raw_data_1a = pd.read_csv('201402_trip_data.csv', parse_dates=True)
raw_data_1a.columns = trip_headers
raw_data_2a = pd.read_csv('201408_trip_data.csv', parse_dates=True)
raw_data_2a.columns = trip_headers
raw_data_3a = pd.read_csv('201508_trip_data.csv', parse_dates=True)
raw_data_3a.columns = trip_headers

trip_data = pd.concat([raw_data_1a, raw_data_2a, raw_data_3a])

trip_data.index = trip_data['trip_id'].astype(int)
trip_data.drop('trip_id', axis=1, inplace=True)

In [83]:
# WEATHER_DATA
raw_data_1b = pd.read_csv('201402_weather_data.csv', parse_dates=['Date'])
raw_data_1b.columns = weather_headers
raw_data_2b = pd.read_csv('201408_weather_data.csv', parse_dates=['PDT'])
raw_data_2b.columns = weather_headers
raw_data_3b = pd.read_csv('201508_weather_data.csv', parse_dates=['PDT'])
raw_data_3b.columns = weather_headers

weather_data = pd.concat([raw_data_1b, raw_data_2b, raw_data_3b])

weather_data['season'] = weather_data['date'].dt.month.map(seasons_dict)
weather_data['city'] = weather_data['ZIP'].map(city_dict)
weather_data['max_sea'] = weather_data['max_sea'].map(fshort)
weather_data['mean_sea'] = weather_data['mean_sea'].map(fshort)
weather_data['min_sea'] = weather_data['min_sea'].map(fshort)
weather_data['rain_inches'] = weather_data['rain_inches'].map(convert_float)

weather_data.drop(['events'], axis=1, inplace=True)

weather_data.index = weather_data['date']
weather_data.drop(['date'], axis=1, inplace=True)

In [94]:
rains = pd.DataFrame(weather_data.groupby(['city','season'])['rain_inches'].sum())
rains

Unnamed: 0_level_0,Unnamed: 1_level_0,rain_inches
city,season,Unnamed: 2_level_1
Mountain View,Autumn,2.76
Mountain View,Spring,8.18
Mountain View,Summer,0.76
Mountain View,Winter,10.44
Palo Alto,Autumn,0.51
Palo Alto,Spring,2.76
Palo Alto,Summer,0.0
Palo Alto,Winter,0.83
Redwood City,Autumn,0.49
Redwood City,Spring,0.75


In [8]:
station_cur.head()

Unnamed: 0_level_0,station_name,lat,long,dock_count,city,install_date
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


In [9]:
status_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71984434 entries, 0 to 36647621
Data columns (total 4 columns):
station_id    int64
bikes_free    int64
docks_free    int64
time          object
dtypes: int64(3), object(1)
memory usage: 2.7+ GB


In [96]:
# do not use
tda = trip_data.groupby(['user_type'])['ZIP'].value_counts()
tda.head(10)

user_type  ZIP  
Customer   nil      10682
           94107     2353
           94105     1344
           94103     1239
           94102     1205
           94109     1099
           94133     1039
           95112     1001
           94111      829
           1          780
Name: ZIP, dtype: int64

In [98]:
wt = weather_data.groupby(['season']).mean()
wt

Unnamed: 0_level_0,max_temp,mean_temp,min_temp,max_dp,mean_dp,min_dp,max_hum,mean_hum,min_hum,max_vis,mean_vis,min_vis,max_wind,mean_wind,max_gust,rain_inches,cloud_cover,wind_dir_degrees,ZIP
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Autumn,76.406852,66.87152,57.178801,57.658747,53.5054,49.209503,84.88121,66.732181,45.387689,10.575107,9.943133,8.784335,16.975375,6.126338,23.048343,0.007163,2.504283,287.077088,94325
Spring,67.959551,58.657303,49.101124,51.107987,46.339708,41.229471,86.267717,67.023622,45.525309,10.583991,9.900789,8.139797,16.969663,6.233708,23.53621,0.033461,3.083146,262.058427,94325
Summer,74.670665,65.950927,56.934569,55.930131,52.658297,49.299127,83.434498,66.268559,47.318777,10.967141,10.337349,9.282585,18.816304,7.880435,23.787324,0.001707,2.951087,279.620652,94325
Winter,63.126087,53.758696,44.418478,47.573864,42.296591,37.060227,87.307955,69.073864,47.630682,10.098913,9.111957,6.709783,12.842391,3.61413,18.783309,0.038804,2.613043,237.207609,94325
