In [56]:
import pandas as pd
from pandas import DataFrame, Series
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import random

import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

from datetime import date
from dateutil.relativedelta import relativedelta

In [57]:
TRIP_FILE = ('201508_trip_data.csv')
WEATHER = ('201508_weather_data.csv')
STATION_BALANCE = ('201508_status_data.csv')
STATION_DATA = ('201508_station_data.csv')

trip_data = pd.read_csv(TRIP_FILE)
weather_data = pd.read_csv(WEATHER)
station_balance_data = pd.read_csv(STATION_BALANCE)
station_data = pd.read_csv(STATION_DATA)

trip_data.head()

Unnamed: 0,Trip ID,Duration,Start Date,Start Station,Start Terminal,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code
0,913460,765,8/31/2015 23:26,Harry Bridges Plaza (Ferry Building),50,8/31/2015 23:39,San Francisco Caltrain (Townsend at 4th),70,288,Subscriber,2139
1,913459,1036,8/31/2015 23:11,San Antonio Shopping Center,31,8/31/2015 23:28,Mountain View City Hall,27,35,Subscriber,95032
2,913455,307,8/31/2015 23:13,Post at Kearny,47,8/31/2015 23:18,2nd at South Park,64,468,Subscriber,94107
3,913454,409,8/31/2015 23:10,San Jose City Hall,10,8/31/2015 23:17,San Salvador at 1st,8,68,Subscriber,95113
4,913453,789,8/31/2015 23:09,Embarcadero at Folsom,51,8/31/2015 23:22,Embarcadero at Sansome,60,487,Customer,9069


In [58]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1825 entries, 0 to 1824
Data columns (total 24 columns):
PDT                           1825 non-null object
Max TemperatureF              1821 non-null float64
Mean TemperatureF             1821 non-null float64
Min TemperatureF              1821 non-null float64
Max Dew PointF                1775 non-null float64
MeanDew PointF                1775 non-null float64
Min DewpointF                 1775 non-null float64
Max Humidity                  1775 non-null float64
 Mean Humidity                1775 non-null float64
 Min Humidity                 1775 non-null float64
 Max Sea Level PressureIn     1824 non-null float64
 Mean Sea Level PressureIn    1824 non-null float64
 Min Sea Level PressureIn     1824 non-null float64
 Max VisibilityMiles          1820 non-null float64
 Mean VisibilityMiles         1820 non-null float64
 Min VisibilityMiles          1820 non-null float64
 Max Wind SpeedMPH            1824 non-null float64
 Mean Wind

In [59]:
trip_data.rename(columns={'Start Terminal': 'station_id'}, inplace=True)
trip_data.head()

Unnamed: 0,Trip ID,Duration,Start Date,Start Station,station_id,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code
0,913460,765,8/31/2015 23:26,Harry Bridges Plaza (Ferry Building),50,8/31/2015 23:39,San Francisco Caltrain (Townsend at 4th),70,288,Subscriber,2139
1,913459,1036,8/31/2015 23:11,San Antonio Shopping Center,31,8/31/2015 23:28,Mountain View City Hall,27,35,Subscriber,95032
2,913455,307,8/31/2015 23:13,Post at Kearny,47,8/31/2015 23:18,2nd at South Park,64,468,Subscriber,94107
3,913454,409,8/31/2015 23:10,San Jose City Hall,10,8/31/2015 23:17,San Salvador at 1st,8,68,Subscriber,95113
4,913453,789,8/31/2015 23:09,Embarcadero at Folsom,51,8/31/2015 23:22,Embarcadero at Sansome,60,487,Customer,9069


In [60]:
station_data_sf = station_data[station_data.landmark == 'San Francisco']
station_data_sf.head()

Unnamed: 0,station_id,name,lat,long,dockcount,landmark,installation
32,41,Clay at Battery,37.795001,-122.39997,15,San Francisco,8/19/2013
33,42,Davis at Jackson,37.79728,-122.398436,15,San Francisco,8/19/2013
34,45,Commercial at Montgomery,37.794231,-122.402923,15,San Francisco,8/19/2013
35,46,Washington at Kearney,37.795425,-122.404767,15,San Francisco,8/19/2013
36,47,Post at Kearney,37.788975,-122.403452,19,San Francisco,8/19/2013


In [61]:
trip_data_merged = trip_data.merge(station_data, on='station_id')
#trip_data.join(station_data, on='station_id')
trip_data.head()

Unnamed: 0,Trip ID,Duration,Start Date,Start Station,station_id,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code
0,913460,765,8/31/2015 23:26,Harry Bridges Plaza (Ferry Building),50,8/31/2015 23:39,San Francisco Caltrain (Townsend at 4th),70,288,Subscriber,2139
1,913459,1036,8/31/2015 23:11,San Antonio Shopping Center,31,8/31/2015 23:28,Mountain View City Hall,27,35,Subscriber,95032
2,913455,307,8/31/2015 23:13,Post at Kearny,47,8/31/2015 23:18,2nd at South Park,64,468,Subscriber,94107
3,913454,409,8/31/2015 23:10,San Jose City Hall,10,8/31/2015 23:17,San Salvador at 1st,8,68,Subscriber,95113
4,913453,789,8/31/2015 23:09,Embarcadero at Folsom,51,8/31/2015 23:22,Embarcadero at Sansome,60,487,Customer,9069


In [62]:
trip_data_sf = trip_data_merged[trip_data_merged.landmark == 'San Francisco']
trip_data_sf.head()

Unnamed: 0,Trip ID,Duration,Start Date,Start Station,station_id,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code,name,lat,long,dockcount,landmark,installation
0,913460,765,8/31/2015 23:26,Harry Bridges Plaza (Ferry Building),50,8/31/2015 23:39,San Francisco Caltrain (Townsend at 4th),70,288,Subscriber,2139,Harry Bridges Plaza (Ferry Building),37.795392,-122.394203,23,San Francisco,8/20/2013
1,913415,274,8/31/2015 20:53,Harry Bridges Plaza (Ferry Building),50,8/31/2015 20:58,Embarcadero at Bryant,54,524,Subscriber,94105,Harry Bridges Plaza (Ferry Building),37.795392,-122.394203,23,San Francisco,8/20/2013
2,913349,559,8/31/2015 20:00,Harry Bridges Plaza (Ferry Building),50,8/31/2015 20:09,2nd at Townsend,61,587,Subscriber,94107,Harry Bridges Plaza (Ferry Building),37.795392,-122.394203,23,San Francisco,8/20/2013
3,913228,886,8/31/2015 18:47,Harry Bridges Plaza (Ferry Building),50,8/31/2015 19:02,San Francisco Caltrain (Townsend at 4th),70,387,Subscriber,94107,Harry Bridges Plaza (Ferry Building),37.795392,-122.394203,23,San Francisco,8/20/2013
4,913166,591,8/31/2015 18:25,Harry Bridges Plaza (Ferry Building),50,8/31/2015 18:35,Powell at Post (Union Square),71,291,Subscriber,94111,Harry Bridges Plaza (Ferry Building),37.795392,-122.394203,23,San Francisco,8/20/2013


In [63]:
trip_data_sf.rename(columns={'lat': 'start_station_lat', 'long': 'start_station_long'}, inplace=True)
trip_data_sf.drop('name', axis=1, inplace=True)
trip_data_sf.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,Trip ID,Duration,Start Date,Start Station,station_id,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code,start_station_lat,start_station_long,dockcount,landmark,installation
0,913460,765,8/31/2015 23:26,Harry Bridges Plaza (Ferry Building),50,8/31/2015 23:39,San Francisco Caltrain (Townsend at 4th),70,288,Subscriber,2139,37.795392,-122.394203,23,San Francisco,8/20/2013
1,913415,274,8/31/2015 20:53,Harry Bridges Plaza (Ferry Building),50,8/31/2015 20:58,Embarcadero at Bryant,54,524,Subscriber,94105,37.795392,-122.394203,23,San Francisco,8/20/2013
2,913349,559,8/31/2015 20:00,Harry Bridges Plaza (Ferry Building),50,8/31/2015 20:09,2nd at Townsend,61,587,Subscriber,94107,37.795392,-122.394203,23,San Francisco,8/20/2013
3,913228,886,8/31/2015 18:47,Harry Bridges Plaza (Ferry Building),50,8/31/2015 19:02,San Francisco Caltrain (Townsend at 4th),70,387,Subscriber,94107,37.795392,-122.394203,23,San Francisco,8/20/2013
4,913166,591,8/31/2015 18:25,Harry Bridges Plaza (Ferry Building),50,8/31/2015 18:35,Powell at Post (Union Square),71,291,Subscriber,94111,37.795392,-122.394203,23,San Francisco,8/20/2013


In [74]:
trip_data_sf = trip_data_sf.merge(station_data[['', left_on = 'End Terminal', right_on = 'station_id')

trip_data_sf

Unnamed: 0,Trip ID,Duration,Start Date,Start Station,station_id_x,End Date,End Station,End Terminal,Bike #,Subscriber Type,...,lat_x,long_x,dockcount_y,station_id,name_y,lat_y,long_y,dockcount,landmark,installation
0,913460,765,8/31/2015 23:26,Harry Bridges Plaza (Ferry Building),50,8/31/2015 23:39,San Francisco Caltrain (Townsend at 4th),70,288,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
1,913228,886,8/31/2015 18:47,Harry Bridges Plaza (Ferry Building),50,8/31/2015 19:02,San Francisco Caltrain (Townsend at 4th),70,387,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
2,913009,725,8/31/2015 17:40,Harry Bridges Plaza (Ferry Building),50,8/31/2015 17:53,San Francisco Caltrain (Townsend at 4th),70,360,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
3,912956,1034,8/31/2015 17:31,Harry Bridges Plaza (Ferry Building),50,8/31/2015 17:48,San Francisco Caltrain (Townsend at 4th),70,587,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
4,912776,871,8/31/2015 16:50,Harry Bridges Plaza (Ferry Building),50,8/31/2015 17:04,San Francisco Caltrain (Townsend at 4th),70,371,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
5,912720,649,8/31/2015 16:33,Harry Bridges Plaza (Ferry Building),50,8/31/2015 16:44,San Francisco Caltrain (Townsend at 4th),70,604,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
6,912718,641,8/31/2015 16:28,Harry Bridges Plaza (Ferry Building),50,8/31/2015 16:39,San Francisco Caltrain (Townsend at 4th),70,326,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
7,912471,920,8/31/2015 13:06,Harry Bridges Plaza (Ferry Building),50,8/31/2015 13:22,San Francisco Caltrain (Townsend at 4th),70,387,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
8,912046,666,8/31/2015 8:50,Harry Bridges Plaza (Ferry Building),50,8/31/2015 9:01,San Francisco Caltrain (Townsend at 4th),70,593,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013
9,911149,1106,8/29/2015 19:39,Harry Bridges Plaza (Ferry Building),50,8/29/2015 19:57,San Francisco Caltrain (Townsend at 4th),70,344,Subscriber,...,37.776617,-122.395260,19,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.395260,19,San Francisco,8/23/2013


In [76]:
list(trip_data_sf.columns.values)

['Trip ID',
 'Duration',
 'Start Date',
 'Start Station',
 'station_id_x',
 'End Date',
 'End Station',
 'End Terminal',
 'Bike #',
 'Subscriber Type',
 'Zip Code',
 'start_station_lat',
 'start_station_long',
 'dockcount_x',
 'station_id_y',
 'name_x',
 'lat_x',
 'long_x',
 'dockcount_y',
 'station_id_x',
 'name_y',
 'lat_y',
 'long_y',
 'dockcount_x',
 'station_id_y',
 'name_x',
 'lat_x',
 'long_x',
 'dockcount_y',
 'station_id_x',
 'name_y',
 'lat_y',
 'long_y',
 'dockcount_x',
 'station_id_y',
 'name_x',
 'lat_x',
 'long_x',
 'dockcount_y',
 'station_id',
 'name_y',
 'lat_y',
 'long_y',
 'dockcount',
 'landmark',
 'installation']

In [5]:
#Converting 'str' objects to 'int'in col='Duration'
#list(map(int, trip_data['Duration']))

In [32]:
#Converting to datetime
trip_data['Start Date'] = pd.to_datetime(trip_data['Start Date'])
trip_data['End Date'] = pd.to_datetime(trip_data['End Date'])

In [33]:
weather_data['PDT'] = pd.to_datetime(weather_data['PDT'])

In [35]:
station_balance_data.time = pd.to_datetime(station_balance_data.time)