In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

In [194]:
# Load the October 2023 Divvy trip export.
data = pd.read_csv('202310-divvy-tripdata.csv')

# Keep only trips whose start AND end station ids parse as numbers.
# Missing ids coerce to NaN as well, so they are filtered out by the same masks.
numeric_start = pd.to_numeric(data['start_station_id'], errors='coerce').notna()
numeric_end = pd.to_numeric(data['end_station_id'], errors='coerce').notna()
data = data[numeric_start & numeric_end]

# Explicit guard against missing station ids on either end of the trip.
data = data.dropna(subset=['start_station_id', 'end_station_id'])

In [195]:

# Number of trips departing from / arriving at each station, busiest first.
# The count Series is named explicitly rather than renaming the auto-generated
# 'count' column afterwards: that label only exists on pandas >= 2.0, and on
# older versions the rename silently does nothing.
start = data['start_station_id'].value_counts().rename('startcount').to_frame()
end = data['end_station_id'].value_counts().rename('endcount').to_frame()
start.head()

Unnamed: 0_level_0,startcount
start_station_id,Unnamed: 1_level_1
13022,2230
13300,1883
13216,1310
15544,1284
13008,1258


In [196]:
# `end` has a single column of arrival counts. Name it 'endcount' whatever
# label value_counts() gave it (the label differs between pandas versions),
# so the rename can never silently do nothing.
end = end.rename(columns={end.columns[0]: 'endcount'})
end.head()

Unnamed: 0_level_0,endcount
end_station_id,Unnamed: 1_level_1
13022,2546
13300,1703
13008,1403
13216,1305
13042,1289


In [197]:
# One row per station id with its departure and arrival counts side by side.
# concat aligns on the station id index, so a station that only ever appears
# as a start (or only as an end) gets NaN in the other column.
stations = pd.concat([start['startcount'], end['endcount']], axis=1)

# Treat a missing side as zero trips; a plain `+` would make the total NaN
# for those stations.
stations['total'] = stations['startcount'].add(stations['endcount'], fill_value=0)
stations.head(10)

Unnamed: 0,startcount,endcount,total
13022,2230.0,2546.0,4776.0
13300,1883.0,1703.0,3586.0
13216,1310.0,1305.0,2615.0
15544,1284.0,1114.0,2398.0
13008,1258.0,1403.0,2661.0
13042,1150.0,1289.0,2439.0
13217,1092.0,1067.0,2159.0
13011,1004.0,943.0,1947.0
638,988.0,1047.0,2035.0
13061,965.0,964.0,1929.0


In [198]:
# Order stations by combined traffic (busiest first) and keep the ids of the
# first ten as a plain list.
ranked = stations.sort_values('total', ascending=False)
stats = ranked.index[:10].tolist()
print(stats)

['13022', '13300', '13008', '13216', '13042', '15544', '13217', '638', '13011', '13061']


In [199]:
# Restrict the trips to those that touch one of the top stations,
# either as the origin or as the destination.
starts_at_top = data['start_station_id'].isin(stats)
ends_at_top = data['end_station_id'].isin(stats)
divd = data[starts_at_top | ends_at_top]
divd.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
603,A1B0885683D4B4CC,classic_bike,2023-10-13 18:31:25,2023-10-13 18:54:00,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193,41.892278,-87.612043,41.921822,-87.64414,member
608,8BECFAB67F1FA81D,classic_bike,2023-10-22 17:44:50,2023-10-22 18:09:05,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193,41.892278,-87.612043,41.921822,-87.64414,casual
1042,BE2395919F04F1E0,electric_bike,2023-10-28 15:56:13,2023-10-28 16:15:34,Streeter Dr & Grand Ave,13022,Aberdeen St & Randolph St,18062,41.892319,-87.612234,41.884114,-87.654264,member
1198,8527F766D559425F,electric_bike,2023-10-28 15:55:45,2023-10-28 16:15:36,Streeter Dr & Grand Ave,13022,Aberdeen St & Randolph St,18062,41.892313,-87.612226,41.884114,-87.654264,member
1200,A4A3170B63E200BB,electric_bike,2023-10-23 18:26:25,2023-10-23 18:41:07,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193,41.892294,-87.612193,41.921822,-87.64414,member


In [200]:
# Show the column labels of the filtered trips frame.
print(divd.columns)

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')


In [201]:
# Keep only the timestamps and station name/id columns; ride_id, rideable_type,
# start/end lat/lng and member_casual are not needed.
keep_cols = [
    'started_at', 'ended_at',
    'start_station_name', 'start_station_id',
    'end_station_name', 'end_station_id',
]
# .copy() makes divd an independent frame rather than a slice of `data`, so
# adding columns to it later does not raise SettingWithCopyWarning.
divd = divd[keep_cols].copy()
divd.head()

Unnamed: 0,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id
603,2023-10-13 18:31:25,2023-10-13 18:54:00,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193
608,2023-10-22 17:44:50,2023-10-22 18:09:05,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193
1042,2023-10-28 15:56:13,2023-10-28 16:15:34,Streeter Dr & Grand Ave,13022,Aberdeen St & Randolph St,18062
1198,2023-10-28 15:55:45,2023-10-28 16:15:36,Streeter Dr & Grand Ave,13022,Aberdeen St & Randolph St,18062
1200,2023-10-23 18:26:25,2023-10-23 18:41:07,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193


In [202]:
# Replace the raw timestamps with calendar features of the trip start:
# weekday name, month name and hour of day.
started = pd.to_datetime(divd['started_at'])

# Zero-padded two-character hour string ('00'..'23'), computed vectorised.
hour_of_day = started.dt.strftime('%H')

# Build a new frame instead of assigning columns / dropping in place: this
# avoids SettingWithCopyWarning when divd is a slice of another frame.
# 'hours' and 'hour' hold the same values; both names are kept because the
# grouping step groups by 'hours' and counts 'hour'.
divd = (
    divd
    .assign(
        day=started.dt.day_name(),
        month=started.dt.month_name(),
        hours=hour_of_day,
        hour=hour_of_day,
    )
    .drop(columns=['started_at', 'ended_at'])
)
divd.head()

Unnamed: 0,start_station_name,start_station_id,end_station_name,end_station_id,day,month,hours,hour
603,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193,Friday,October,18,18
608,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193,Sunday,October,17,17
1042,Streeter Dr & Grand Ave,13022,Aberdeen St & Randolph St,18062,Saturday,October,15,15
1198,Streeter Dr & Grand Ave,13022,Aberdeen St & Randolph St,18062,Saturday,October,15,15
1200,Streeter Dr & Grand Ave,13022,Larrabee St & Webster Ave,13193,Monday,October,18,18


In [209]:
# Count trips per (starting station, hour of day).
# rename() returns a new frame rather than modifying in place, so it is part
# of the assigned chain; the counted column ends up named 'count'.
divd = (
    divd.groupby(['start_station_name', 'hours'])['hour']
    .count()
    .to_frame()
    .rename(columns={'hour': 'count'})
)
divd.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hour
start_station_name,hours,Unnamed: 2_level_1
63rd St Beach,10,1
900 W Harrison St,5,1
900 W Harrison St,6,1
900 W Harrison St,8,1
900 W Harrison St,9,1
