In [2]:
import zipfile
import numpy as np
import pandas as pd
import matplotlib.cm
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import matplotlib.lines as mlines
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import time

import datetime as dt
print("modules imported")



plt.style.use('ggplot')

figurepath = "//users/sajudson/Dropbox/WPI/DS504/project/figures/"
datapath = "//users/sajudson/Dropbox/WPI/DS504/project/data/"


modules imported


  ## Load data

In [3]:
t0 = time.time()
#load dataframe from CSV file
bf = pd.read_csv(datapath+"citibikeNYC.csv")
print('file loaded')
#bf.columns
t1 = time.time()
print(t1-t0)

file loaded
147.95535516738892


## Create new columns

In [4]:
#features derived from datas

t0 = time.time()
bf['starttime']  = pd.to_datetime(bf.starttime)
bf['stoptime']   = pd.to_datetime(bf.stoptime)
bf['startdate']  = bf.starttime.dt.date
bf['startday']   = bf.starttime.dt.day
bf['startdow']   = bf.starttime.dt.dayofweek
bf['startmonth'] = bf.starttime.dt.month
bf['startyear']  = bf.starttime.dt.year


t1 = time.time()
print(t1-t0)

def firstdayofmonth(row):
    year = row['startyear']
    month = row['startmonth']
    return (dt.datetime(year, month,1))

t2 = time.time()
print(t2-t1, t2-t0)
bf['monthstart'] = bf.apply(lambda r: firstdayofmonth(r), axis=1)

bf['date_hour'] = bf.starttime.dt.floor('H')
bf['startyearmon']  = bf['startyear'] *100 + bf['startmonth']


t3 = time.time()
print(t3-t2,t3-t0)

1.1772551536560059
0.00024890899658203125 1.177504062652588
43.0862398147583 44.26374387741089


In [5]:
t4 = time.time()

bf['tripduration_min'] = bf['tripduration']/60
bf['subscriber'] = np.where(bf['usertype']=="Subscriber", 1,0)
bf['customer'] = np.where(bf['usertype']=="Customer", 1,0)

t5 = time.time()
print(t5-t4)
print(bf.shape)
print(bf.columns)

0.2109687328338623
(826012, 28)
Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender', 'distance', 'startdate', 'startday', 'startdow',
       'startmonth', 'startyear', 'monthstart', 'date_hour', 'startyearmon',
       'tripduration_min', 'subscriber', 'customer', 'speed_mph'],
      dtype='object')



## Data QA


In [6]:
#data cleaning
#remove nulls and pathological trips
#pathological trips are either longer than a day or more than 50 miles
# duration - longer than a day are "lost bikes" that are eventually returned to system
# distance - "long distance rides" have an invalid end station (reverts to lat=0, long = 0, yields a 5000+ mile ride)
t0 = time.time()

bf = bf.dropna()

maxtripduration = 24*60
maxdistance = 50
bf = bf[bf['distance']<maxdistance]
bf = bf[bf['tripduration_min']<maxtripduration]
bf.shape

t = time.time()
print(t-t0)


1.1874709129333496


In [7]:
#create separate data frames for customers and subscribers
t0 = time.time()
bf_customers = bf[bf['customer'] == 1]
bf_subscribers = bf[bf['subscriber']==1]
print(bf_customers.shape)
print(bf_subscribers.shape)

t = time.time()
print(t-t0)


(17647, 28)
(762745, 28)
0.09274697303771973


Aggregate data by hour

In [8]:
t0 = time.time()
aggFunction = {
    'tripduration_min': ['mean','max','sum'],
    'start station id':['count', 'nunique'],
    'bikeid':['nunique'],
    'distance': ['mean','max','sum'],
    'usertype':['count'],
    'subscriber':['sum'],
    'customer':['sum'],
    'speed_mph':['mean']}


def groupdf(df,groupparameter,aggFunction):
    df1 = df.groupby([groupparameter]).agg(aggFunction)
    df1.columns = ["_".join(x) for x in df1.columns.ravel()]
    print(df1.shape)
    return(df1)

#METRICS

def calcMetric(row,num_parameter,denom_parameter):
        num = row[num_parameter]
        denom = row[denom_parameter]
        if denom == 0: metric = 0
        else: metric = num/denom
        return (metric)

    
#User Metrics
def calcUserMetrics(df):
    print(df.shape)
    
    num_parameter1 = "distance_sum"
    num_parameter2 ='tripduration_min_sum'
    denom_parameter = 'usertype_count'
    def distancePerUserTrip(row):
        return(calcMetric(row,num_parameter1,denom_parameter))
        
    
    def durationPerUserTrip(row):
        return(calcMetric(row,num_parameter2,denom_parameter))
        
        
    #df['distancePerUser'] = df.apply(lambda r: distancePerUserTrip(r), axis=1)
    #df['durationPerUser'] = df.apply(lambda r: durationPerUserTrip(r), axis=1)
    df['distancePerUser'] = df['distance_sum']/df['usertype_count']
    df['durationPerUser'] = df['tripduration_min_sum']/df['usertype_count']
    print(df.shape)   
    return


#Bike Metrics

def calcBikeMetrics(df):
    print(df.shape)   
    num_parameter1 = 'distance_sum'
    num_parameter2 = 'tripduration_min_sum'
    denom_parameter = 'bikeid_nunique'
   
    def distancePerBikeid(row):
        calcMetric(row,num_parameter1,denom_parameter)
        return 

    def durationPerBikeid(row):
        calcMetric(row,num_parameter2,denom_parameter)
        return 
    
    #df['distancePerBike'] = df.apply(lambda r: distancePerBikeid(r), axis=1)
    #df['durationPerBike'] = df.apply(lambda r: durationPerBikeid(r), axis=1)
    df['distancePerBike'] = df['distance_sum']/df['bikeid_nunique']
    df['durationPerBike'] = df['tripduration_min_sum']/df['bikeid_nunique']
   
    
    print(df.shape)   
    return

t = time.time()
print(t-t0)


0.0005130767822265625


In [13]:
t0 = time.time()

groupParameter1 = 'date_hour'
groupParameter2 = 'startdate'
groupParameter3 = 'monthstart'
dfList = [bf,bf_subscribers,bf_customers] 

def calcMetrics(df):
    calcUserMetrics(df)
    calcBikeMetrics(df)
    return

def groupandmetrics(dfList, groupParameter, aggFunction, metricsFunction):
    dfoutlist = []
    for d in range(0,len(dfList)):
        dfoutlist.append(groupdf(dfList[d],groupParameter,aggFunction))
        calcMetrics(dfoutlist[d])
    return(dfoutlist)

bf_h,bf_subscribers_h,bf_customers_h = groupandmetrics(dfList, groupParameter1, aggFunction, calcMetrics)


bf_d,bf_subscribers_d,bf_customers_d = groupandmetrics(dfList, groupParameter2, aggFunction, calcMetrics)

bf_m,bf_subscribers_m,bf_customers_m = groupandmetrics(dfList, groupParameter3, aggFunction, calcMetrics)

t = time.time()
print(t-t0)

bf_hour.head()


(24253, 13)
(24253, 13)
(24253, 15)
(24253, 15)
(24253, 17)
(24240, 13)
(24240, 13)
(24240, 15)
(24240, 15)
(24240, 17)
(4020, 13)
(4020, 13)
(4020, 15)
(4020, 15)
(4020, 17)
(1068, 13)
(1068, 13)
(1068, 15)
(1068, 15)
(1068, 17)
(1068, 13)
(1068, 13)
(1068, 15)
(1068, 15)
(1068, 17)
(490, 13)
(490, 13)
(490, 15)
(490, 15)
(490, 17)
(36, 13)
(36, 13)
(36, 15)
(36, 15)
(36, 17)
(36, 13)
(36, 13)
(36, 15)
(36, 15)
(36, 17)
(21, 13)
(21, 13)
(21, 15)
(21, 15)
(21, 17)
2.634340286254883


Unnamed: 0_level_0,tripduration_min_mean,tripduration_min_max,tripduration_min_sum,start station id_count,start station id_nunique,bikeid_nunique,distance_mean,distance_max,distance_sum,usertype_count,subscriber_sum,customer_sum,speed_mph_mean,distancePerUser,durationPerUser,distancePerBike,durationPerBike
date_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-09-21 14:00:00,7.395833,13.1,29.583333,4,3,4,0.138968,0.555872,0.555872,4,4,0,1.109279,0.138968,7.395833,0.138968,7.395833
2015-09-21 15:00:00,15.197333,38.733333,379.933333,25,13,21,0.578797,1.378965,14.469915,25,25,0,3.493085,0.578797,15.197333,0.689044,18.092063
2015-09-21 16:00:00,14.247619,37.566667,299.2,21,14,16,0.810091,2.014926,17.011919,21,21,0,4.149,0.810091,14.247619,1.063245,18.7
2015-09-21 17:00:00,7.892667,21.166667,197.316667,25,10,23,0.516572,2.062002,12.914296,25,25,0,4.293618,0.516572,7.892667,0.561491,8.578986
2015-09-21 18:00:00,8.981609,36.983333,520.933333,58,17,49,0.461321,1.330277,26.756594,58,58,0,4.047302,0.461321,8.981609,0.546053,10.631293


In [22]:
t0 = time.time()



dflists = [[bf_h, bf_subscribers_h, bf_customers_h],
           [bf_d, bf_subscribers_d, bf_customers_d],
           [bf_m, bf_subscribers_m, bf_customers_m]]
interval = ['hourly','daily','monthly']
scope = ['all','subscribers','customers']

for i in range(0,len(dflists)):
    label1 = interval[i]
    for j in range(0,len(dflists[i])):
        label2 = scope[j]
        dflists[i][j].to_csv(datapath+"citibikejc"+"_"+interval[i]+"_"+scope[j]+".csv", index = True)

#bf_hour.
#bf_date.to_csv(datapath+"bf_date.csv", index = True)
#bf_month.to_csv(datapath+"bf_month.csv", index = True)
print("files saved")

t = time.time()
print(t-t0)

bf_date.head()


files saved
1.8472261428833008


Unnamed: 0_level_0,tripduration_min_mean,tripduration_min_max,tripduration_min_sum,start station id_count,start station id_nunique,bikeid_nunique,distance_mean,distance_max,distance_sum,usertype_count,subscriber_sum,customer_sum,speed_mph_mean,distancePerUser,durationPerUser,distancePerBike,durationPerBike
startdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-09-21,10.613142,38.883333,2409.183333,227,31,115,0.51495,2.270242,116.893685,227,227,0,3.795329,0.51495,10.613142,1.016467,20.94942
2015-09-22,16.278344,944.516667,7829.883333,481,32,208,0.664856,3.132514,319.795766,481,481,0,4.865501,0.664856,16.278344,1.53748,37.64367
2015-09-23,11.482236,189.383333,6625.25,577,34,218,0.679158,2.978173,391.874034,577,577,0,4.828717,0.679158,11.482236,1.797587,30.391055
2015-09-24,10.487932,250.783333,5967.633333,569,33,207,0.649871,3.874879,369.776455,569,569,0,5.017438,0.649871,10.487932,1.78636,28.829147
2015-09-25,11.446933,640.633333,6593.433333,576,31,204,0.641516,2.644657,369.513243,576,576,0,4.852021,0.641516,11.446933,1.811339,32.320752


In [None]:
#load dataframe from CSV file
bf_hour = pd.read_csv(datapath+"bf_hour.csv", index_col=0)
bf_date = pd.read_csv(datapath+"bf_date.csv", index_col=0)
bf_month = pd.read_csv(datapath+"bf_month.csv", index_col=0)

print('files loaded')
#bf.columns

bf_hour.head()

In [None]:

import hw3module as hw3
print("hw3module imported")
       
  

In [None]:
#plot key parmeters over time
chart1Parameters = {'x':'df.index',
                    'xlabel':'date',
                    'y1':'distancePerUser',
                    'y2': 'bikeid_nunique',
                    'title1':'Stations in Use',
                    'title2':'Bike in Use',
                    'y1label':'Stations',
                    'y2label':'Bikes',
                    'filename':"_stations_bikes"
                   }

chart2Parameters = {'x':'df.index',
                    'xlabel':'date',
                    'y1':'subscriber_sum',
                    'y2': 'customer_sum',
                    'title1':'Subscriber Trips',
                    'title2':'User Trips',
                    'y1label':'Trips',
                    'y2label':'Trips',
                    'filename':"_subscribers_customers"
                   }

chart3Parameters = {'x':'df.index',
                    'xlabel':'date',
                    'y1':'tripduration_min_sum',
                    'y2': 'distance_sum',
                    'title1':'Total of Trip Durations',
                    'title2':'Total Distance',
                    'y1label':'Minutes',
                    'y2label':'Miles',
                    'filename':"_tripduration_distance"
                   }
chartlist = [chart1Parameters, chart2Parameters, chart3Parameters]
t0 = time.time()

filepath = "monthly"
plotfunction = hw3.lplotter
hw3.plotChartList(bf_month, filepath, plotfunction, chartlist)
t1 = time.time()
print(t1-t0)

filepath = "daily"
hw3.plotChartList(bf_date, filepath, plotfunction, chartlist)
t2 = time.time()
print(t2-t0)


In [None]:
#plot metrics
def lplotter2(x1,x2,y1,y2,t1,xlabel,ylabel, filename, plottype = 'scatter'):
    
    years = mdates.YearLocator()   # every year
    months = mdates.MonthLocator()  # every month
    days = mdates.DayLocator()  # every day
    yearsFmt = mdates.DateFormatter('%Y')
    monthsFmt = mdates.DateFormatter('%m-%Y')
    daysFmt = mdates.DateFormatter('%d')

    lw =1
    alpha_default = 0.40
    color1 = "blue"
    color2 = "green"
    
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))

    def ytickformat(x):
        return '$%1.2f' % x
    
    # round to nearest month...
    datemin = np.datetime64(x1[0], 'M')
    datemax = np.datetime64(x1[-1], 'M') + np.timedelta64(1, 'M')

    if plottype == 'line':
        # plot y1
        ax.plot(x1,y1,linewidth =lw, color = color1, alpha = alpha_default*alphamultiplier, label="Subscribers")
        ax.plot(x2,y2,linewidth=lw*.5 ,color = color2, alpha = alpha_default, label = "Customers")
    else:
        ax.scatter(x1,y1,linewidth =lw, color = color1, alpha = alpha_default*alphamultiplier, label="Subscribers")
        ax.scatter(x2,y2,linewidth=lw*.5 ,color = color2, alpha = alpha_default, label = "Customers")
    
    ax.set_title(t1)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, max(np.max(y1),np.max(y2))*1.05)
    
    ax.set_xlabel(xlabel)
    ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
    ax.format_ydata = ytickformat
    ax.grid(True)
    # format the ticks
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(yearsFmt)
    ax.xaxis.set_minor_locator(months)
    ax.set_xlim(datemin, datemax)
    ax.legend()
        
    # rotates and right aligns the x labels, and moves the bottom of the
    # axes up to make room for them
    fig.autofmt_xdate()
    
    #save figure as PNG
    figfilename = figurepath+filename+ figext
    plt.savefig(figfilename, bbox_inches='tight', dpi = (300))

    plt.show()
    return()

def lplotter0(x1,y1,t1,xlabel,ylabel, filename, plottype = 'scatter'):
    
    years = mdates.YearLocator()   # every year
    months = mdates.MonthLocator()  # every month
    days = mdates.DayLocator()  # every day
    yearsFmt = mdates.DateFormatter('%Y')
    monthsFmt = mdates.DateFormatter('%m-%Y')
    daysFmt = mdates.DateFormatter('%d')

    lw =1
    alpha_default = 0.4
    color1 = "blue"
    color2 = "green"
    
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))

    def ytickformat(x):
        return '$%1.2f' % x
    
    # round to nearest month...
    datemin = np.datetime64(x1[0], 'M')
    datemax = np.datetime64(x1[-1], 'M') + np.timedelta64(1, 'M')

    # plot y1
    
    if plottype == "line":
        ax.plot(x1,y1,linewidth =lw, color = color1, alpha = alpha_default*alphamultiplier, label="Subscribers")
    else:
        ax.scatter(x1,y1,linewidth =lw, color = color1, alpha = alpha_default*alphamultiplier, label="Subscribers")
   
    ax.set_title(t1)

    ax.set_ylabel(ylabel)
    ax.set_ylim(0, np.max(y1)*1.05)
    
    ax.set_xlabel(xlabel)
    ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
    ax.format_ydata = ytickformat
    ax.grid(True)
    # format the ticks
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(yearsFmt)
    ax.xaxis.set_minor_locator(months)
    ax.set_xlim(datemin, datemax)
    ax.legend()
        
    # rotates and right aligns the x labels, and moves the bottom of the
    # axes up to make room for them
    fig.autofmt_xdate()
    
    #save figure as PNG
    figfilename = figurepath+filename+ figext
    plt.savefig(figfilename, bbox_inches='tight', dpi = (300))

    plt.show()
    return()

def scatter(x1,x2,t1,xlabel,ylabel, filename, plottype = 'scatter'):
    
    lw =1
    alpha_default = 0.4
    color1 = "blue"
    color2 = "green"
    
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))

    def ytickformat(x):
        return '$%1.2f' % x
    
    def xtickformat(x):
        return '$%1.2f' % x


    ax.scatter(x1,y1,linewidth =lw, color = color1, alpha = alpha_default*alphamultiplier, label="Subscribers")
   
    ax.set_title(t1)

    ax.set_ylabel(ylabel)
    ax.set_ylim(0, np.max(y1)*1.05)
    ax.set_xlim(0, np.max(x1)*1.05)
    
    ax.set_xlabel(xlabel)
    ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
    ax.format_ydata = ytickformat
    ax.grid(True)
    # format the ticks
    ax.legend()
        
    # rotates and right aligns the x labels, and moves the bottom of the
    # axes up to make room for them
    fig.autofmt_xdate()
    
    #save figure as PNG
    figfilename = figurepath+filename+ figext
    plt.savefig(figfilename, bbox_inches='tight', dpi = (300))

    plt.show()
    return()

dfList = [bf_date, bf_subscribers_d, bf_customers_d]

interval = "daily"
alphamultiplier = .25
metric = 'distancePerUser'
t1 = "Average Distance per User - Daily"
xlabel = "Date"
ylabel = "Miles"
filename = interval+"_"+metric+"sc"

x1 = dfList[1].index
x2 = dfList[2].index
y1 = dfList[1][metric]
y2 = dfList[2][metric]

hw3.lplotter2(x1,x2,y1,y2,t1,xlabel,ylabel, filename)

filename = interval+"_"+metric+"s"
lplotter0(x1,y1,t1,xlabel,ylabel, filename)

metric = 'durationPerUser'
t1 = "Average Duration per User - Daily"
xlabel = "Date"
ylabel = "Minutes"
filename = interval+"_"+metric+"sc"

x1 = dfList[1].index
x2 = dfList[2].index
y1 = dfList[1][metric]
y2 = dfList[2][metric]

hw3.lplotter2(x1,x2,y1,y2,t1,xlabel,ylabel, filename)

filename = interval+"_"+metric+"s"

hw3.lplotter0(x1,y1,t1,xlabel,ylabel, filename)

In [None]:
dfList = [bf_month, bf_subscribers_m, bf_customers_m]
alphamultiplier = 1
interval = "monthly"

metric = 'distancePerUser'
t1 = "Average Distance per User - Monthly"
xlabel = "Date"
ylabel = "Miles"
filename = interval+"_"+metric+"sc"

x1 = dfList[1].index
x2 = dfList[2].index
y1 = dfList[1][metric]
y2 = dfList[2][metric]

hw3.lplotter2(x1,x2,y1,y2,t1,xlabel,ylabel, filename)


filename = interval+"_"+metric+"s"
hw3.lplotter0(x1,y1,t1,xlabel,ylabel, filename,'line')

metric = 'durationPerUser'
t1 = "Average Duration per User - Monthly"
xlabel = "Date"
ylabel = "Minutes"
filename = interval+"_"+metric+"sc"

x1 = dfList[1].index
x2 = dfList[2].index
y1 = dfList[1][metric]
y2 = dfList[2][metric]

hw3.lplotter2(x1,x2,y1,y2,t1,xlabel,ylabel, filename)

filename = interval+"_"+metric+"s"
hw3.lplotter0(x1,y1,t1,xlabel,ylabel, filename,'line')

In [None]:
#plot more metrics

dfList = [bf_date, bf_subscribers_d, bf_customers_d]

interval = "daily"
alphamultiplier = .25
metric = 'distancePerBike'
t1 = "Average Distance per Bike - Daily"
xlabel = "Date"
ylabel = "Miles"
filename = interval+"_"+metric+"sc"

x1 = dfList[1].index
x2 = dfList[2].index
y1 = dfList[1][metric]
y2 = dfList[2][metric]

hw3.lplotter2(x1,x2,y1,y2,t1,xlabel,ylabel, filename)

filename = interval+"_"+metric+"s"
hw3.lplotter0(x1,y1,t1,xlabel,ylabel, filename)

metric = 'durationPerBike'
t1 = "Average Duration per Bike - Daily"
xlabel = "Date"
ylabel = "Minutes"
filename = interval+"_"+metric+"sc"

x1 = dfList[1].index
x2 = dfList[2].index
y1 = dfList[1][metric]
y2 = dfList[2][metric]

hw3.lplotter2(x1,x2,y1,y2,t1,xlabel,ylabel, filename)

filename = interval+"_"+metric+"s"

hw3.lplotter0(x1,y1,t1,xlabel,ylabel, filename)

In [None]:
metric1 = 'distancePerBike'
metric2 = 'speed_mph_mean'
t1 = "Average Speed per Bike vs Average Distance per Bike"
xlabel = "Miles"
ylabel = "MPH"
filename = interval+"_"+metric1+metric2+"all"

x1 = dfList[0][metric1]
y1 = dfList[0][metric2]
hw3.scatter(x1,x2,t1,xlabel,ylabel, filename)

In [None]:
dfList = [bf_month, bf_subscribers_m, bf_customers_m]
alphamultiplier = 1
interval = "monthly"

metric = 'distancePerBike'
t1 = "Average Monthly Distance per Bike"
xlabel = "Date"
ylabel = "Miles"
filename = interval+"_"+metric+"all"

x1 = dfList[0].index
y1 = dfList[0][metric]

hw3.lplotter0(x1,y1,t1,xlabel,ylabel, filename,'line')

metric = 'durationPerBike'
t1 = "Average Monthly Duration per Bike"
xlabel = "Date"
ylabel = "Minutes"
filename = interval+"_"+metric+"all"

x1 = dfList[0].index
y1 = dfList[0][metric]

hw3.lplotter0(x1,x2,t1,xlabel,ylabel, filename, 'line')

metric1 = 'durationPerBike'
metric2 = 'distancePerBike'
t1 = "Average Distance per Bike vs Average Duration per Bike"
xlabel = "Minutes"
ylabel = "Miles"
filename = interval+"_"+metric1+metric2+"all"

x1 = dfList[0][metric1]
y1 = dfList[0][metric2]
hw3.scatter(x1,x2,t1,xlabel,ylabel, filename)


In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))

# generate some random test data
all_data = [np.random.normal(0, std, 100) for std in range(6, 10)]

# plot violin plot
axes[0].violinplot(all_data,
                   showmeans=False,
                   showmedians=True)
axes[0].set_title('Violin plot')

# plot box plot
axes[1].violinplot(all_data)
axes[1].set_title('Violin plot 2')

# adding horizontal grid lines
for ax in axes:
    ax.yaxis.grid(True)
    ax.set_xticks([y + 1 for y in range(len(all_data))])
    ax.set_xlabel('Four separate samples')
    ax.set_ylabel('Observed values')

# add x-tick labels
plt.setp(axes, xticks=[y + 1 for y in range(len(all_data))],
         xticklabels=['x1', 'x2', 'x3', 'x4'])
plt.show()



In [None]:
#GRAPHS


#distribution
#  users by day
#  

#growth over time
#  users
#  bikes
#  stations

#Bike utilization
#  rides/active bikeid by day, by week, by month
#