In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Based on:
* [CitiBike System Data Stations](https://www.kaggle.com/konstantinosalatzas/citibike-system-data-stations)
* [CitiBike System Data Bike Transfers](https://www.kaggle.com/konstantinosalatzas/citibike-system-data-bike-transfers)

Define the *availability* $U(s,t)$ of a station $s$ at a moment in time $t$ as
* $$U(s,0)=0, \forall s$$
* $$U(s,t)=U(s,t-\delta t)+Arrivals(s,t)-Departures(s,t), \forall s, \forall t>0$$
where
* $Arrivals(s,t)$ is the number of observations with *end station* $s$ and *stoptime* $t$
* $Departures(s,t)$ is the number of observations with *start station* $s$ and *starttime* $t$

As we know the latitude and longitude for each station $s$, we can express $U$ as
$$U(s,t)=U(x,y,t)$$
where $x,y$ are the geographical coordinates of station $s$.

In [None]:
# Load the raw trips and give every rental a 1-based id derived from its row position.
ds = (
    pd.read_csv('../input/bikeshare-analysis/NYC-CitiBike-2016.csv')
    .assign(rentalid=lambda trips: trips.index + 1)
)
ds.head()

We create a dataframe about $Departures(s,t)$ for each station $s$ and time $t$, grouped by station and sorted by time (ascending).

In [None]:
# One row per departure event: group on every descriptive column (including the
# per-rental `rentalid`) and count the rows in each group.
# NOTE(review): groupby drops rows that have NaN in any grouping key by default,
# so trips missing one of these fields are excluded here -- confirm this is intended.
# NOTE(review): `starttime` is sorted as read from the CSV; if it is a plain string
# the order is lexicographic, not chronological -- TODO confirm the format.
start = (
    ds.groupby(['start station id', 'starttime', 'start station name', 'start station latitude',
                'start station longitude', 'bikeid', 'usertype', 'birth year', 'gender', 'rentalid'])
    .size()
    .reset_index(name='counts')
    .sort_values(['start station id', 'starttime'])
    .assign(event="Departure")
)
start.head()

We create a dataframe about $Arrivals(s,t)$ for each station $s$ and time $t$, grouped by station and sorted by time (ascending).

In [None]:
# One row per arrival event: group on every descriptive column (including the
# per-rental `rentalid`) and count the rows in each group.
# NOTE(review): groupby drops rows that have NaN in any grouping key by default,
# so trips with a missing end station (or other missing field) are excluded here.
end = (
    ds.groupby(['end station id', 'stoptime', 'end station name', 'end station latitude',
                'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender', 'rentalid'])
    .size()
    .reset_index(name='counts')
    .sort_values(['end station id', 'stoptime'])
    .assign(event="Arrival")
)
end.head()

We negate the *departure* counts so that summing the counts of all events at a station directly yields $Arrivals - Departures$ (remember the definition of $U$).

In [None]:
# Store departures as negative counts so that a plain sum over events gives
# arrivals minus departures. `-abs()` instead of a bare negation keeps the cell
# idempotent: running it a second time no longer flips the sign back to positive.
start['counts'] = -start['counts'].abs()
start.head()

In [None]:
# Store the end station id as a 64-bit integer column.
end = end.astype({'end station id': 'int64'})
end.head()

As *departures* and *arrivals* now have the same structure, we rename the corresponding columns to common names and append the dataframes.

In [None]:
# Give departures and arrivals the same column names so they can be stacked.
start = start.rename(columns={"start station id":"station id", "starttime":"time", "start station name":"station name",
                              "start station latitude":"station latitude", "start station longitude":"station longitude"})
end = end.rename(columns={"end station id":"station id", "stoptime":"time", "end station name":"station name",
                          "end station latitude":"station latitude", "end station longitude":"station longitude"})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# (departures first, then arrivals, original indexes kept).
U = pd.concat([start, end])
U.head()

We group by station and sort by time (ascending).

In [None]:
# Order the combined events by station, then by event time within each station.
U = U.sort_values(by=['station id', 'time'])
U.head()

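We rename *counts* to *availability* and tag these rows as rentals. For now the column holds the signed count of each event (remember that *departures* were stored as their opposites); the running sum for each station is calculated later, once the transfer events have been appended.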

In [None]:
# `counts` becomes `availability`, and every row so far is marked as
# originating from a customer rental.
U = U.rename(columns={"counts": "availability"}).assign(actor="Rental")
U.head()

Assume we have a table with *bikeid*, *starttime*, *stoptime*, *start station id*, *end station id*, grouped by *bikeid* and sorted by *starttime* ascending.

Let $i$ be a row number such that
* $$bikeid(i)=bikeid(i+1)$$
* $$endstationid(i)\neq startstationid(i+1)$$
We call that a *discontinuity* in the bike's trip.

Intuitively, a discontinuity appears when a bike begins a trip from a start station which is different from the end station of its previous trip.

Assuming that the data is complete, we can hypothesize that the company redistributes the bikes by transferring them from stations with high *availability* to stations with low *availability*.

In [None]:
# Read the raw trips again into a separate frame used for the bike-transfer analysis.
trip = pd.read_csv(filepath_or_buffer='../input/bikeshare-analysis/NYC-CitiBike-2016.csv')
trip.head()

In [None]:
# Columns kept for the discontinuity analysis, in display order.
trip_columns = ['bikeid', 'starttime', 'stoptime', 'start station id', 'start station name',
                'start station latitude', 'start station longitude', 'end station id',
                'end station name', 'end station latitude', 'end station longitude',
                'usertype', 'birth year', 'gender']

# Keep those columns, drop trips without an end station, then store the end
# station id as a 64-bit integer.
trip = (
    trip.loc[:, trip_columns]
    .dropna(subset=['end station id'])
    .astype({'end station id': 'int64'})
)
trip.head()

We group by *bikeid* and sort by *starttime*, *stoptime* ascending.

In [None]:
# Put each bike's trips next to each other, ordered by start time and then stop time.
trip = trip.sort_values(by=['bikeid', 'starttime', 'stoptime'])
trip.head()

Let $i$ be a row number such that $bikeid(i)=bikeid(i+1)$.

Define
$$flag(i+1)=\left\{\begin{matrix}0,startstation(i+1)=endstation(i)\\1,startstation(i+1)\neq endstation(i)\end{matrix}\right.$$
Thus, $flag(i+1)=1$ if and only if there is a *discontinuity* between the $i$-th and ($i+1$)-th trip of the bike.

In [None]:
# Work on a copy so that adding `flag` does not also modify `trip` through an alias.
air = trip.copy()

# Bike and end station of the row directly above. `trip` is sorted by bikeid and
# then starttime, so for every row except a bike's first trip this is that bike's
# previous trip.
prev_bike = air['bikeid'].shift(periods=1)
prev_end_station = air['end station id'].shift(periods=1)

# A discontinuity needs both conditions from the definition:
#   * the previous row belongs to the same bike (a bike's first trip has no
#     predecessor and is never flagged), and
#   * the start station differs from the previous end station. The comparison is
#     "not equal" rather than "difference > 0", so moves to a station with a
#     smaller id are flagged too.
same_bike = air['bikeid'].eq(prev_bike)
station_changed = air['start station id'].ne(prev_end_station)
air['flag'] = (same_bike & station_changed).astype(np.int64)
air.head()

In [None]:
# Columns that describe where and when a trip ended.
prev_trip_cols = ['end station id', 'stoptime', 'end station name',
                  'end station latitude', 'end station longitude']

# Replace each row's own end-of-trip data with that of the same bike's previous
# trip. Shifting inside each bikeid group (instead of over the whole frame) means
# a bike's first trip gets NaN rather than the last trip of a different bike.
air = air.copy()
air[prev_trip_cols] = air.groupby('bikeid')[prev_trip_cols].shift(periods=1)

# Rows without a previous trip have nothing to be transferred from.
air = air.dropna(subset=['end station id'])
air = air.astype({'end station id': np.int64})

# Keep only the discontinuities, i.e. the inferred bike transfers.
air = air[air['flag'] > 0]
air.head()

In [None]:
# A transfer delivers the bike to the station where its next trip starts:
# one "Arrival" event (+flag) at that start station and start time.
arrivals = (
    air[['start station id', 'starttime', 'start station name', 'start station latitude',
         'start station longitude', 'bikeid', 'flag']]
    .rename(columns={"start station id":"station id", "starttime":"time", "flag":"counts",
                     "start station name":"station name", "start station latitude":"station latitude",
                     "start station longitude":"station longitude"})
    .assign(event="Arrival")
)

# ...and removes it from the station where its previous trip ended:
# one "Departure" event (-flag) at that end station and stop time.
# `.assign` builds a new frame, so no value is written into a slice of `air`.
departures = (
    air[['end station id', 'stoptime', 'end station name', 'end station latitude',
         'end station longitude', 'bikeid', 'flag']]
    .assign(flag=lambda moves: -moves['flag'])
    .rename(columns={"end station id":"station id", "stoptime":"time", "flag":"counts",
                     "end station name":"station name", "end station latitude":"station latitude",
                     "end station longitude":"station longitude"})
    .assign(event="Departure")
)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
air = pd.concat([arrivals, departures])
air.head()

In [None]:
# Order the transfer events by station, then by event time within each station.
air = air.sort_values(by=['station id', 'time'])
air.head()

In [None]:
# Use the same column name as the rental events and mark these rows as transfers.
air = air.rename(columns={"counts": "availability"}).assign(actor="Transfer")
air.head()

In [None]:
# Add the transfer events to the rental events. DataFrame.append was removed in
# pandas 2.0; pd.concat stacks the rows the same way, and columns that only the
# rental rows have are filled with NaN for the transfer rows.
U = pd.concat([U, air])

# Re-sort so each station's events are in time order.
U = U.sort_values(['station id', 'time'])
U.head()

In [None]:
# Running total of the signed event counts within each station, in the current row order.
running_total = U.groupby('station id')['availability'].cumsum()
U['availability'] = running_total
U.head()

In [None]:
# Write the final table to the working directory, without the index column.
output = U
output.to_csv(path_or_buf='NYC-CitiBike-2016.csv', index=False)