Station elasticity
===========

We need to find the elasticity for each pair of stations and store the results in a large dataframe or matrix. Every time we get a new shock event, we can add its results to the table as well, which is great! Right now, the only shock event I have looked at is the opening of the Q, and for that event I'll assume that all of the stations share the same resulting elasticity with the other stations, since there is no way to separate them out.

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import pymc3 as pm

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

# Connection settings for the local PostgreSQL source database.
user = 'mikemoran'
database = 'stations'

# Use the ``postgresql://`` scheme: SQLAlchemy 1.4+ no longer accepts the
# legacy ``postgres://`` alias and raises NoSuchModuleError for it.
engine = create_engine(f'postgresql://{user}@localhost/{database}')
# Raw DB-API connection, handed to ``pd.read_sql`` in the cells below.
conn = psycopg2.connect(database=database, user=user)

In [2]:
# Narrow variant of the lookup (unit -> control area only); not executed here.
query = '''
    select unit, c_a from station_info
        order by unit asc;
'''

# Full station metadata.
query_2 = '''
    select * from station_info;
'''

# Build the lookup table in one chain: de-duplicate the rows, discard the
# stored 'index' column, and key the frame by turnstile unit.
station_details = (
    pd.read_sql(query_2, conn)
      .drop_duplicates()
      .drop('index', axis=1)
      .set_index('unit')
)
# station_details[station_details.station.str.contains('FULTON')]
station_details.head()

Unnamed: 0_level_0,station,linename,c_a
unit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R001,SOUTH FERRY,1R,R101
R001,SOUTH FERRY,1RW,R101
R001,WHITEHALL S-FRY,R1,A058
R001,WHITEHALL S-FRY,R1,A060
R001,WHITEHALL S-FRY,R1W,A058


In [3]:
# Control areas (c_a) recorded for unit R001.
station_details.loc['R001', 'c_a'].values

array(['R101', 'R101', 'A058', 'A060', 'A058', 'A060', 'A058', 'A060'], dtype=object)

In [4]:
def read_ca(table, conn):
    """Load one control-area table and aggregate it into 4-hour totals.

    Parameters
    ----------
    table : str
        Name of the table to read. It is interpolated into the SQL text as a
        double-quoted identifier, so it must come from a trusted source.
    conn : DB-API connection or SQLAlchemy connectable
        Passed straight to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        ``entries`` and ``exits`` columns, indexed by ``date_time`` and
        resampled into consecutive 4-hour bins.
    """
    query = '''
        select date_time, scp,
                sum(abs(dentries)) entries,
                sum(abs(dexits)) exits
            from "{}"
            group by date_time, scp;
    '''
    counts = ['entries', 'exits']
    # parse_dates guarantees a DatetimeIndex even when the driver hands the
    # timestamps back as text; resample() below requires one.
    df = (pd.read_sql(query.format(table), conn, parse_dates=['date_time'])
            .set_index('date_time', drop=True)
            .sort_index())
    # Force a float dtype so that NaN can be written into the count columns.
    df[counts] = df[counts].astype(float).abs()
    # Per-device readings above 100000 in one interval are treated as bad
    # data. Only the count columns are blanked; ``scp`` is left untouched.
    implausible = (df.entries > 100000) | (df.exits > 100000)
    df.loc[implausible, counts] = np.nan  # np.NaN was removed in NumPy 2.0
    # Collapse the individual devices (scp) into one total per timestamp.
    df_summed = df.groupby(level='date_time')[counts].sum()
    # A Timedelta rule avoids the deprecated '4H' alias and works across
    # pandas versions; .sum() replaces the deprecated apply(builtin sum).
    df_regularized = df_summed.resample(pd.Timedelta(hours=4)).sum()
    return df_regularized

# Smoke test: load a single control-area table to confirm the loader runs.
test = read_ca('R101', conn)

In [5]:
# Load the three control areas listed for unit R001, in a fixed order.
r101, a058, a060 = (read_ca(ca, conn) for ca in ('R101', 'A058', 'A060'))

# Show the first rows of each frame, one block per control area.
print('\n'.join(str(frame.head()) for frame in (r101, a058, a060)))

                     entries   exits
date_time                           
2012-05-26 00:00:00      NaN     NaN
2012-05-26 04:00:00    175.0   381.0
2012-05-26 08:00:00    730.0  1252.0
2012-05-26 12:00:00   2639.0  6306.0
2012-05-26 16:00:00   5338.0  4458.0
                     entries  exits
date_time                          
2010-04-17 00:00:00      NaN    NaN
2010-04-17 04:00:00      1.0    0.0
2010-04-17 08:00:00      0.0    0.0
2010-04-17 12:00:00      0.0    0.0
2010-04-17 16:00:00      0.0    0.0
                     entries  exits
date_time                          
2010-04-17 00:00:00      NaN    NaN
2010-04-17 04:00:00      0.0    8.0
2010-04-17 08:00:00      1.0    5.0
2010-04-17 12:00:00      1.0    9.0
2010-04-17 16:00:00      1.0    5.0


In [6]:
# Outer-join the three control-area frames on their timestamp index; the
# suffixes keep the identically named count columns apart.
test = r101.join(a058, rsuffix='1', how='outer')
test = test.join(a060, rsuffix='2', how='outer')

# Row-wise totals across the three control areas.
test['enter'] = test.loc[:, ['entries', 'entries1', 'entries2']].sum(axis=1)
test['exit'] = test.loc[:, ['exits', 'exits1', 'exits2']].sum(axis=1)

# Keep only the combined totals.
test_i = test.drop(
    ['entries', 'entries1', 'entries2', 'exits', 'exits1', 'exits2'],
    axis=1,
)
# test_i

In [7]:
def combine_frames(frames):
    """Outer-join several entries/exits frames and total them row by row.

    Parameters
    ----------
    frames : iterable of pandas.DataFrame
        Frames sharing an index, each with columns whose names start with
        ``entries`` and ``exits``. Neither the iterable nor the frames in it
        are modified.

    Returns
    -------
    pandas.DataFrame
        A new frame on the union of all indexes with an ``enter`` column (sum
        of every ``entries*`` column) and an ``exit`` column (sum of every
        ``exits*`` column). NaN values are skipped when summing.

    Raises
    ------
    IndexError
        If ``frames`` is empty.
    """
    # Work on a private copy: the previous ``frames.pop(0)`` silently removed
    # the first element from the caller's list.
    frames = list(frames)
    combined = frames[0]
    for i, df in enumerate(frames[1:]):
        # The numeric suffix disambiguates the repeated column names.
        combined = combined.join(df, rsuffix=str(i), how='outer')
    entry_cols = [col for col in combined.columns if col.startswith('entries')]
    exit_cols = [col for col in combined.columns if col.startswith('exits')]
    # drop()/assign() return new objects, so with a single input frame the
    # caller's DataFrame is no longer given extra columns in place.
    return (combined.drop([*entry_cols, *exit_cols], axis=1)
                    .assign(enter=combined[entry_cols].sum(axis=1),
                            exit=combined[exit_cols].sum(axis=1)))

In [8]:
units = station_details.index.unique()
# Map each unit to its distinct control areas (c_a), in order of appearance.
# Only the ``c_a`` column is read: flattening every column would mix station
# and line names into the list of table names. Values are de-duplicated
# because a control area repeated under one unit would be loaded and summed
# more than once. ``.loc[[u], ...]`` returns a Series even for a single row.
unit_ca_groups = {
    u: station_details.loc[[u], 'c_a'].unique().tolist()
    for u in units
}
# unit_ca_groups

In [9]:
# All metadata rows recorded for unit R014.
station_details.loc['R014', :]

Unnamed: 0_level_0,station,linename,c_a
unit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R014,FULTON ST,2345ACJZ,N095A
R014,FULTON ST,2345ACJZ,R205A
R014,FULTON ST,2345ACJZ,R206
R014,FULTON ST,2345ACJZ,R208
R014,FULTON ST,ACJZ2345,N095


In [10]:
# Target database that receives the per-unit tables.
new_database = 'fullstations'

# Use the ``postgresql://`` scheme: SQLAlchemy 1.4+ no longer accepts the
# legacy ``postgres://`` alias and raises NoSuchModuleError for it.
new_engine = create_engine(f'postgresql://{user}@localhost/{new_database}')
# Create the database on first run so the connection below can succeed.
if not database_exists(new_engine.url):
    create_database(new_engine.url)
new_conn = psycopg2.connect(database=new_database, user=user)

In [164]:
# Persist the station lookup table, replacing any earlier copy.
station_details.to_sql(name='details', con=new_engine, if_exists='replace')

In [170]:
from datetime import datetime

# Build one combined table per unit and write it to the target database.
for i, (unit, ca_list) in enumerate(unit_ca_groups.items()):
    # Progress heartbeat every 25 units.
    if i % 25 == 0:
        print(i, unit, datetime.now())
    frames = [read_ca(ca, conn) for ca in ca_list]
    df = combine_frames(frames)
    # Tables are stored under the lower-cased unit name.
    new_name = unit.lower()
    df.to_sql(new_name, new_engine, if_exists='replace')

# A plain statement replaces the former ``for ... else``: the loop has no
# ``break``, so the ``else`` branch always ran and only obscured the intent.
print('Done:', datetime.now())

0 R001 2017-06-14 19:09:29.472091
25 R028 2017-06-14 19:11:08.243777
50 R053 2017-06-14 19:12:35.866470
75 R085 2017-06-14 19:13:47.089950
100 R110 2017-06-14 19:14:54.758638
125 R135 2017-06-14 19:16:05.768106
150 R160 2017-06-14 19:17:19.188406
175 R185 2017-06-14 19:18:32.969629
200 R210 2017-06-14 19:19:37.888539
225 R235 2017-06-14 19:20:42.506983
250 R260 2017-06-14 19:21:44.824226
275 R285 2017-06-14 19:22:45.815734
300 R311 2017-06-14 19:23:46.978725
325 R336 2017-06-14 19:24:48.762043
350 R362 2017-06-14 19:25:49.454647
375 R387 2017-06-14 19:26:45.085563
400 R413 2017-06-14 19:27:41.006522
425 R438 2017-06-14 19:28:35.272685
450 R468 2017-06-14 19:29:36.338636
Done: 2017-06-14 19:30:32.705504
