# Data frameworks comparison

In [1]:
import numpy as np
import pandas as pd

from sqlalchemy import create_engine, text

In [2]:
# Relative path of the 311 service-requests CSV export.
DATA_PATH = 'data/311_Service_Requests_from_2010_to_Present.csv'

# Column name -> dtype mapping. Every column is a plain string ('object')
# except the integer key and the two float coordinates.
# NOTE(review): none of the read calls visible in this notebook passes this mapping.
dtypes = {
    'Unique Key': 'int',
    **dict.fromkeys(
        [
            'Created Date',
            'Closed Date',
            'Agency',
            'Agency Name',
            'Complaint Type',
            'Descriptor',
            'Location Type',
            'Incident Zip',
            'Incident Address',
            'Street Name',
            'Cross Street 1',
            'Cross Street 2',
            'Intersection Street 1',
            'Intersection Street 2',
            'Address Type',
            'City',
            'Landmark',
            'Facility Type',
            'Status',
            'Due Date',
            'Resolution Description',
            'Resolution Action Updated Date',
            'Community Board',
            'BBL',
            'Borough',
            'X Coordinate (State Plane)',
            'Y Coordinate (State Plane)',
            'Open Data Channel Type',
            'Park Facility Name',
            'Park Borough',
            'Vehicle Type',
            'Taxi Company Borough',
            'Taxi Pick Up Location',
            'Bridge Highway Name',
            'Bridge Highway Direction',
            'Road Ramp',
            'Bridge Highway Segment',
        ],
        'object',
    ),
    'Latitude': 'float',
    'Longitude': 'float',
    'Location': 'object',
}

## Pandas

In [3]:
# Load only the three columns that the queries in this notebook use; %time reports the cost of the CSV parse.
%time df = pd.read_csv(DATA_PATH, usecols = ['Agency', 'Borough', 'Complaint Type'])

CPU times: user 53 s, sys: 2.4 s, total: 55.4 s
Wall time: 59.1 s


In [4]:
# Query 1 (pandas): most frequent complaint type over the whole frame.
%time df['Complaint Type'].mode()

CPU times: user 1.13 s, sys: 4.32 ms, total: 1.13 s
Wall time: 1.13 s


0    Noise - Residential
dtype: object

In [5]:
# Query 2 (pandas): most frequent complaint type within each borough.
%time df.groupby('Borough')['Complaint Type'].agg(pd.Series.mode)

CPU times: user 2.89 s, sys: 527 ms, total: 3.42 s
Wall time: 3.42 s


Borough
BRONX            Noise - Residential
BROOKLYN         Noise - Residential
MANHATTAN        Noise - Residential
QUEENS           Noise - Residential
STATEN ISLAND       Street Condition
Unspecified                  HEATING
Name: Complaint Type, dtype: object

In [6]:
# First entry of mode(), i.e. the most frequent complaint type. This name is
# reused by the SQLite, PostgreSQL and Dask versions of query 3 further down.
most_common_complaint = df['Complaint Type'].mode().values[0]

# Query 3 (pandas): distinct Agency values on rows with that complaint type.
%time df.loc[df['Complaint Type'] == most_common_complaint, 'Agency'].unique()

CPU times: user 1.19 s, sys: 160 µs, total: 1.19 s
Wall time: 1.19 s


array(['NYPD', 'DOITT'], dtype=object)

## SQLite

In [7]:
# Open a connection to the SQLite database file data.db (relative path).
# NOTE(review): the connection is not closed anywhere in the visible cells.
con_sqlite = create_engine('sqlite:///data.db').connect()

In [8]:
# Load of the three columns into the SQLite table `data`, with the row index stored as column `id`.
# NOTE(review): left commented out, presumably because the table already exists — run once against a fresh data.db.
# %time df[['Complaint Type', 'Borough', 'Agency']].to_sql('data', con=con_sqlite, index=True, index_label='id', if_exists='replace')

In [9]:
# Query 1 (SQL): row count per complaint type, largest count first.
# The same string is executed against PostgreSQL later in the notebook.
query_1 = """
SELECT "Complaint Type", COUNT(*) as value 
FROM data 
GROUP BY "Complaint Type"
ORDER BY value DESC
"""

# fetchone() keeps only the first row, i.e. the most frequent complaint type.
%time con_sqlite.execute(query_1).fetchone()

CPU times: user 11.2 s, sys: 872 ms, total: 12.1 s
Wall time: 12.1 s


('Noise - Residential', 2270937)

In [10]:
# Query 2 (SQLite): count rows per (Borough, Complaint Type), number the counts
# within each borough from largest to smallest with ROW_NUMBER(), and keep row 1.
# Equal counts within a borough are ordered arbitrarily by ROW_NUMBER().
query_2 = """
SELECT Borough, "Complaint Type", value FROM (
    SELECT Borough, "Complaint Type", value, ROW_NUMBER() OVER(
        PARTITION BY agg_data.Borough
        ORDER BY agg_data.Borough, agg_data.value DESC
) rn
    FROM (
        SELECT 
            Borough, "Complaint Type", COUNT(*) as value
        FROM
            data
        GROUP BY Borough, "Complaint Type"    
    ) as agg_data
) s
WHERE rn = 1
"""

%time con_sqlite.execute(query_2).fetchall()

CPU times: user 26.3 s, sys: 1.09 s, total: 27.4 s
Wall time: 27.4 s


[('BRONX', 'Noise - Residential', 616749),
 ('BROOKLYN', 'Noise - Residential', 650335),
 ('MANHATTAN', 'Noise - Residential', 489085),
 ('QUEENS', 'Noise - Residential', 445077),
 ('STATEN ISLAND', 'Street Condition', 128431),
 ('Unspecified', 'HEATING', 282916)]

In [11]:
query_3 = f"""
SELECT DISTINCT Agency
FROM data
WHERE "Complaint Type" = '{most_common_complaint}'
"""

%time con_sqlite.execute(query_3).fetchall()

CPU times: user 1.4 s, sys: 276 ms, total: 1.67 s
Wall time: 1.67 s


[('NYPD',), ('DOITT',)]

## PostgreSQL

Before running this section, start the PostgreSQL Docker container with the script `run_postgres_docker.sh`.

In [13]:
# Connect to the local PostgreSQL instance.
# The dialect name is spelled `postgresql`: the `postgres://` alias is deprecated
# and is rejected by SQLAlchemy 1.4 and later.
# NOTE(review): credentials are hardcoded; consider reading the URL from the environment.
con_postrgres = create_engine('postgresql://postgres:docker@localhost/postgres').connect()

In [15]:
# Load of the three columns into the PostgreSQL table `data`, appended in 50 chunks.
# NOTE(review): left commented out, presumably because the table is already populated —
# run once against a fresh database (re-running would append duplicate rows).
#%%time
#for chunk in np.array_split(df, 50):
#    print(chunk.shape)
#    chunk[['Complaint Type', 'Borough', 'Agency']].to_sql('data', con=con_postrgres, if_exists = 'append')

In [16]:
# Query 1 on PostgreSQL, reusing the query_1 string defined in the SQLite section.
%time con_postrgres.execute(query_1).fetchone()

CPU times: user 2.75 ms, sys: 0 ns, total: 2.75 ms
Wall time: 2.18 s


('Noise - Residential', 2270937)

In [17]:
# Query 2 (PostgreSQL): same ranking logic as the SQLite version, with the
# Borough column name double-quoted. This rebinds query_2.
query_2 = """
SELECT "Borough", "Complaint Type", value FROM (
    SELECT "Borough", "Complaint Type", value, ROW_NUMBER() OVER(
        PARTITION BY "Borough"
        ORDER BY "Borough", value DESC
) rn
    FROM (
        SELECT 
            "Borough", "Complaint Type", COUNT(*) as value
        FROM
            data
        GROUP BY "Borough", "Complaint Type"    
    ) as agg_data
) s
WHERE rn = 1
"""

%time con_postrgres.execute(query_2).fetchall()

CPU times: user 1.11 ms, sys: 0 ns, total: 1.11 ms
Wall time: 3.2 s


[('BRONX', 'Noise - Residential', 616749),
 ('BROOKLYN', 'Noise - Residential', 650335),
 ('MANHATTAN', 'Noise - Residential', 489085),
 ('QUEENS', 'Noise - Residential', 445077),
 ('STATEN ISLAND', 'Street Condition', 128431),
 ('Unspecified', 'HEATING', 282916)]

In [18]:
query_3 = f"""
SELECT DISTINCT "Agency"
FROM data
WHERE "Complaint Type" = '{most_common_complaint}'
"""

%time con_postrgres.execute(query_3).fetchall()

CPU times: user 1.33 ms, sys: 13 µs, total: 1.34 ms
Wall time: 2.54 s


[('NYPD',), ('DOITT',)]

## Dask

In [19]:
# Drop the pandas DataFrame before the Dask section (presumably to release its
# memory; the Dask cells read the CSV again themselves).
del df

In [20]:
import dask.dataframe as dd

In [21]:
# Read the same three columns with Dask.
# blocksize is given in bytes, so 1e9 asks for partitions of roughly 1 GB each.
df_dask = dd.read_csv(
    DATA_PATH, 
    blocksize = 1e9,
    usecols = ['Agency', 'Borough', 'Complaint Type']
)

In [22]:
# Show the first rows to confirm the selected columns were picked up.
df_dask.head()

Unnamed: 0,Agency,Complaint Type,Borough
0,HPD,HEAT/HOT WATER,QUEENS
1,HPD,HEAT/HOT WATER,MANHATTAN
2,HPD,PAINT/PLASTER,BRONX
3,DSNY,Derelict Vehicles,Unspecified
4,DEP,Sewer,QUEENS


In [23]:
# Query 1 (Dask): most frequent complaint type; .compute() triggers the actual work.
%time df_dask[['Complaint Type']].mode().compute()

CPU times: user 1min 47s, sys: 12.5 s, total: 1min 59s
Wall time: 31.8 s


0    Noise - Residential
Name: Complaint Type, dtype: object

In [24]:
def chunk(s):
    """Per-partition step of the custom aggregation: count values inside each group.

    The description assumes a single grouping column, although several
    grouping columns work as well.

    `s` is a grouped series. The returned series has a multi-index of the
    form (group, value) and holds the number of occurrences as its values.
    """
    per_group_counts = s.value_counts()
    return per_group_counts


def agg(s):
    """Combine step of the custom aggregation: add up the counts from `chunk`.

    `s` is a grouped multi-index series. Inside `.apply` each group arrives
    as a sub-series that still carries the full multi-index. Grouping that
    sub-series on its last index level (the value) and summing merges the
    counts of the same value. Each per-group result is a series, so the
    overall result is a multi-index series of the form (group, value): count.
    """
    # NOTE: a faster alternative is to regroup the underlying series on all of
    # its index levels (via the private `_selected_obj` attribute), but that
    # relies on pandas internals, so the portable form is used here.
    return s.apply(lambda group_counts: group_counts.groupby(level=-1).sum())


def finalize(s):
    """Final step of the custom aggregation: pick the most frequent value per group.

    `s` is a multi-index series of the form (group, value): count. It is
    grouped on every index level except the last one (the group part). For
    each group the group levels are dropped from the index, and the index
    label with the largest count, i.e. the mode, is returned.
    """
    group_levels = list(range(s.index.nlevels - 1))

    def _most_frequent(counts):
        # Keep only the value level so idxmax() yields the value itself.
        by_value = counts.reset_index(level=group_levels, drop=True)
        return by_value.idxmax()

    grouped = s.groupby(level=group_levels)
    return grouped.apply(_most_frequent)

# Custom Dask aggregation named 'mode', assembled from the chunk / agg / finalize steps defined in this cell.
max_occurence = dd.Aggregation('mode', chunk, agg, finalize)

In [25]:
# Query 2 (Dask): most frequent complaint type per borough via the custom aggregation.
%time df_dask.groupby(['Borough']).agg({'Complaint Type': max_occurence}).compute()

CPU times: user 1min 55s, sys: 13.5 s, total: 2min 8s
Wall time: 38.1 s


Unnamed: 0_level_0,Complaint Type
Borough,Unnamed: 1_level_1
BRONX,Noise - Residential
BROOKLYN,Noise - Residential
MANHATTAN,Noise - Residential
QUEENS,Noise - Residential
STATEN ISLAND,Street Condition
Unspecified,HEATING


In [26]:
# Query 3 (Dask): distinct Agency values on rows with the most frequent complaint type.
# most_common_complaint still comes from the pandas section; it is not recomputed here.
%time df_dask.loc[df_dask['Complaint Type'] == most_common_complaint, 'Agency'].unique().compute()

CPU times: user 1min 49s, sys: 12.8 s, total: 2min 2s
Wall time: 32.7 s


0     NYPD
1    DOITT
Name: Agency, dtype: object