#### Database operations in Pandas

In [1]:
import pandas as pd

__Query__

In [2]:
# Load the weather observations; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/nyc_weather_2018.csv')

In [3]:
# preview the first rows to see which columns are available
df.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
1,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0015,",,N,",0.0
2,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0015,",,N,",0.0
3,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0017,",,N,",0.0
4,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0017,",,N,",0.0


In [12]:
# SQL statement that the pandas query below reproduces (bare string, kept for reference only)
'''
SELECT *
FROM weather
WHERE datatype = 'SNOW' AND value > 0;
'''
# DataFrame.query() takes the condition as a string: equality is '==' (SQL's WHERE uses '=')
# and conditions are combined with 'and' (SQL's AND)
snow_data = df.query('datatype == "SNOW" and value > 0')

In [13]:
# first rows of the filtered result: only SNOW records with a positive value remain
snow_data.head()

Unnamed: 0,date,datatype,station,attributes,value
114,2018-01-01T00:00:00,SNOW,GHCND:US1NYWC0019,",,N,",25.0
699,2018-01-04T00:00:00,SNOW,GHCND:US1NJBG0015,",,N,",229.0
702,2018-01-04T00:00:00,SNOW,GHCND:US1NJBG0017,",,N,",10.0
706,2018-01-04T00:00:00,SNOW,GHCND:US1NJBG0018,",,N,",46.0
713,2018-01-04T00:00:00,SNOW,GHCND:US1NJES0018,",,N,",10.0


In [15]:
# Sanity check: classic boolean-mask filtering must return exactly the same frame as query().
# query() is mostly a convenience when the data frame name is long, because the mask
# does not have to repeat that name for every column.
df.loc[(df['datatype'] == 'SNOW') & (df['value'] > 0)].equals(snow_data)

True

__Merging dataframes__

In [16]:
# Load the weather station details (id, name, coordinates, elevation) and preview them
stations_info = pd.read_csv('../datasets/weather_stations.csv')
stations_info.head()

Unnamed: 0,id,name,latitude,longitude,elevation
0,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.0641,-73.577,36.6
1,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
2,GHCND:US1NJBG0001,"BERGENFIELD 0.3 SW, NJ US",40.921298,-74.001983,20.1
3,GHCND:US1NJBG0002,"SADDLE BROOK TWP 0.6 E, NJ US",40.902694,-74.083358,16.8
4,GHCND:US1NJBG0003,"TENAFLY 1.3 W, NJ US",40.91467,-73.9775,21.6


In [18]:
# check whether each station id appears only once here (compare count with unique)
stations_info.id.describe()

count                   279
unique                  279
top       GHCND:US1CTFR0022
freq                      1
Name: id, dtype: object

In [19]:
# the weather data's station column holds station ids too, with many rows per station
df.station.describe()

count                 78780
unique                  110
top       GHCND:USW00094789
freq                   4270
Name: station, dtype: object

In [20]:
# number of rows in each data frame (shape is (rows, columns), so index 0 is the row count)
df.shape[0], stations_info.shape[0]

(78780, 279)

In [22]:
# *dfs collects every positional argument, so any number of data frames can be passed
def get_row_count(*dfs):
    '''
    dfs: data frames
    returns: list with the row count of each data frame, in the order given
    '''
    row_counts = []
    for frame in dfs:
        row_counts.append(frame.shape[0])
    return row_counts
# row counts of the weather data and the station data, in that order
get_row_count(df, stations_info)

[78780, 279]

In [24]:
# more general helper: fetches any attribute (not just the row count) from each data frame
def get_info(attr: str, *dfs) -> list:
    '''
    Collect one attribute from every data frame passed in.

    attr: name of the data frame attribute to read, e.g. 'shape'
    dfs: data frames (any number; none gives an empty list)
    returns: list with the attribute value of each data frame, in the order given
    raises: AttributeError if a data frame has no attribute called attr
    '''
    return [getattr(frame, attr) for frame in dfs]

# (rows, columns) of each data frame
get_info('shape', df, stations_info)

[(78780, 5), (279, 5)]