In [1]:
import numpy as np 
import pandas as pd

In [2]:
weather = pd.read_csv('../datasets/nyc_weather_2018.csv', parse_dates=['date'])
fb = pd.read_csv('../datasets/fb.csv', index_col='date', parse_dates=True)

In [3]:
weather.head(2)

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01,PRCP,GHCND:US1CTFR0039,",,N,",0.0
1,2018-01-01,PRCP,GHCND:US1NJBG0015,",,N,",0.0


In [4]:
fb.head(2)

Unnamed: 0_level_0,high,low,open,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02,181.580002,177.550003,177.679993,181.419998,18151900
2018-01-03,184.779999,181.330002,181.880005,184.669998,16886600


In [5]:
# calculate the z-score for volume column and find where its absolute
# value is more than 3 std away
fb.assign(
    volume_z_score = lambda x: x.volume.sub(
        x.volume.mean()
    ).div(x.volume.std()).abs()
).query('volume_z_score > 3')

Unnamed: 0_level_0,high,low,open,close,volume,volume_z_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-03-19,177.169998,170.059998,177.009995,172.559998,88140100,3.154098
2018-03-20,170.199997,161.949997,167.470001,168.149994,129851800,5.329354
2018-03-21,173.399994,163.300003,164.800003,169.389999,106598800,4.116715
2018-03-26,161.100006,149.020004,160.820007,160.059998,126116600,5.134564
2018-07-26,180.130005,173.75,174.889999,176.259995,169803700,7.412837


Find 5 days with the largest percentage change of the volume traded from the prior day. Use `rank(ascending=False)` and `pct_change()`

In [9]:
fb.assign(
    volume_pct_change = fb.volume.pct_change(),
    change_rank = fb.volume.pct_change().abs().rank(ascending=False)
    # same result: but access the column that we've just created
    #change_rank = lambda x: x.volume_pct_change.abs().rank(ascending=False)
).nsmallest(5, 'change_rank')

Unnamed: 0_level_0,high,low,open,close,volume,volume_pct_change,change_rank
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-12,181.479996,177.399994,178.059998,179.369995,77551300,7.087865,1.0
2018-03-19,177.169998,170.059998,177.009995,172.559998,88140100,2.611796,2.0
2018-07-26,180.130005,173.75,174.889999,176.259995,169803700,1.880265,3.0
2018-09-21,167.25,162.809998,166.639999,162.929993,45994800,1.428961,4.0
2018-03-26,161.100006,149.020004,160.820007,160.059998,126116600,1.352496,5.0


In [10]:
# slice the biggest pct_change
fb['2018-01-11':'2018-01-12']

Unnamed: 0_level_0,high,low,open,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-11,188.399994,187.380005,188.399994,187.770004,9588600
2018-01-12,181.479996,177.399994,178.059998,179.369995,77551300


In [11]:
# check if any of prices high/low/open/close where bigger than $215 during this year
(fb > 215).any()
# low prices never were higher than $215

high       True
low       False
open       True
close      True
volume     True
dtype: bool

#### Binning

we might be interested in ranges of volume rather than the volume by itself, we can convert the numerical column into a categorical usind `pd.cut()`. It calls binning or **discretizing** (going from continious to descrete)

In [12]:
# check if in any of days the volume was the same
(fb.volume.value_counts() > 1).sum()

0

In [13]:
(fb.volume.value_counts() > 1).any()

False