In [None]:
# default_exp pandas

# Pandas

> Utilities for pandas

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
import pandas as pd
import pandas_flavor
from datetime import datetime 
import typing as T

In [None]:
import numpy as np
import time

## `DataFrame` methods

### `ends`

This function combines the `.head()` and `.tail()` methods of a `pd.DataFrame` to show both ends of it at once.

Options:

* `n`: number of rows to show for the head and tail. The resulting df will have 2\*n rows, unless the df has <2\*n rows.

In [None]:
# export
@pandas_flavor.register_dataframe_method
@pandas_flavor.register_series_method
def ends(self, n:int=3)-> T.Union[pd.DataFrame, pd.Series]:
    """
    Combines the .head and .tail methods to show both ends
    of a pd.DataFrame or pd.Series.

    Rows are picked by position, so a row that belongs to both the
    head and the tail is returned once, while distinct rows that merely
    hold equal values are all kept.

    Options:
    * n: number of rows to show for the head and tail. The result
         will have 2*n rows, unless the object has <2*n rows, in which
         case every row is returned exactly once. n follows the
         .head/.tail conventions: 0 gives an empty result and a
         negative n means "all but the last/first |n| rows".

    Returns:
    A new object of the same type as the caller, rows in original order.
    """
    positions = range(len(self))
    # Mirror .head(n) -> iloc[:n] and .tail(n) -> iloc[-n:]; n == 0 needs
    # special-casing because a [-0:] slice would select everything.
    head_positions = positions[:n]
    tail_positions = positions[-n:] if n else positions[:0]
    # De-duplicate on position rather than on row values: value-based
    # de-duplication would drop genuinely different rows with equal contents.
    keep = list(head_positions)
    keep.extend(i for i in tail_positions if i not in head_positions)
    return self.iloc[keep]

In [None]:
# Demo frame: 20 rows x 5 columns of random floats drawn from [0, 1).
df = pd.DataFrame(np.random.rand(20, 5))

In [None]:
df.head()

Unnamed: 0,0,1,2,3,4
0,0.007753,0.019556,0.228568,0.733227,0.97157
1,0.881625,0.630953,0.553137,0.998656,0.867941
2,0.065172,0.997073,0.651824,0.993076,0.677535
3,0.047016,0.915512,0.976585,0.317625,0.961146
4,0.121723,0.565807,0.339233,0.649174,0.489531


In [None]:
df.tail()

Unnamed: 0,0,1,2,3,4
15,0.503663,0.511829,0.244935,0.091605,0.661425
16,0.222074,0.410225,0.165857,0.815235,0.628469
17,0.750615,0.22505,0.387448,0.238739,0.766079
18,0.38561,0.345542,0.064823,0.178021,0.530253
19,0.857105,0.884259,0.249529,0.775519,0.727644


In [None]:
df.ends()

Unnamed: 0,0,1,2,3,4
0,0.007753,0.019556,0.228568,0.733227,0.97157
1,0.881625,0.630953,0.553137,0.998656,0.867941
2,0.065172,0.997073,0.651824,0.993076,0.677535
17,0.750615,0.22505,0.387448,0.238739,0.766079
18,0.38561,0.345542,0.064823,0.178021,0.530253
19,0.857105,0.884259,0.249529,0.775519,0.727644


### `group_by_summary`

This method returns a df with the mean, median, sem, std, and count of each column.

Options:

* `by`: Column or list of columns to group by. These will be the index of the df returned.
* `filter_cols`: if `None`, will use all the columns. Otherwise, pass column name or list of column names.

In [None]:
# export
@pandas_flavor.register_dataframe_method
def group_by_summary(self, by:T.Union[T.Hashable, T.List[T.Hashable]], filter_cols:T.Union[None, T.Hashable, T.List[T.Hashable]]=None)-> pd.DataFrame:
    """
    Summarises columns per group with mean, median, sem, std and count.

    Options:
    * by: column label or list of column labels to group by. These
          become the index of the returned DataFrame.
    * filter_cols: column label or list of column labels to summarise.
          If None (or an empty list/tuple), every column that is not
          used in `by` is summarised.

    Returns:
    A pd.DataFrame with one row per group. When a list of columns is
    summarised the columns are two-level (column, statistic); when a
    single column label is passed they are just the statistics.
    """
    # Only "nothing requested" selects all columns. A plain truthiness test
    # would also swallow valid falsy labels such as the integer column 0.
    nothing_requested = filter_cols is None or (
        isinstance(filter_cols, (list, tuple)) and len(filter_cols) == 0
    )
    if nothing_requested:
        # Wrap any scalar label (str, int, ...) so `isin` gets a list-like.
        group_labels = by if pd.api.types.is_list_like(by) else [by]
        filter_cols = self.columns[~self.columns.isin(group_labels)]
    return self.groupby(by)[filter_cols].agg(['mean','median','sem','std','count'])

In [None]:
# Boolean column flagging rows whose column 0 exceeds 0.5; used as the grouping key in the examples that follow.
df['cat_col'] = df[0] > 0.5

In [None]:
df.group_by_summary('cat_col')

Unnamed: 0_level_0,0,0,0,0,0,1,1,1,1,1,...,3,3,3,3,3,4,4,4,4,4
Unnamed: 0_level_1,mean,median,sem,std,count,mean,median,sem,std,count,...,mean,median,sem,std,count,mean,median,sem,std,count
cat_col,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
False,0.241702,0.232908,0.043469,0.168355,15,0.453094,0.410225,0.071444,0.276701,15,...,0.535455,0.501178,0.064498,0.249799,15,0.544569,0.530253,0.076342,0.295673,15
True,0.783391,0.857105,0.075552,0.168939,5,0.462992,0.511829,0.145735,0.325874,5,...,0.479746,0.294213,0.173192,0.387269,5,0.674204,0.727644,0.088161,0.197134,5


In [None]:
df.group_by_summary(by='cat_col', filter_cols=1)

Unnamed: 0_level_0,mean,median,sem,std,count
cat_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,0.453094,0.410225,0.071444,0.276701,15
True,0.462992,0.511829,0.145735,0.325874,5


In [None]:
df.group_by_summary(['cat_col'], filter_cols=1)

Unnamed: 0_level_0,mean,median,sem,std,count
cat_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,0.453094,0.410225,0.071444,0.276701,15
True,0.462992,0.511829,0.145735,0.325874,5


### datetime_from_timestamp

Converts a pd.Series of integer timestamps to datetime format.

Options:

* `s`: pd.Series of timestamps
* `ts_type`: type of timestamp. 
  * `ms`: default, milliseconds
  * `ns`: nanoseconds
  * `s`: seconds

In [None]:
# export
def datetime_from_timestamp(s:pd.Series, ts_type:str='ms') -> pd.Series:
    """
    Converts a pd.Series of numeric epoch timestamps to datetimes.

    Each value is scaled down to seconds and handed to
    datetime.fromtimestamp, so the results are naive datetimes
    expressed in the local timezone of the machine running the code.

    Options:
    * s: pd.Series of timestamps
    * ts_type: unit of the timestamps
      * 'ms': default, milliseconds
      * 'us': microseconds
      * 'ns': nanoseconds
      * 's': seconds

    Raises:
    KeyError if ts_type is not one of the units listed above.
    """
    # How many of each unit make up one second.
    units_per_second = {
        's': 1,
        'ms': 1e3,
        'us': 1e6,
        'ns': 1e9,
    }
    # Resolve the divisor once, up front, instead of once per element.
    try:
        divisor = units_per_second[ts_type]
    except KeyError:
        raise KeyError(
            f"unknown ts_type {ts_type!r}; expected one of {sorted(units_per_second)}"
        ) from None
    return s.map(lambda x: datetime.fromtimestamp(x / divisor))

In [None]:
datetime_from_timestamp(pd.Series([1627417286117]))

0   2021-07-27 13:21:26.117
dtype: datetime64[ns]