In [None]:
import pandas as pd
import numpy as np

In [None]:
long_series = pd.Series(np.random.randn(1000))
long_series.head()

In [None]:
long_series.tail()

# Matching / broadcasting behavior

In [None]:
df = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df

In [None]:
row = df.iloc[1]
row

In [None]:
column = df['two']
column

In [None]:
df.sub(row,axis='columns')

In [None]:
df.sub(row, axis=1)

In [None]:
df.sub(column,axis='index')

In [None]:
df

# Missing values / Fill values

In [None]:
df2 = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df2

In [None]:
df2.add(df, fill_value=999)

# Boolean Reductions

In [None]:
(df > 0).any()

In [None]:
# Fetch the scalar with one .loc[row, column] lookup rather than chaining two indexers.
df.loc['a', 'two'] > -0.5

In [None]:
(df > 0).all()

In [None]:
# Series.bool() is deprecated in recent pandas; .item() likewise insists on exactly
# one element and hands it back as a plain Python scalar.
pd.Series([True]).item()

In [None]:
# DataFrame.bool() is deprecated in recent pandas; going through the underlying array's
# .item() still requires a single element and returns a plain Python scalar.
pd.DataFrame([[True]]).values.item()

# Comparing if objects are equivalent

In [None]:
df+df == df*2

In [None]:
(df+df == df*2).all()

### Series or DataFrame index needs to be in the same order for equality to be True:

In [None]:
df1 = pd.DataFrame({'col':['foo', 0, np.nan]})
df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0])
df1.equals(df2)

In [None]:
df1.equals(df2.sort_index())

## Comparing array-like objects

In [None]:
df.index == 'b'

In [None]:
np.array([1, 2, 3]) == np.array([2])

In [None]:
pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux'])

## Combining overlapping data sets

In [None]:
df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
                    'B' : [np.nan, 2., 3., np.nan, 6.]})
df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
                    'B' : [np.nan, np.nan, 3., 4., 6., 8.]})

In [None]:
df1

In [None]:
df2

In [None]:
df1.combine_first(df2)

# Descriptive statistics

In [None]:
df3 = pd.DataFrame({'A' : [2.0, 2.0, 4.0, 0.0, 2.0],
                    'B' : [2.0, 3.0, np.nan, 5.0, 6.0],
                    'C' : [2.0, 3.0, 4.0, 5.0, 6.0]
                   }
                   ,index=list('vwxyz')
                  )
df3

### **__DataFrame: “index” (axis=0, default), “columns” (axis=1)__**

In [None]:
df3.mean(0)

In [None]:
df3.std()

In [None]:
df3.mean(1)

In [None]:
df3.cumsum()

### **_missing data_**

In [None]:
df3.sum(1, skipna=False)

# Summarizing data: describe

## Series

In [None]:
series = pd.Series(np.random.randn(10))

In [None]:
series[::2]

In [None]:
series[::2] = np.nan

In [None]:
series

In [None]:
series.describe()

In [None]:
series.describe(percentiles=[.25, .50, .95])

## DataFrame

In [None]:
frame = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
frame.iloc[::2] = np.nan
frame

In [None]:
frame.describe()

## INDEX min/max Value

In [None]:
df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))
df3

In [None]:
df3['A'].idxmin()

## Value counts (histogramming) / Mode

In [None]:
data = np.random.randint(0,7, size=50)
data

In [None]:
# Count occurrences through the Series method; the top-level pd.value_counts helper
# is deprecated in recent pandas.
num = pd.Series(data).value_counts()
num

In [None]:
s = pd.Series(data)
s.value_counts()

In [None]:
df6 = pd.DataFrame({'A' : [2, 2, 4, 0, 2],
                    'B' : [2, 3, 3, 5, 6],
                    'C' : [1, 3, 4, 2, 3]
                   })
df6

In [None]:
df6.mode()

### _Row or Column wise function application_

In [None]:
df6.apply(np.mean)

In [None]:
df6.apply('mean', axis=1)

## Discretization

In [None]:
arr = np.random.randn(5)
arr

In [None]:
factor = pd.cut(arr,2)
factor

# Row or Column-wise function application

**_functions can be applied along the axes of a DataFrame_**

_If the applied function returns a Series, the final output is a DataFrame. The columns match the index of the Series returned by the applied function._

_If the applied function returns any other type, the final output is a Series._


In [None]:
tsdf = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'],
                    index=pd.date_range('1/1/2000', periods=5))
tsdf

In [None]:
tsdf.apply(lambda x: x.idxmax())

In [None]:
tsdf.iloc[[1,3]] = np.nan
tsdf

In [None]:
tsdf.apply(pd.Series.interpolate)

frame

In [None]:
frame.apply(pd.Series.interpolate)

In [None]:
def subtract_and_divide(x, sub, divide=1):
    """Shift ``x`` down by ``sub``, then divide the shifted value by ``divide``.

    ``divide`` defaults to 1, which leaves the shifted value unscaled.
    """
    shifted = x - sub
    return shifted / divide

tsdf.apply(subtract_and_divide, args=(5,), divide=3)

# Aggregation

In [None]:
df7 = pd.DataFrame({'A' : [2.0, 2.0, 4.0, 0.0, 2.0],
                    'B' : [2.0, 3.0, np.nan, 5.0, 6.0],
                    'C' : [2.0, 3.0, 4.0, 5.0, 6.0]
                   })
df7

In [None]:
df7.agg(np.sum)

In [None]:
df7.agg('sum')

In [None]:
df7.sum()

In [None]:
df7.A.agg('sum')

In [None]:
df7.A.sum()

## Aggregation with functions

In [None]:
df7.agg(['sum'])

In [None]:
df7.agg(['sum','mean'])

In [None]:
df7.agg(['sum', (lambda x: x.sum())])

In [None]:
def lambdasum(x):
    """Return the total of ``x`` by delegating to its own ``sum`` method."""
    total = x.sum()
    return total

df7.agg(['sum', lambdasum])

## Aggregating with a dictionary

In [None]:
def mymean(x):
    """Return the mean of ``x`` by delegating to its own ``mean`` method."""
    return x.mean()

# Per-column aggregation: a built-in name for 'A', the user-defined callable for 'B'.
# mymean is defined in this cell so the notebook runs from a fresh kernel.
df7.agg({'A': 'sum', 'B': mymean})

In [None]:
# Name the reduction as a string so pandas uses its own Series.sum instead of being
# handed the Python built-in ``sum``, which recent pandas warns about.
df7.agg({'A': ['sum', 'mean'], 'B': 'sum'})

# Custom describe

In [None]:
from functools import partial

In [None]:
# Pre-bind q=0.25 so the quantile method can be passed around as a one-argument callable.
q_25 = partial(pd.Series.quantile,q=0.25)
# Give the partial object a __name__; presumably agg uses it to label the result row — TODO confirm.
q_25.__name__ = '25% Percentile Rank'

# Mix built-in aggregation names with the custom quantile callable in one agg call.
tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', 'max'])

# Transform

### **_as numpy function_**

In [None]:
tsdf.transform(np.abs)

In [None]:
np.abs(tsdf)

### **_as string function_**

In [None]:
tsdf.transform('abs')

### **_as user defined function_**

In [None]:
tsdf.transform( lambda x: x.abs())

In [None]:
tsdf.A.transform(np.abs)

In [None]:
# Transform with dictionary

In [None]:
tsdf.transform({'A': np.abs, 'B': lambda x: x+2 })

In [None]:
# NOTE(review): an earlier variant of this cell paired the lambda with np.sum and was left
# disabled — presumably because a reducing function does not fit transform; confirm before re-enabling.
tsdf.transform({'A': np.abs, 'B': [lambda x: x+1, np.cumsum]})


# Element-wise function

**_taking a single value and returning a single value_**

## DataFrame element-wise

In [None]:
ewdf = pd.DataFrame({'A' : [2.32310, 12.0, 4222.0, 10.0, 22.0],
                    'B' : [2.01, 3.0, 3.2, 5.0, 6.0],
                    'C' : [2.1110, 3.0, 4.0, 5.0, 6.0]
                   })
ewdf

In [None]:
f = lambda x: len(str(x))

ewdf['A'].map(f)

In [None]:
ewdf.applymap(f)

## Series Element-wise

In [None]:
s = pd.Series(['six', 'five', 'six', 'seven', 'six'],
              index=['a', 'b', 'c', 'd', 'e'])
t = pd.Series({'six' : 6., 'seven' : 7.})
s

In [None]:
s.map(t)

# Reindexing and altering labels

## **_Reindex with Series_**

In [None]:
s = pd.Series(np.random.randn(5), index=list('abcde'))
s.head()

In [None]:
s.reindex(['e','f','a'])

## **_Reindexing with DataFrames_**

In [None]:
ridf = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
ridf

In [None]:
ridf.reindex(index=['d','a'],columns=['three','one'])

In [None]:
ridf.reindex(list('da'), axis='index')

In [None]:
ridf.reindex(['three','one'], axis='columns')

In [None]:
ridf.reindex(ridf.columns, axis='columns')

In [None]:
ridf2 = ridf.reindex(['three','one'], axis='columns')
ridf3 = ridf.reindex(ridf.columns, axis='columns')
ridf2.reindex_like(ridf3)

# Aligning objects

In [None]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s1 = s[:4]
s1

In [None]:
s2 = s[1:]
s2

In [None]:
s3 = s1.align(s2)

# Filling while reindexing

| Method           | Action |
| ---------------- | ------- |
| pad / ffill      | Fill values forward |
| bfill / backfill | Fill values backward |
| nearest          | Fill from the nearest index value |

### _fillna() and interpolate() will not perform any checks on the order of the index._
### _reindex requires that the index is ordered, either increasing or decreasing._

In [None]:
rng = pd.date_range('1/3/2000', periods=11, freq='3D')
ts = pd.Series(np.random.randn(11), index=rng)
ts

In [None]:
# Pick rows by position explicitly: ts is indexed by dates, so bare integers are not labels
# here, and relying on the positional fallback of [] is deprecated in recent pandas.
ts2 = ts.iloc[[0, 5, 10]]
ts2

In [None]:
ts2.reindex(ts.index)

In [None]:
ts2.reindex(ts.index, method='ffill')

In [None]:
ts2.reindex(ts.index, method="bfill")

In [None]:
ts2.reindex(ts.index, method="nearest")

In [None]:
# Back-fill with the dedicated method; passing method= to fillna is deprecated in recent pandas.
ts2.reindex(ts.index).bfill()

In [None]:
ts2.reindex(ts.index).fillna(value=10)

In [None]:
ts2.reindex(ts.index).interpolate()

### _tolerance specifies the maximum distance between the index and indexer values_

In [None]:
ts2.reindex(ts.index, method='nearest', tolerance='3 Day')

In [None]:
ts2.reindex(ts.index, method='nearest', limit=2)

# Dropping and Renaming axis labels

In [None]:
dpdf = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
dpdf

In [None]:
dpdf.drop(['a','d'], axis='index')

In [None]:
dpdf.drop(['three'], axis='columns')

In [None]:
dpdf.rename(columns={'one': 'foo', 'three': 'bar'},
            index={'a': 'apple', 'b': 'banana', 'd': 'durian'})

In [None]:
dpdf.rename({'one': 'foo', 'three': 'bar'}, axis='columns')

# DateTime accessor

In [None]:
s = pd.Series(pd.date_range('20130101 09:10:12', periods=6))
s

In [None]:
stz = s.dt.tz_localize('US/Eastern')
stz

In [None]:
stz.dt.tz

In [None]:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

In [None]:
s.dt.strftime('%Y/%m/%d')

# Sorting

In [None]:
sdf = pd.DataFrame({'A':[2,1,1,1],'C':[5,4,3,2], 'B':[1,3,2,4],})
sdf

In [None]:
# sdf only has the columns 'A', 'C' and 'B', so sort on one that exists.
sdf.sort_values(by="A")

In [None]:
sdf.sort_values(by=["A","B"])

In [None]:
sdf.sort_index(ascending=False)

In [None]:
sdf.sort_index(axis="columns")

# Sort By Indexes and Values

In [None]:
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2),('b', 2), ('b', 1), ('b', 1)])
idx.names = ['first', 'second']
idx

In [None]:
# Two descending integer columns placed on the ('first', 'second') MultiIndex held in idx.
df_multi = pd.DataFrame(
    {'A': np.arange(6, 0, -1),
     'B': np.arange(9, 3, -1)},
    index=idx,
)
df_multi

In [None]:
df_multi.sort_values(by=['second', 'A'])

In [None]:
df_multi.sort_values(by=['A', 'B'])

In [None]:
srs = pd.Series(np.random.randn(20))
srs

## Sorting by Series

In [None]:
srs.nsmallest(4)

In [None]:
srs.nlargest(3)

## Sorting by DataFrames

In [None]:
srdf = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
                    'b': list('abdceff'),
                    'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
srdf

In [None]:
srdf.nsmallest(3, 'a')

In [None]:
srdf.nsmallest(2, ['a', 'c'])

# Dtypes

In [None]:
dft = pd.DataFrame(dict(
                    A = np.random.rand(3),
                    B = 1,
                    C = 'foo',
                    D = pd.Timestamp('20010102'),
                    E = pd.Series([1.0]*3).astype('float32'),
                    F = pd.Series([3.0]*3).astype('float32'),
                    G = False,
                    H = pd.Series([1]*3,dtype='int8'))
                   
                  )
                  
dft

In [None]:
dft.dtypes

In [None]:
dft.A.dtype

In [None]:
dft.B

In [None]:
dft.B.astype('float64')

# Type Casting

In [None]:
dft[['A','B']].dtypes

In [None]:
dft[['A','B']].astype('float64').dtypes

In [None]:
dft1 = pd.DataFrame({'a': [1,0,1], 'b': [4,5,6], 'c': [7, 8, 9]})
dft1

In [None]:
dft1.dtypes

In [None]:
# Cast each column to its own target type. The built-in bool/str are used because the
# np.bool / np.str aliases were removed from NumPy and raise AttributeError there.
dft2 = dft1.astype({'a': bool, 'b': str, 'c': np.float64})
dft2

In [None]:
dft2.dtypes

## Numeric Typecasting

In [None]:
m = ['1', 2, 3]
pd.to_numeric(m, downcast='integer')

In [None]:
pd.to_numeric(m, downcast='float')

## Converting Object by inferring

In [None]:
import datetime

In [None]:
indf = pd.DataFrame([[1.0, 29.0],
                     ['a', 'b'],
                     [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]])
indf.T

In [None]:
indf.T.dtypes

In [None]:
indf.T.infer_objects().dtypes

## Converting Date

In [None]:
m = ['2016-07-09', datetime.datetime(2016, 3, 2)]
m

In [None]:
pd.to_datetime(m)

## Conversion Force and Coerce

In [None]:
# A list mixing a non-date string with a datetime object, used to compare error modes.
m = ['apple', datetime.datetime(2016, 3, 2)]
# Alternative error modes to try on the same input (left disabled):
#pd.to_datetime(m, errors='raise')
#pd.to_datetime(m, errors='coerce')
# NOTE(review): recent pandas releases deprecate errors='ignore' — confirm against the installed version.
pd.to_datetime(m, errors='ignore')

In [None]:
m = ['apple', 2, 3]
pd.to_numeric(m, errors='coerce')

# Converting Multi-dimensional Objects or DataFrames

In [None]:
cndf = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O')
cndf

In [None]:
cndf.dtypes

In [None]:
cndf.apply(pd.to_datetime)

In [None]:
indf = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O')
indf

In [None]:
indf.dtypes

In [None]:
indf = indf.apply(pd.to_numeric)
indf

In [None]:
indf.dtypes

In [None]:
to_float = lambda y: pd.to_numeric(y, downcast='float')
indf.loc[0].map(to_float)

In [None]:
fldf = indf.apply(to_float)
fldf

In [None]:
fldf.dtypes