In [617]:
import pandas as pd
import numpy as np

In [343]:
long_series = pd.Series(np.random.randn(1000))
long_series.head()

0    0.203155
1    0.237249
2   -2.124439
3    1.900297
4   -1.570391
dtype: float64

In [344]:
long_series.tail()

995    0.059337
996   -0.439746
997   -0.870583
998   -1.815125
999   -0.200987
dtype: float64

# Matching / broadcasting behavioR

In [345]:
df = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df

Unnamed: 0,one,two,three
a,-1.465796,0.276514,
b,1.0879,0.589693,-0.808588
c,-1.053355,-1.559783,2.299307
d,,0.740501,1.105122


In [346]:
row = df.iloc[1]
row

one      1.087900
two      0.589693
three   -0.808588
Name: b, dtype: float64

In [347]:
column = df['two']
column

a    0.276514
b    0.589693
c   -1.559783
d    0.740501
Name: two, dtype: float64

In [348]:
df.sub(row,axis='columns')

Unnamed: 0,one,two,three
a,-2.553696,-0.313179,
b,0.0,0.0,0.0
c,-2.141255,-2.149476,3.107895
d,,0.150809,1.913709


In [349]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,-2.553696,-0.313179,
b,0.0,0.0,0.0
c,-2.141255,-2.149476,3.107895
d,,0.150809,1.913709


In [350]:
df.sub(column,axis='index')

Unnamed: 0,one,two,three
a,-1.74231,0.0,
b,0.498207,0.0,-1.398281
c,0.506428,0.0,3.85909
d,,0.0,0.36462


In [351]:
df

Unnamed: 0,one,two,three
a,-1.465796,0.276514,
b,1.0879,0.589693,-0.808588
c,-1.053355,-1.559783,2.299307
d,,0.740501,1.105122


# Missing values / Fill values

In [352]:
df2 = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df2

Unnamed: 0,one,two,three
a,-0.820344,0.548197,
b,-1.46702,-0.048549,-1.063702
c,1.113924,1.063048,0.651743
d,,-0.651506,-0.911261


In [353]:
df2.add(df, fill_value=999)

Unnamed: 0,one,two,three
a,-2.28614,0.824711,
b,-0.37912,0.541144,-1.87229
c,0.060569,-0.496734,2.95105
d,,0.088995,0.193861


# Boolean Reductions

In [354]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [355]:
(df.loc['a']['two']) > -0.5

True

In [356]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [357]:
pd.Series([True]).bool()

True

In [358]:
pd.DataFrame([[True]]).bool()

True

# Comparing if objects are equivalen

In [359]:
df+df == df*2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [360]:
(df+df == df*2).all()

one      False
two       True
three    False
dtype: bool

### Series or DataFrame index needs to be in the same order for equality to be True:

In [361]:
df1 = pd.DataFrame({'col':['foo', 0, np.nan]})
df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0])
df1.equals(df2)

False

In [362]:
df1.equals(df2.sort_index())

True

## Comparing array-like objects

In [363]:
df.index == 'b'

array([False,  True, False, False])

In [364]:
np.array([1, 2, 3]) == np.array([2])

array([False,  True, False])

In [365]:
pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

## Combining overlapping data sets

In [366]:
df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
                    'B' : [np.nan, 2., 3., np.nan, 6.]})
df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
                    'B' : [np.nan, np.nan, 3., 4., 6., 8.]})

In [367]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [368]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [369]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


# Descriptive statistics

In [493]:
df3 = pd.DataFrame({'A' : [2.0, 2.0, 4.0, 0.0, 2.0],
                    'B' : [2.0, 3.0, np.nan, 5.0, 6.0],
                    'C' : [2.0, 3.0, 4.0, 5.0, 6.0]
                   }
                   ,index=list('vwxyz')
                  )
df3

Unnamed: 0,A,B,C
v,2.0,2.0,2.0
w,2.0,3.0,3.0
x,4.0,,4.0
y,0.0,5.0,5.0
z,2.0,6.0,6.0


### **__DataFrame: “index” (axis=0, default), “columns” (axis=1)__**

In [371]:
df3.mean(0)

A    2.0
B    4.0
C    4.0
dtype: float64

In [372]:
df3.std()

A    1.414214
B    1.825742
C    1.581139
dtype: float64

In [373]:
df3.mean(1)

v    2.000000
w    2.666667
x    4.000000
y    3.333333
z    4.666667
dtype: float64

In [374]:
df3.cumsum()

Unnamed: 0,A,B,C
v,2.0,2.0,2.0
w,4.0,5.0,5.0
x,8.0,,9.0
y,8.0,10.0,14.0
z,10.0,16.0,20.0


### **_missing data_**

In [375]:
df3.sum(1, skipna=False)

v     6.0
w     8.0
x     NaN
y    10.0
z    14.0
dtype: float64

# Summarizing data: describe

## Series

In [376]:
series = pd.Series(np.random.randn(10))

In [377]:
series[::2]

0   -1.381820
2    0.655257
4    0.703732
6   -1.137409
8    2.659941
dtype: float64

In [378]:
series[::2] = np.nan

In [379]:
series

0         NaN
1   -1.041444
2         NaN
3    0.033758
4         NaN
5   -1.426415
6         NaN
7   -0.632636
8         NaN
9    0.033374
dtype: float64

In [380]:
series.describe()

count    5.000000
mean    -0.606673
std      0.648361
min     -1.426415
25%     -1.041444
50%     -0.632636
75%      0.033374
max      0.033758
dtype: float64

In [381]:
series.describe(percentiles=[.25, .50, .95])

count    5.000000
mean    -0.606673
std      0.648361
min     -1.426415
25%     -1.041444
50%     -0.632636
95%      0.033681
max      0.033758
dtype: float64

## DataFrame

In [382]:
frame = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
frame.iloc[::2] = np.nan
frame

Unnamed: 0,a,b,c,d,e
0,,,,,
1,0.701258,0.020114,0.80036,0.246024,0.183152
2,,,,,
3,0.994427,1.017783,0.962466,-0.159989,0.451113
4,,,,,
5,-0.485004,0.496417,-1.062812,-0.263186,-0.274428
6,,,,,
7,1.806753,0.86769,-0.433447,0.485218,-1.112909
8,,,,,
9,-0.929969,-0.521327,-1.208512,0.457119,0.687029


In [383]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,5.0,5.0,5.0,5.0,5.0
mean,0.417493,0.376136,-0.188389,0.153037,-0.013208
std,1.115076,0.632454,1.020712,0.347377,0.710996
min,-0.929969,-0.521327,-1.208512,-0.263186,-1.112909
25%,-0.485004,0.020114,-1.062812,-0.159989,-0.274428
50%,0.701258,0.496417,-0.433447,0.246024,0.183152
75%,0.994427,0.86769,0.80036,0.457119,0.451113
max,1.806753,1.017783,0.962466,0.485218,0.687029


## INDEX min/max Value

In [384]:
df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [385]:
df3['A'].idxmin()

'd'

## Value counts (histogramming) / Mode

In [386]:
data = np.random.randint(0,7, size=50)
data

array([3, 6, 6, 6, 5, 4, 3, 1, 4, 2, 0, 4, 3, 6, 6, 2, 3, 6, 4, 3, 6, 0,
       5, 4, 3, 0, 6, 0, 0, 6, 3, 2, 0, 6, 0, 1, 0, 5, 4, 2, 2, 2, 5, 5,
       3, 0, 2, 1, 0, 1])

In [387]:
num = pd.value_counts(data)
num

6    10
0    10
3     8
2     7
4     6
5     5
1     4
dtype: int64

In [388]:
s = pd.Series(data)
s.value_counts()

6    10
0    10
3     8
2     7
4     6
5     5
1     4
dtype: int64

In [389]:
df6 = pd.DataFrame({'A' : [2, 2, 4, 0, 2],
                    'B' : [2, 3, 3, 5, 6],
                    'C' : [1, 3, 4, 2, 3]
                   })
df6

Unnamed: 0,A,B,C
0,2,2,1
1,2,3,3
2,4,3,4
3,0,5,2
4,2,6,3


In [390]:
df6.mode()

Unnamed: 0,A,B,C
0,2,3,3


### _Row or Column wise function application_

In [391]:
df6.apply(np.mean)

A    2.0
B    3.8
C    2.6
dtype: float64

In [392]:
df6.apply('mean', axis=1)

0    1.666667
1    2.666667
2    3.666667
3    2.333333
4    3.666667
dtype: float64

## Discretization

In [393]:
arr = np.random.randn(5)
arr

array([-0.82616275,  0.56455613, -1.23461921, -0.52214083, -0.46505074])

In [394]:
factor = pd.cut(arr,2)
factor

[(-1.236, -0.335], (-0.335, 0.565], (-1.236, -0.335], (-1.236, -0.335], (-1.236, -0.335]]
Categories (2, interval[float64]): [(-1.236, -0.335] < (-0.335, 0.565]]

# Row or Column-wise function 

**_functions can be applied along the axes of a DataFrame_**

_If the applied function returns a Series, the final output is a DataFrame. The columns match the index of the Series returned by the applied function._

_If the applied function returns any other type, the final output is a Series._


In [541]:
tsdf = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'],
                    index=pd.date_range('1/1/2000', periods=5))
tsdf

Unnamed: 0,A,B,C
2000-01-01,-1.955891,2.946301,-0.165224
2000-01-02,0.699038,-1.250334,1.16285
2000-01-03,0.483958,0.427391,0.988688
2000-01-04,-0.747193,-0.213244,0.306599
2000-01-05,-1.337005,0.953401,0.223081


In [540]:
tsdf.apply(lambda x: x.idxmax())

A   2000-01-01
B   2000-01-01
C   2000-01-01
dtype: datetime64[ns]

In [542]:
tsdf.iloc[[1,3]] = np.nan
tsdf

Unnamed: 0,A,B,C
2000-01-01,-1.955891,2.946301,-0.165224
2000-01-02,,,
2000-01-03,0.483958,0.427391,0.988688
2000-01-04,,,
2000-01-05,-1.337005,0.953401,0.223081


In [543]:
tsdf.apply(pd.Series.interpolate)

Unnamed: 0,A,B,C
2000-01-01,-1.955891,2.946301,-0.165224
2000-01-02,-0.735966,1.686846,0.411732
2000-01-03,0.483958,0.427391,0.988688
2000-01-04,-0.426523,0.690396,0.605885
2000-01-05,-1.337005,0.953401,0.223081


frame

In [398]:
frame.apply(pd.Series.interpolate)

Unnamed: 0,a,b,c,d,e
0,,,,,
1,0.701258,0.020114,0.80036,0.246024,0.183152
2,0.847843,0.518949,0.881413,0.043018,0.317133
3,0.994427,1.017783,0.962466,-0.159989,0.451113
4,0.254711,0.7571,-0.050173,-0.211588,0.088343
5,-0.485004,0.496417,-1.062812,-0.263186,-0.274428
6,0.660874,0.682054,-0.748129,0.111016,-0.693668
7,1.806753,0.86769,-0.433447,0.485218,-1.112909
8,0.438392,0.173182,-0.820979,0.471168,-0.21294
9,-0.929969,-0.521327,-1.208512,0.457119,0.687029


In [399]:
def subtract_and_divide(x, sub, divide=1):
    return (x - sub) / divide

tsdf.apply(subtract_and_divide, args=(5,), divide=3)

Unnamed: 0,A,B,C
2000-01-01,-1.500179,-1.833821,-1.523269
2000-01-02,-1.820354,-2.208186,-1.851775
2000-01-03,-1.78212,-1.943924,-1.781209
2000-01-04,-1.870922,-1.704624,-0.982538
2000-01-05,-1.685456,-1.628933,-1.170938


# Aggregation

In [400]:
df7 = pd.DataFrame({'A' : [2.0, 2.0, 4.0, 0.0, 2.0],
                    'B' : [2.0, 3.0, np.nan, 5.0, 6.0],
                    'C' : [2.0, 3.0, 4.0, 5.0, 6.0]
                   })
df7

Unnamed: 0,A,B,C
0,2.0,2.0,2.0
1,2.0,3.0,3.0
2,4.0,,4.0
3,0.0,5.0,5.0
4,2.0,6.0,6.0


In [401]:
df7.agg(np.sum)

A    10.0
B    16.0
C    20.0
dtype: float64

In [402]:
df7.agg('sum')

A    10.0
B    16.0
C    20.0
dtype: float64

In [403]:
df7.sum()

A    10.0
B    16.0
C    20.0
dtype: float64

In [404]:
df7.A.agg('sum')

10.0

In [405]:
df7.A.sum()

10.0

## Aggregation with functions

In [406]:
df7.agg(['sum'])

Unnamed: 0,A,B,C
sum,10.0,16.0,20.0


In [407]:
df7.agg(['sum','mean'])

Unnamed: 0,A,B,C
sum,10.0,16.0,20.0
mean,2.0,4.0,4.0


In [408]:
df7.agg(['sum', (lambda x: x.sum())])

Unnamed: 0,A,B,C
sum,10.0,16.0,20.0
<lambda>,10.0,16.0,20.0


In [409]:
def lambdasum(x):
    return x.sum()

df7.agg(['sum', lambdasum])

Unnamed: 0,A,B,C
sum,10.0,16.0,20.0
lambdasum,10.0,16.0,20.0


## Aggregating with a dictionary

In [410]:
df7.agg(({'A': 'sum', 'B': mymean}))

A    10.0
B     4.0
dtype: float64

In [411]:
df7.agg(({'A': ['sum','mean'], 'B': sum}))

Unnamed: 0,A,B
mean,2.0,
sum,10.0,16.0


# Custom describe

In [412]:
from functools import partial

In [413]:
q_25 = partial(pd.Series.quantile,q=0.25)
q_25.__name__ = '25% Percentile Rank'

tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', 'max'])

Unnamed: 0,A,B,C
count,5.0,5.0,5.0
mean,-0.195419,-0.591693,0.614163
std,0.438669,0.681502,1.134632
min,-0.612765,-1.624558,-0.555324
25% Percentile Rank,-0.461061,-0.831773,-0.343626
median,-0.34636,-0.501463,0.430194
max,0.499462,0.113202,2.052385


# Transform

### **_as numpy function_**

In [414]:
tsdf.transform(np.abs)

Unnamed: 0,A,B,C
2000-01-01,0.499462,0.501463,0.430194
2000-01-02,0.461061,1.624558,0.555324
2000-01-03,0.34636,0.831773,0.343626
2000-01-04,0.612765,0.113872,2.052385
2000-01-05,0.056369,0.113202,1.487186


In [415]:
np.abs(tsdf)

Unnamed: 0,A,B,C
2000-01-01,0.499462,0.501463,0.430194
2000-01-02,0.461061,1.624558,0.555324
2000-01-03,0.34636,0.831773,0.343626
2000-01-04,0.612765,0.113872,2.052385
2000-01-05,0.056369,0.113202,1.487186


### **_as string function_**

In [416]:
tsdf.transform('abs')

Unnamed: 0,A,B,C
2000-01-01,0.499462,0.501463,0.430194
2000-01-02,0.461061,1.624558,0.555324
2000-01-03,0.34636,0.831773,0.343626
2000-01-04,0.612765,0.113872,2.052385
2000-01-05,0.056369,0.113202,1.487186


### **_as user defined function_**

In [417]:
tsdf.transform( lambda x: x.abs())

Unnamed: 0,A,B,C
2000-01-01,0.499462,0.501463,0.430194
2000-01-02,0.461061,1.624558,0.555324
2000-01-03,0.34636,0.831773,0.343626
2000-01-04,0.612765,0.113872,2.052385
2000-01-05,0.056369,0.113202,1.487186


In [418]:
tsdf.A.transform(np.abs)

2000-01-01    0.499462
2000-01-02    0.461061
2000-01-03    0.346360
2000-01-04    0.612765
2000-01-05    0.056369
Freq: D, Name: A, dtype: float64

In [419]:
# Transform with dictionary

In [420]:
tsdf.transform({'A': np.abs, 'B': lambda x: x+2 })

Unnamed: 0,A,B
2000-01-01,0.499462,1.498537
2000-01-02,0.461061,0.375442
2000-01-03,0.34636,1.168227
2000-01-04,0.612765,1.886128
2000-01-05,0.056369,2.113202


In [429]:
#tsdf.transform({'A': np.abs, 'B': [lambda x: x+2, np.sum]})
tsdf.transform({'A': np.abs, 'B': [lambda x: x+1, np.cumsum]})


Unnamed: 0_level_0,A,B,B
Unnamed: 0_level_1,absolute,<lambda>,cumsum
2000-01-01,0.499462,0.498537,-0.501463
2000-01-02,0.461061,-0.624558,-2.126021
2000-01-03,0.34636,0.168227,-2.957793
2000-01-04,0.612765,0.886128,-3.071665
2000-01-05,0.056369,1.113202,-2.958464


# Element-wise function

**_taking a single value and returning a single value_**

## DataFrame element-wise

In [445]:
ewdf = pd.DataFrame({'A' : [2.32310, 12.0, 4222.0, 10.0, 22.0],
                    'B' : [2.01, 3.0, 3.2, 5.0, 6.0],
                    'C' : [2.1110, 3.0, 4.0, 5.0, 6.0]
                   })
ewdf

Unnamed: 0,A,B,C
0,2.3231,2.01,2.111
1,12.0,3.0,3.0
2,4222.0,3.2,4.0
3,10.0,5.0,5.0
4,22.0,6.0,6.0


In [847]:
f = lambda x: len(str(x))

ewdf['A'].map(f)

0    6
1    4
2    6
3    4
4    4
Name: A, dtype: int64

In [447]:
ewdf.applymap(f)

Unnamed: 0,A,B,C
0,6,4,5
1,4,3,3
2,6,3,3
3,4,3,3
4,4,3,3


## Series Element-wise

In [452]:
s = pd.Series(['six', 'five', 'six', 'seven', 'six'],
              index=['a', 'b', 'c', 'd', 'e'])
t = pd.Series({'six' : 6., 'seven' : 7.})
s

a      six
b     five
c      six
d    seven
e      six
dtype: object

In [453]:
s.map(t)

a    6.0
b    NaN
c    6.0
d    7.0
e    6.0
dtype: float64

# Reindexing and altering labels

## **_Reindex with Series_**

In [458]:
s = pd.Series(np.random.randn(5), index=list('abcde'))
s.head()

a   -1.350221
b    0.698930
c    1.372374
d    0.385574
e    1.083314
dtype: float64

In [460]:
s.reindex(['e','f','a'])

e    1.083314
f         NaN
a   -1.350221
dtype: float64

## **_Reindexing with DataFrames_**

In [463]:
ridf = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
ridf

Unnamed: 0,one,two,three
a,0.093605,-0.745032,
b,0.20382,-0.505482,-0.715585
c,-0.019924,-0.875984,-0.699092
d,,0.460544,1.232076


In [465]:
ridf.reindex(index=['d','a'],columns=['three','one'])

Unnamed: 0,three,one
d,1.232076,
a,,0.093605


In [467]:
ridf.reindex(list('da'), axis='index')

Unnamed: 0,one,two,three
d,,0.460544,1.232076
a,0.093605,-0.745032,


In [470]:
ridf.reindex(['three','one'], axis='columns')

Unnamed: 0,three,one
a,,0.093605
b,-0.715585,0.20382
c,-0.699092,-0.019924
d,1.232076,


In [472]:
ridf.reindex(ridf.columns, axis='columns')

Unnamed: 0,one,two,three
a,0.093605,-0.745032,
b,0.20382,-0.505482,-0.715585
c,-0.019924,-0.875984,-0.699092
d,,0.460544,1.232076


In [476]:
ridf2 = ridf.reindex(['three','one'], axis='columns')
ridf3 = ridf.reindex(ridf.columns, axis='columns')
ridf2.reindex_like(ridf3)

Unnamed: 0,one,two,three
a,0.093605,,
b,0.20382,,-0.715585
c,-0.019924,,-0.699092
d,,,1.232076


# Aligning objects

In [479]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s1 = s[:4]
s1

a    0.339599
b   -1.192416
c   -1.854193
d    0.161937
dtype: float64

In [481]:
s2 = s[1:]
s2

b   -1.192416
c   -1.854193
d    0.161937
e   -0.123446
dtype: float64

In [484]:
s3 = s1.align(s2)

# Filling while reindexing

| Method           | Action |
| ---------------- | ------- |
| pad / ffill      | Fill values forward |
| bfill / backfill | Fill values backward |
| nearest          | Fill from the nearest index value |

### _fillna() and interpolate() will not perform any checks on the order of the index._
### _reindex require that the indexes are ordered increasing or decreasing._

In [559]:
rng = pd.date_range('1/3/2000', periods=11, freq='3D')
ts = pd.Series(np.random.randn(11), index=rng)
ts

2000-01-03   -1.517976
2000-01-06    0.252059
2000-01-09    1.795468
2000-01-12   -0.752647
2000-01-15   -0.382224
2000-01-18    0.181094
2000-01-21    0.524812
2000-01-24    0.933996
2000-01-27   -0.951419
2000-01-30    0.063523
2000-02-02    1.588174
Freq: 3D, dtype: float64

In [562]:
ts2 = ts[[0, 5, 10]]
ts2

2000-01-03   -1.517976
2000-01-18    0.181094
2000-02-02    1.588174
dtype: float64

In [563]:
ts2.reindex(ts.index)

2000-01-03   -1.517976
2000-01-06         NaN
2000-01-09         NaN
2000-01-12         NaN
2000-01-15         NaN
2000-01-18    0.181094
2000-01-21         NaN
2000-01-24         NaN
2000-01-27         NaN
2000-01-30         NaN
2000-02-02    1.588174
Freq: 3D, dtype: float64

In [517]:
ts2.reindex(ts.index, method='ffill')

2000-01-03    0.119659
2000-01-04    0.119659
2000-01-05    0.119659
2000-01-06    0.119659
2000-01-07    0.119659
2000-01-08    1.116196
2000-01-09    1.116196
2000-01-10    1.116196
2000-01-11    1.116196
2000-01-12    1.116196
2000-01-13   -1.156516
Freq: D, dtype: float64

In [518]:
ts2.reindex(ts.index, method="bfill")

2000-01-03    0.119659
2000-01-04    1.116196
2000-01-05    1.116196
2000-01-06    1.116196
2000-01-07    1.116196
2000-01-08    1.116196
2000-01-09   -1.156516
2000-01-10   -1.156516
2000-01-11   -1.156516
2000-01-12   -1.156516
2000-01-13   -1.156516
Freq: D, dtype: float64

In [519]:
ts2.reindex(ts.index, method="nearest")

2000-01-03    0.119659
2000-01-04    0.119659
2000-01-05    0.119659
2000-01-06    1.116196
2000-01-07    1.116196
2000-01-08    1.116196
2000-01-09    1.116196
2000-01-10    1.116196
2000-01-11   -1.156516
2000-01-12   -1.156516
2000-01-13   -1.156516
Freq: D, dtype: float64

In [525]:
ts2.reindex(ts.index).fillna(method="bfill")

2000-01-03    0.119659
2000-01-04    1.116196
2000-01-05    1.116196
2000-01-06    1.116196
2000-01-07    1.116196
2000-01-08    1.116196
2000-01-09   -1.156516
2000-01-10   -1.156516
2000-01-11   -1.156516
2000-01-12   -1.156516
2000-01-13   -1.156516
Freq: D, dtype: float64

In [733]:
ts2.reindex(ts.index).fillna(value=10)

2000-01-03    -1.517976
2000-01-06    10.000000
2000-01-09    10.000000
2000-01-12    10.000000
2000-01-15    10.000000
2000-01-18     0.181094
2000-01-21    10.000000
2000-01-24    10.000000
2000-01-27    10.000000
2000-01-30    10.000000
2000-02-02     1.588174
Freq: 3D, dtype: float64

In [526]:
ts2.reindex(ts.index).interpolate()

2000-01-03    0.119659
2000-01-04    0.318966
2000-01-05    0.518274
2000-01-06    0.717581
2000-01-07    0.916889
2000-01-08    1.116196
2000-01-09    0.661654
2000-01-10    0.207112
2000-01-11   -0.247431
2000-01-12   -0.701973
2000-01-13   -1.156516
Freq: D, dtype: float64

### _tolerance specifies the maximum distance between the index and indexer values_

In [588]:
ts2.reindex(ts.index, method='nearest', tolerance='3 Day')

2000-01-03   -1.517976
2000-01-06   -1.517976
2000-01-09         NaN
2000-01-12         NaN
2000-01-15    0.181094
2000-01-18    0.181094
2000-01-21    0.181094
2000-01-24         NaN
2000-01-27         NaN
2000-01-30    1.588174
2000-02-02    1.588174
Freq: 3D, dtype: float64

In [589]:
ts2.reindex(ts.index, method='nearest', limit=2)

2000-01-03   -1.517976
2000-01-06   -1.517976
2000-01-09   -1.517976
2000-01-12    0.181094
2000-01-15    0.181094
2000-01-18    0.181094
2000-01-21    0.181094
2000-01-24    0.181094
2000-01-27    1.588174
2000-01-30    1.588174
2000-02-02    1.588174
Freq: 3D, dtype: float64

# Dropping and Renaming axis label

In [590]:
dpdf = pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                    'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                    'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
dpdf

Unnamed: 0,one,two,three
a,1.180187,1.105521,
b,-1.208745,-0.142912,0.428906
c,-1.257398,0.32781,-0.777233
d,,-0.280626,-0.476008


In [595]:
dpdf.drop(['a','d'], axis='index')

Unnamed: 0,one,two,three
b,-1.208745,-0.142912,0.428906
c,-1.257398,0.32781,-0.777233


In [593]:
dpdf.drop(['three'], axis='columns')

Unnamed: 0,one,two
a,1.180187,1.105521
b,-1.208745,-0.142912
c,-1.257398,0.32781
d,,-0.280626


In [597]:
dpdf.rename(columns={'one': 'foo', 'three': 'bar'},
            index={'a': 'apple', 'b': 'banana', 'd': 'durian'})

Unnamed: 0,foo,two,bar
apple,1.180187,1.105521,
banana,-1.208745,-0.142912,0.428906
c,-1.257398,0.32781,-0.777233
durian,,-0.280626,-0.476008


In [599]:
dpdf.rename({'one': 'foo', 'three': 'bar'}, axis='columns')

Unnamed: 0,foo,two,bar
a,1.180187,1.105521,
b,-1.208745,-0.142912,0.428906
c,-1.257398,0.32781,-0.777233
d,,-0.280626,-0.476008


# DateTime accessor

In [610]:
s = pd.Series(pd.date_range('20130101 09:10:12', periods=6))
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
4   2013-01-05 09:10:12
5   2013-01-06 09:10:12
dtype: datetime64[ns]

In [612]:
stz = s.dt.tz_localize('US/Eastern')
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
4   2013-01-05 09:10:12-05:00
5   2013-01-06 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [614]:
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [615]:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
4   2013-01-05 04:10:12-05:00
5   2013-01-06 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [616]:
s.dt.strftime('%Y/%m/%d')

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
4    2013/01/05
5    2013/01/06
dtype: object

# Sorting

In [645]:
sdf = pd.DataFrame({'A':[2,1,1,1],'C':[5,4,3,2], 'B':[1,3,2,4],})
sdf

Unnamed: 0,A,C,B
0,2,5,1
1,1,4,3
2,1,3,2
3,1,2,4


In [623]:
sdf.sort_values(by="one")

Unnamed: 0,one,three,two
1,1,4,3
2,1,3,2
3,1,2,4
0,2,5,1


In [648]:
sdf.sort_values(by=["A","B"])

Unnamed: 0,A,C,B
2,1,3,2
1,1,4,3
3,1,2,4
0,2,5,1


In [633]:
sdf.sort_index(ascending=False)

Unnamed: 0,A,C,B
3,1,2,4
2,1,3,2
1,1,4,3
0,2,5,1


In [628]:
sdf.sort_index(axis="columns")

Unnamed: 0,A,B,C
0,2,1,5
1,1,3,4
2,1,2,3
3,1,4,2


# Sort By Indexes and Values

In [679]:
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2),('b', 2), ('b', 1), ('b', 1)])
idx.names = ['first', 'second']
idx

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]],
           names=['first', 'second'])

In [689]:
np.arange
df_multi = pd.DataFrame({'A': np.arange(6, 0, -1),
                         'B': np.arange(9, 3, -1)
                        }
                        ,
                        index=idx)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,6,9
a,2,5,8
a,2,4,7
b,2,3,6
b,1,2,5
b,1,1,4


In [690]:
df_multi.sort_values(by=['second', 'A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
b,1,1,4
b,1,2,5
a,1,6,9
b,2,3,6
a,2,4,7
a,2,5,8


In [691]:
df_multi.sort_values(by=['A', 'B'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
b,1,1,4
b,1,2,5
b,2,3,6
a,2,4,7
a,2,5,8
a,1,6,9


In [697]:
srs = pd.Series(np.random.randn(20))
srs

0     1.741776
1     1.159691
2    -1.062805
3     0.286982
4    -0.697413
5    -0.177972
6    -0.066730
7     1.707703
8    -1.153216
9     1.091973
10   -0.196643
11   -2.497786
12    0.054492
13   -1.330765
14   -0.764562
15    1.682469
16   -0.001578
17    1.157609
18    0.224011
19   -1.970096
dtype: float64

## Sorting by Series

In [698]:
srs.nsmallest(4)

11   -2.497786
19   -1.970096
13   -1.330765
8    -1.153216
dtype: float64

In [701]:
srs.nlargest(3)

0     1.741776
7     1.707703
15    1.682469
dtype: float64

## Sorting by DataFrames

In [702]:
srdf = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
                    'b': list('abdceff'),
                    'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
srdf

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
2,1,d,4.0
3,10,c,3.2
4,8,e,
5,11,f,3.0
6,-1,f,4.0


In [703]:
srdf.nsmallest(3, 'a')

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0


In [706]:
srdf.nsmallest(2, ['a', 'c'])

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0


# Dtypes

In [727]:
dft = pd.DataFrame(dict(
                    A = np.random.rand(3),
                    B = 1,
                    C = 'foo',
                    D = pd.Timestamp('20010102'),
                    E = pd.Series([1.0]*3).astype('float32'),
                    F = pd.Series([3.0]*3).astype('float32'),
                    G = False,
                    H = pd.Series([1]*3,dtype='int8'))
                   
                  )
                  
dft

Unnamed: 0,A,B,C,D,E,F,G,H
0,0.846423,1,foo,2001-01-02,1.0,3.0,False,1
1,0.02332,1,foo,2001-01-02,1.0,3.0,False,1
2,0.883988,1,foo,2001-01-02,1.0,3.0,False,1


In [740]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F           float32
G              bool
H              int8
dtype: object

In [742]:
dft.A.dtype

dtype('float64')

In [752]:
dft.B

0    1
1    1
2    1
Name: B, dtype: int64

In [753]:
dft.B.astype('float64')

0    1.0
1    1.0
2    1.0
Name: B, dtype: float64

# Type Casting

In [745]:
dft[['A','B']].dtypes

A    float64
B      int64
dtype: object

In [756]:
dft[['A','B']].astype('float64').dtypes

A    float64
B    float64
dtype: object

In [757]:
dft1 = pd.DataFrame({'a': [1,0,1], 'b': [4,5,6], 'c': [7, 8, 9]})
dft1

Unnamed: 0,a,b,c
0,1,4,7
1,0,5,8
2,1,6,9


In [770]:
dft1.dtypes

a    int64
b    int64
c    int64
dtype: object

In [766]:
dft2 = dft1.astype({'a': np.bool, 'b': np.str ,'c': np.float64})
dft2

Unnamed: 0,a,b,c
0,True,4,7.0
1,False,5,8.0
2,True,6,9.0


In [767]:
dft2.dtypes

a       bool
b     object
c    float64
dtype: object

## Numerice Typecasting

In [816]:
m = ['1', 2, 3]
pd.to_numeric(m, downcast='integer')

array([1, 2, 3], dtype=int8)

In [817]:
pd.to_numeric(m, downcast='float')

array([1., 2., 3.], dtype=float32)

## Converting Object by inferring

In [792]:
import datetime

In [805]:
indf = pd.DataFrame([[1.0, 29.0],
                     ['a', 'b'],
                     [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]])
indf.T

Unnamed: 0,0,1,2
0,1,a,2016-03-02 00:00:00
1,29,b,2016-03-02 00:00:00


In [806]:
indf.T.dtypes

0    object
1    object
2    object
dtype: object

In [807]:
indf.T.infer_objects().dtypes

0           float64
1            object
2    datetime64[ns]
dtype: object

## Converting Date

In [808]:
m = ['2016-07-09', datetime.datetime(2016, 3, 2)]
m

['2016-07-09', datetime.datetime(2016, 3, 2, 0, 0)]

In [809]:
pd.to_datetime(m)

DatetimeIndex(['2016-07-09', '2016-03-02'], dtype='datetime64[ns]', freq=None)

## Conversion Force and Coerce

In [815]:
m = ['apple', datetime.datetime(2016, 3, 2)]
#pd.to_datetime(m, errors='raise')
#pd.to_datetime(m, errors='coerce')
pd.to_datetime(m, errors='ignore')

array(['apple', datetime.datetime(2016, 3, 2, 0, 0)], dtype=object)

In [814]:
m = ['apple', 2, 3]
pd.to_numeric(m, errors='coerce')

array([nan,  2.,  3.])

# Converting to Multi-dimention or DataFrames

In [818]:
cndf = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O')
cndf

Unnamed: 0,0,1
0,2016-07-09,2016-03-02 00:00:00
1,2016-07-09,2016-03-02 00:00:00


In [819]:
cndf.dtypes

0    object
1    object
dtype: object

In [820]:
cndf.apply(pd.to_datetime)

Unnamed: 0,0,1
0,2016-07-09,2016-03-02
1,2016-07-09,2016-03-02


In [853]:
indf = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O')
indf

Unnamed: 0,0,1,2
0,1.1,2,3
1,1.1,2,3


In [854]:
indf.dtypes

0    object
1    object
2    object
dtype: object

In [855]:
indf = indf.apply(pd.to_numeric)
indf

Unnamed: 0,0,1,2
0,1.1,2,3
1,1.1,2,3


In [886]:
indf.dtypes

0    float64
1      int64
2      int64
dtype: object

In [884]:
to_float = lambda y: pd.to_numeric(y, downcast='float')
indf.loc[0].map(to_float)

0    1.1
1    2.0
2    3.0
Name: 0, dtype: float64

In [892]:
fldf = indf.apply(to_float)
fldf

Unnamed: 0,0,1,2
0,1.1,2.0,3.0
1,1.1,2.0,3.0


In [893]:
fldf.dtypes

0    float32
1    float32
2    float32
dtype: object