# Pandas Documentation on Essential Basic Functionality

In this notebook, you will work through the Pandas documentation on DataFrames.

## Imports

In [1]:
import numpy as np
import pandas as pd

## Pandas essential basic functionality

In this notebook, you are going to learn how to use Pandas by typing the code from the Pandas documentation into this notebook.

* Go to the Pandas [Essential Basic Functionality](http://pandas.pydata.org/pandas-docs/stable/basics.html#essential-basic-functionality).
* Type all of the code from that section of the documentation into this notebook and get it working.
* **To learn this API well, you must type the code rather than copy and pasting it**.
* Create a new cell in this section for each `In[]` prompt in the documentation.
* Ignore the cells in the **Grading** section below.
* No Markdown comments are needed.
* Skip the following sub-sections:
  - Tablewise Function Application
  - Applying with a Panel

## Essential Basic Functionality

In [2]:
index = pd.date_range('1/1/2000', periods=8)

In [3]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [4]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=['A', 'B', 'C'])

In [5]:
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item', 'Item2'],
              major_axis=pd.date_range('1/1/2000', periods=5),
              minor_axis=['A','B','C','D'])

## Head and Tail

In [6]:
long_series = pd.Series(np.random.randn(1000))

In [7]:
long_series.head()

0   -1.012632
1   -1.006298
2    0.219777
3   -0.210161
4    0.585328
dtype: float64

In [8]:
long_series.tail(3)

997    1.292164
998   -1.134290
999    0.157486
dtype: float64

## Attributes and the raw ndarray(s)

In [9]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.726283,0.731682,0.056073
2000-01-02,0.199653,0.630034,-0.962489


In [10]:
df

Unnamed: 0,A,B,C
2000-01-01,0.726283,0.731682,0.056073
2000-01-02,0.199653,0.630034,-0.962489
2000-01-03,0.678175,0.124879,1.182354
2000-01-04,0.136702,-0.847291,-0.812993
2000-01-05,-0.457109,-0.919188,0.305105
2000-01-06,-1.23589,1.43763,1.34671
2000-01-07,0.901784,-0.385224,-1.898623
2000-01-08,-1.148091,0.679952,0.582157


In [11]:
s.values

array([-1.01481785, -1.91918135, -0.22235761,  0.70780851,  0.32604367])

In [12]:
df.values

array([[ 0.72628264,  0.73168202,  0.0560731 ],
       [ 0.19965311,  0.63003368, -0.96248894],
       [ 0.67817498,  0.12487871,  1.18235372],
       [ 0.13670217, -0.84729084, -0.81299284],
       [-0.45710901, -0.91918751,  0.30510452],
       [-1.23589029,  1.43762971,  1.34671046],
       [ 0.90178357, -0.3852241 , -1.89862286],
       [-1.14809067,  0.67995171,  0.58215714]])

In [13]:
wp.values

array([[[-0.05897734,  0.91937988, -0.03586404, -0.92064847],
        [-1.99784305,  0.44579165,  1.18223855, -0.36286159],
        [-0.01852719,  0.6075982 ,  1.0617201 , -0.05866304],
        [ 0.66123579, -0.4666836 ,  0.37690241,  0.86077195],
        [ 1.6404646 ,  0.41361473, -1.56603638,  1.22915029]],

       [[-2.03214396,  0.41556736, -1.3597927 ,  1.38052258],
        [ 0.12293049,  0.98445745, -0.6748634 , -2.11722585],
        [-0.66929729, -1.63646823, -0.53649436,  0.17439009],
        [ 0.2204793 ,  0.97320986, -0.22483989,  0.62891673],
        [-2.03793505, -0.5363651 ,  0.3003455 ,  0.65633306]]])

## Flexible Binary operations

__Matching / broadcasting behavior__

In [14]:
df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
                   'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
                   'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

In [15]:
df

Unnamed: 0,one,three,two
a,2.013593,,0.751052
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688
d,,0.411203,0.105523


In [16]:
row = df.ix[1]

In [17]:
column = df['two']

In [18]:
df.sub(row, axis='columns')

Unnamed: 0,one,three,two
a,2.641535,,-0.96601
b,0.0,0.0,0.0
c,0.850241,-0.157784,-2.34975
d,,0.493958,-1.611539


In [19]:
df.sub(row, axis=1)

Unnamed: 0,one,three,two
a,2.641535,,-0.96601
b,0.0,0.0,0.0
c,0.850241,-0.157784,-2.34975
d,,0.493958,-1.611539


In [20]:
df.sub(column, axis='index')

Unnamed: 0,one,three,two
a,1.26254,,0
b,-2.345005,-1.799817,0
c,0.854986,0.392149,0
d,,0.30568,0


In [21]:
df.sub(column, axis=0)

Unnamed: 0,one,three,two
a,1.26254,,0
b,-2.345005,-1.799817,0
c,0.854986,0.392149,0
d,,0.30568,0


In [22]:
df_orig = df

In [23]:
dfmi = df.copy()

In [24]:
dfmi.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], names=['first','second'])

In [25]:
dfmi.sub(column, axis=0, level='second')

Unnamed: 0_level_0,Unnamed: 1_level_0,one,three,two
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1.26254,,0.0
1,b,-2.345005,-1.799817,0.0
1,c,0.854986,0.392149,0.0
2,a,,-0.339849,-0.645529


In [26]:
major_mean = wp.mean(axis='major')

In [27]:
major_mean

Unnamed: 0,Item,Item2
A,0.045271,-0.879193
B,0.38394,0.04008
C,0.203792,-0.499129
D,0.14955,0.144587


In [28]:
wp.sub(major_mean, axis='major')

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
Items axis: Item to Item2
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
Minor_axis axis: A to D

__Missing data / operations with fill values__

In [29]:
df

Unnamed: 0,one,three,two
a,2.013593,,0.751052
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688
d,,0.411203,0.105523


In [30]:
df2 = df.copy()
df2['three']['a'] = 1.

In [31]:
df2

Unnamed: 0,one,three,two
a,2.013593,1.0,0.751052
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688
d,,0.411203,0.105523


In [32]:
df + df2

Unnamed: 0,one,three,two
a,4.027185,,1.502104
b,-1.255885,-0.16551,3.434125
c,0.444597,-0.481078,-1.265376
d,,0.822406,0.211047


In [33]:
df.add(df2, fill_value=0)

Unnamed: 0,one,three,two
a,4.027185,1.0,1.502104
b,-1.255885,-0.16551,3.434125
c,0.444597,-0.481078,-1.265376
d,,0.822406,0.211047


__Flexible Comparisons__

In [34]:
df.gt(df2)

Unnamed: 0,one,three,two
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [35]:
df2.ne(df)

Unnamed: 0,one,three,two
a,False,True,False
b,False,False,False
c,False,False,False
d,True,False,False


__Boolean Reductions__

In [36]:
(df > 0).all()

one      False
three    False
two      False
dtype: bool

In [37]:
(df > 0).any()

one      True
three    True
two      True
dtype: bool

In [38]:
(df > 0).any().any()

True

In [39]:
df.empty

False

In [40]:
pd.DataFrame(columns=list('ABC')).empty

True

In [41]:
pd.Series([True]).bool()

True

In [42]:
pd.Series([False]).bool()

False

In [43]:
pd.DataFrame([[True]]).bool()

True

In [44]:
pd.DataFrame([[False]]).bool()

False

__Comparing if objects are equivalent__

In [45]:
df+df == df*2

Unnamed: 0,one,three,two
a,True,False,True
b,True,True,True
c,True,True,True
d,False,True,True


In [46]:
(df+df == df*2).all()

one      False
three    False
two       True
dtype: bool

In [47]:
np.nan == np.nan

False

In [48]:
(df+df).equals(df*2)

True

In [49]:
df1 = pd.DataFrame({'col':['foo', 0, np.nan]})

In [50]:
df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0])

In [51]:
df1.equals(df2)

False

In [52]:
df1.equals(df2.sort_index())

True

__Comparing array-like objects__

In [53]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [54]:
pd.Index(['foo', 'bar', 'baz']) == 'foo'

array([ True, False, False], dtype=bool)

In [55]:
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [56]:
pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [57]:
np.array([1, 2, 3]) == np.array([2])

array([False,  True, False], dtype=bool)

In [58]:
np.array([1, 2, 3]) == np.array([1, 2])

  if __name__ == '__main__':


False

__Combining overlapping data sets__

In [59]:
df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
                    'B' : [np.nan, 2., 3., np.nan, 6.]})

In [60]:
df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
                    'B' : [np.nan, np.nan, 3., 4., 6., 8.]})

In [61]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [62]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [63]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1,
1,2,2.0
2,3,3.0
3,5,4.0
4,3,6.0
5,7,8.0


__General DataFrame Combine__

In [64]:
combiner = lambda x, y: np.where(pd.isnull(x), y, x)

In [65]:
df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1,
1,2,2.0
2,3,3.0
3,5,4.0
4,3,6.0
5,7,8.0


__Descriptive statistics__

In [66]:
df

Unnamed: 0,one,three,two
a,2.013593,,0.751052
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688
d,,0.411203,0.105523


In [67]:
df.mean(0)

one      0.535983
three    0.029303
two      0.485237
dtype: float64

In [68]:
df.mean(1)

a    1.382322
b    0.335455
c   -0.216976
d    0.258363
dtype: float64

In [69]:
df.sum(0, skipna=False)

one          NaN
three        NaN
two      1.94095
dtype: float64

In [70]:
df.sum(axis=1, skipna=True)

a    2.764645
b    1.006365
c   -0.650928
d    0.516726
dtype: float64

In [71]:
ts_stand = (df - df.mean()) / df.std()

In [72]:
ts_stand.std()

one      1
three    1
two      1
dtype: float64

In [73]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

In [74]:
xs_stand.std(1)

a    1
b    1
c    1
d    1
dtype: float64

In [75]:
df.cumsum()

Unnamed: 0,one,three,two
a,2.013593,,0.751052
b,1.38565,-0.082755,2.468115
c,1.607949,-0.323294,1.835427
d,,0.087909,1.94095


In [76]:
np.mean(df['one'])

0.53598289228789331

In [77]:
np.mean(df['one'].values)

nan

In [78]:
series = pd.Series(np.random.randn(500))

In [79]:
series[20:500] = np.nan

In [80]:
series[10:20]  = 5

In [81]:
series.nunique()

11

__Summarizing data: describe__

In [82]:
series = pd.Series(np.random.randn(1000))

In [83]:
series[::2] = np.nan

In [84]:
series.describe()

count    500.000000
mean      -0.030385
std        1.006184
min       -3.161036
25%       -0.679199
50%       -0.017242
75%        0.643329
max        2.744886
dtype: float64

In [85]:
frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])

In [86]:
frame.ix[::2] = np.nan

In [87]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,-0.023115,0.024379,-0.073375,-4.5e-05,0.02544
std,1.002731,1.050712,0.998835,1.029827,1.010868
min,-2.857595,-3.20465,-2.710512,-3.037705,-2.794397
25%,-0.67731,-0.639915,-0.749606,-0.694867,-0.59341
50%,-0.036083,-0.016974,-0.017464,-0.046257,0.051244
75%,0.73058,0.722143,0.606363,0.760688,0.684056
max,2.451378,2.98087,2.853529,3.653212,3.393298


In [88]:
series.describe(percentiles=[.05, .25, .75, .95])

count    500.000000
mean      -0.030385
std        1.006184
min       -3.161036
5%        -1.737810
25%       -0.679199
50%       -0.017242
75%        0.643329
95%        1.630219
max        2.744886
dtype: float64

In [89]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])

In [90]:
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [91]:
frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})

In [92]:
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [93]:
frame.describe(include=['object'])

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [94]:
frame.describe(include=['number'])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [95]:
frame.describe(include='all')

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


__Index of Min/Max Values__

In [96]:
s1 = pd.Series(np.random.randn(5))

In [97]:
s1

0   -1.601261
1    1.061794
2    0.944073
3   -0.335409
4    0.822411
dtype: float64

In [98]:
s1.idxmin(), s1.idxmax()

(0, 1)

In [99]:
df1 = pd.DataFrame(np.random.randn(5,3), columns=['A','B','C'])

In [100]:
df1

Unnamed: 0,A,B,C
0,-0.649976,-0.153917,0.475621
1,1.82971,-1.05832,0.626563
2,2.419253,-0.609758,-1.303332
3,-0.083329,0.176502,0.157461
4,-1.384206,1.179222,-0.591433


In [101]:
df1.idxmin(axis=0)

A    4
B    1
C    2
dtype: int64

In [102]:
df1.idxmax(axis=1)

0    C
1    A
2    A
3    B
4    B
dtype: object

In [103]:
df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))

In [104]:
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [105]:
df3['A'].idxmin()

'd'

__Value counts (histogramming) / Mode__

In [106]:
data = np.random.randint(0, 7, size=50)

In [107]:
data

array([5, 3, 1, 1, 1, 1, 6, 4, 0, 3, 6, 2, 3, 2, 4, 2, 0, 2, 6, 6, 4, 0, 4,
       0, 0, 6, 4, 1, 0, 3, 1, 6, 4, 3, 0, 1, 2, 4, 6, 1, 4, 2, 4, 1, 0, 1,
       5, 0, 5, 2])

In [108]:
s = pd.Series(data)


In [109]:
s.value_counts()

1    10
4     9
0     9
6     7
2     7
3     5
5     3
dtype: int64

In [110]:
pd.value_counts(data)

1    10
4     9
0     9
6     7
2     7
3     5
5     3
dtype: int64

In [111]:
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

In [112]:
s5.mode()

0    3
1    7
dtype: int64

In [113]:
df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
                    "B": np.random.randint(-10, 15, size=50)})

In [114]:
df5.mode()

Unnamed: 0,A,B
0,0,-9.0
1,3,


In [115]:
arr = np.random.randn(20)

In [116]:
factor = pd.cut(arr, 4)

In [117]:
factor

[(0.561, 1.539], (-1.399, -0.417], (-0.417, 0.561], (0.561, 1.539], (-0.417, 0.561], ..., (-0.417, 0.561], (0.561, 1.539], (-0.417, 0.561], (1.539, 2.517], (-0.417, 0.561]]
Length: 20
Categories (4, object): [(-1.399, -0.417] < (-0.417, 0.561] < (0.561, 1.539] < (1.539, 2.517]]

In [118]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])

In [119]:
factor

[(1, 5], (-1, 0], (0, 1], (1, 5], (0, 1], ..., (0, 1], (0, 1], (-1, 0], (1, 5], (-1, 0]]
Length: 20
Categories (4, object): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [120]:
arr = np.random.randn(30)

In [121]:
factor = pd.qcut(arr, [0, .25, .5, .75, 1])

In [122]:
factor

[(0.317, 0.554], [-1.727, -0.668], (0.317, 0.554], [-1.727, -0.668], [-1.727, -0.668], ..., (0.317, 0.554], (0.554, 1.771], (-0.668, 0.317], (-0.668, 0.317], (0.317, 0.554]]
Length: 30
Categories (4, object): [[-1.727, -0.668] < (-0.668, 0.317] < (0.317, 0.554] < (0.554, 1.771]]

In [123]:
pd.value_counts(factor)

(0.554, 1.771]      8
[-1.727, -0.668]    8
(0.317, 0.554]      7
(-0.668, 0.317]     7
dtype: int64

In [124]:
arr = np.random.randn(20)

In [125]:
factor = pd.cut(arr, [-np.inf, 0, np.inf])

In [126]:
factor

[(0, inf], (0, inf], (0, inf], (0, inf], (-inf, 0], ..., (-inf, 0], (-inf, 0], (-inf, 0], (-inf, 0], (-inf, 0]]
Length: 20
Categories (2, object): [(-inf, 0] < (0, inf]]

## Function application

__Row or Column-wise Function Application__

In [127]:
df.apply(np.mean)

one      0.535983
three    0.029303
two      0.485237
dtype: float64

In [128]:
df.apply(np.mean, axis=1)

a    1.382322
b    0.335455
c   -0.216976
d    0.258363
dtype: float64

In [129]:
df.apply(lambda x: x.max() - x.min())

one      2.641535
three    0.651742
two      2.349750
dtype: float64

In [130]:
df.apply(np.cumsum)

Unnamed: 0,one,three,two
a,2.013593,,0.751052
b,1.38565,-0.082755,2.468115
c,1.607949,-0.323294,1.835427
d,,0.087909,1.94095


In [131]:
df.apply(np.exp)

Unnamed: 0,one,three,two
a,7.490179,,2.119229
b,0.533689,0.920577,5.568148
c,1.248944,0.786204,0.531162
d,,1.508632,1.111292


In [132]:
tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], index=pd.date_range('1/1/2000', periods=1000))

In [133]:
tsdf.apply(lambda x: x.idxmax())

A   2000-07-09
B   2000-11-07
C   2000-05-12
dtype: datetime64[ns]

In [134]:
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.047903,-0.687480,-1.525116
2000-01-02,0.483231,-0.151852,-0.469704
2000-01-03,1.735149,-0.023914,-1.517146
2000-01-04,-0.928871,-0.174385,0.887859
2000-01-05,0.047423,-0.006196,-1.528965
2000-01-06,0.909521,-0.375846,-0.766279
2000-01-07,0.490138,-1.124167,-1.454395
2000-01-08,0.125900,0.436020,-1.334273
2000-01-09,-0.018062,0.721922,-0.981291
2000-01-10,-0.856237,-0.576300,-0.445980


In [135]:
tsdf.apply(pd.Series.interpolate)

Unnamed: 0,A,B,C
2000-01-01,-0.047903,-0.687480,-1.525116
2000-01-02,0.483231,-0.151852,-0.469704
2000-01-03,1.735149,-0.023914,-1.517146
2000-01-04,-0.928871,-0.174385,0.887859
2000-01-05,0.047423,-0.006196,-1.528965
2000-01-06,0.909521,-0.375846,-0.766279
2000-01-07,0.490138,-1.124167,-1.454395
2000-01-08,0.125900,0.436020,-1.334273
2000-01-09,-0.018062,0.721922,-0.981291
2000-01-10,-0.856237,-0.576300,-0.445980


__Applying elementwise Python functions__

In [136]:
df4 = df_orig.copy()

In [137]:
df4

Unnamed: 0,one,three,two
a,2.013593,,0.751052
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688
d,,0.411203,0.105523


In [138]:
f = lambda x: len(str(x))

In [139]:
df4['one'].map(f)

a    13
b    14
c    14
d     3
Name: one, dtype: int64

In [140]:
df4.applymap(f)

Unnamed: 0,one,three,two
a,13,3,13
b,14,16,13
c,14,15,15
d,3,14,14


In [141]:
s = pd.Series(['six', 'seven', 'six', 'seven', 'six'],
              index=['a', 'b', 'c', 'd', 'e'])

In [142]:
t = pd.Series({'six' : 6., 'seven' : 7.})

In [143]:
s

a      six
b    seven
c      six
d    seven
e      six
dtype: object

In [144]:
s.map(t)

a    6
b    7
c    6
d    7
e    6
dtype: float64

## Reindexing and altering labels

In [145]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [146]:
s

a   -0.981874
b   -0.077139
c   -0.364560
d   -0.203020
e   -0.627569
dtype: float64

In [147]:
s.reindex(['e', 'b', 'f', 'd'])

e   -0.627569
b   -0.077139
f         NaN
d   -0.203020
dtype: float64

In [148]:
df

Unnamed: 0,one,three,two
a,2.013593,,0.751052
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688
d,,0.411203,0.105523


In [149]:
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

Unnamed: 0,three,two,one
c,-0.240539,-0.632688,0.222298
f,,,
b,-0.082755,1.717062,-0.627942


In [150]:
rs = s.reindex(df.index)

In [151]:
rs

a   -0.981874
b   -0.077139
c   -0.364560
d   -0.203020
dtype: float64

In [152]:
rs.index is df.index

True

__Reindexing to align with another object__

In [153]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [154]:
df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [155]:
df.reindex_like(df2)

Unnamed: 0,A,B
0,,
1,,
2,,
3,,
4,,
5,,


__Aligning objects with each other with align__

In [156]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [157]:
s1 = s[:4]

In [158]:
s2 = s[1:]

In [159]:
s1.align(s2)

(a    2.235540
 b   -1.189665
 c    0.777799
 d    0.587212
 e         NaN
 dtype: float64, a         NaN
 b   -1.189665
 c    0.777799
 d    0.587212
 e   -1.819864
 dtype: float64)

In [160]:
s1.align(s2, join='inner')

(b   -1.189665
 c    0.777799
 d    0.587212
 dtype: float64, b   -1.189665
 c    0.777799
 d    0.587212
 dtype: float64)

In [161]:
s1.align(s2, join='left')

(a    2.235540
 b   -1.189665
 c    0.777799
 d    0.587212
 dtype: float64, a         NaN
 b   -1.189665
 c    0.777799
 d    0.587212
 dtype: float64)

In [162]:
df.align(df2, join='inner')

(Empty DataFrame
 Columns: []
 Index: [], Empty DataFrame
 Columns: []
 Index: [])

In [163]:
df.align(df2, join='inner', axis=0)

(Empty DataFrame
 Columns: [one, three, two]
 Index: [], Empty DataFrame
 Columns: [A, B]
 Index: [])

In [164]:
df.align(df2.ix[0], axis=1)

(    A   B       one     three       two
 a NaN NaN  2.013593       NaN  0.751052
 b NaN NaN -0.627942 -0.082755  1.717062
 c NaN NaN  0.222298 -0.240539 -0.632688
 d NaN NaN       NaN  0.411203  0.105523, A         5
 B       NaN
 one     NaN
 three   NaN
 two     NaN
 Name: 0, dtype: float64)

__Filling while reindexing__

In [165]:
rng = pd.date_range('1/3/2000', periods=8)

In [166]:
ts = pd.Series(np.random.randn(8), index=rng)

In [167]:
ts2 = ts[[0, 3, 6]]

In [168]:
ts

2000-01-03    0.404122
2000-01-04   -0.556469
2000-01-05    0.205125
2000-01-06   -1.375964
2000-01-07    0.335216
2000-01-08    1.419264
2000-01-09    0.725614
2000-01-10    2.279254
Freq: D, dtype: float64

In [169]:
ts2

2000-01-03    0.404122
2000-01-06   -1.375964
2000-01-09    0.725614
dtype: float64

In [170]:
ts2.reindex(ts.index)

2000-01-03    0.404122
2000-01-04         NaN
2000-01-05         NaN
2000-01-06   -1.375964
2000-01-07         NaN
2000-01-08         NaN
2000-01-09    0.725614
2000-01-10         NaN
Freq: D, dtype: float64

In [171]:
ts2.reindex(ts.index, method='ffill')

2000-01-03    0.404122
2000-01-04    0.404122
2000-01-05    0.404122
2000-01-06   -1.375964
2000-01-07   -1.375964
2000-01-08   -1.375964
2000-01-09    0.725614
2000-01-10    0.725614
Freq: D, dtype: float64

In [172]:
ts2.reindex(ts.index, method='bfill')

2000-01-03    0.404122
2000-01-04   -1.375964
2000-01-05   -1.375964
2000-01-06   -1.375964
2000-01-07    0.725614
2000-01-08    0.725614
2000-01-09    0.725614
2000-01-10         NaN
Freq: D, dtype: float64

In [173]:
ts2.reindex(ts.index, method='nearest')

2000-01-03    0.404122
2000-01-04    0.404122
2000-01-05   -1.375964
2000-01-06   -1.375964
2000-01-07   -1.375964
2000-01-08    0.725614
2000-01-09    0.725614
2000-01-10    0.725614
Freq: D, dtype: float64

In [174]:
ts2.reindex(ts.index).fillna(method='ffill')

2000-01-03    0.404122
2000-01-04    0.404122
2000-01-05    0.404122
2000-01-06   -1.375964
2000-01-07   -1.375964
2000-01-08   -1.375964
2000-01-09    0.725614
2000-01-10    0.725614
Freq: D, dtype: float64

__Limits on filling while reindexing__

In [175]:
ts2.reindex(ts.index, method='ffill', limit=1)

2000-01-03    0.404122
2000-01-04    0.404122
2000-01-05         NaN
2000-01-06   -1.375964
2000-01-07   -1.375964
2000-01-08         NaN
2000-01-09    0.725614
2000-01-10    0.725614
Freq: D, dtype: float64

In [176]:
ts2.reindex(ts.index, method='ffill', tolerance='1 day')

2000-01-03    0.404122
2000-01-04    0.404122
2000-01-05         NaN
2000-01-06   -1.375964
2000-01-07   -1.375964
2000-01-08         NaN
2000-01-09    0.725614
2000-01-10    0.725614
Freq: D, dtype: float64

__Dropping labels from an axis__

In [177]:
df

Unnamed: 0,one,three,two
a,2.013593,,0.751052
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688
d,,0.411203,0.105523


In [178]:
df.drop(['a', 'd'], axis=0)

Unnamed: 0,one,three,two
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688


In [179]:
df.drop(['one'], axis=1)

Unnamed: 0,three,two
a,,0.751052
b,-0.082755,1.717062
c,-0.240539,-0.632688
d,0.411203,0.105523


In [180]:
df.reindex(df.index.difference(['a', 'd']))

Unnamed: 0,one,three,two
b,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688


__Renaming / mapping labels__

In [181]:
s

a    2.235540
b   -1.189665
c    0.777799
d    0.587212
e   -1.819864
dtype: float64

In [182]:
s.rename(str.upper)

A    2.235540
B   -1.189665
C    0.777799
D    0.587212
E   -1.819864
dtype: float64

In [183]:
df.rename(columns={'one' : 'foo', 'two' : 'bar'},
          index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'})

Unnamed: 0,foo,three,bar
apple,2.013593,,0.751052
banana,-0.627942,-0.082755,1.717062
c,0.222298,-0.240539,-0.632688
durian,,0.411203,0.105523


__Iteration__

In [184]:
df = pd.DataFrame({'col1' : np.random.randn(3), 'col2' : np.random.randn(3)},
                  index=['a', 'b', 'c'])

In [185]:
for col in df:
    print(col)

col1
col2


In [186]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})

In [187]:
for index, row in df.iterrows():
    row['a'] = 10

In [188]:
df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


__iteritems__

In [189]:
for item, frame in wp.iteritems():
    print(item)
    print(frame)

Item
                   A         B         C         D
2000-01-01 -0.058977  0.919380 -0.035864 -0.920648
2000-01-02 -1.997843  0.445792  1.182239 -0.362862
2000-01-03 -0.018527  0.607598  1.061720 -0.058663
2000-01-04  0.661236 -0.466684  0.376902  0.860772
2000-01-05  1.640465  0.413615 -1.566036  1.229150
Item2
                   A         B         C         D
2000-01-01 -2.032144  0.415567 -1.359793  1.380523
2000-01-02  0.122930  0.984457 -0.674863 -2.117226
2000-01-03 -0.669297 -1.636468 -0.536494  0.174390
2000-01-04  0.220479  0.973210 -0.224840  0.628917
2000-01-05 -2.037935 -0.536365  0.300346  0.656333


__iterrows__

In [190]:
for row_index, row in df.iterrows():
    print('%s\n%s' % (row_index, row))

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


In [191]:
df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])


In [192]:
df_orig.dtypes

int        int64
float    float64
dtype: object

In [193]:
row = next(df_orig.iterrows())[1]

In [194]:
row

int      1.0
float    1.5
Name: 0, dtype: float64

In [195]:
row['int'].dtype

dtype('float64')

In [196]:
df_orig['int'].dtype

dtype('int64')

In [197]:
df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})


In [198]:
print(df2)

   x  y
0  1  4
1  2  5
2  3  6


In [199]:
print(df2.T)

   0  1  2
x  1  2  3
y  4  5  6


In [200]:
df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows()))

In [201]:
print(df2_t)

   0  1  2
x  1  2  3
y  4  5  6


__itertuples__

In [202]:
for row in df.itertuples():
    print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')


In [203]:
s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))

In [204]:
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [205]:
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [206]:
s.dt.second

0    12
1    12
2    12
3    12
dtype: int64

In [207]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [208]:
s[s.dt.day==2]

1   2013-01-02 09:10:12
dtype: datetime64[ns]

In [209]:
stz = s.dt.tz_localize('US/Eastern')

In [210]:
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [211]:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [212]:
s = pd.Series(pd.date_range('20130101', periods=4))

In [213]:
s

0   2013-01-01
1   2013-01-02
2   2013-01-03
3   2013-01-04
dtype: datetime64[ns]

In [214]:
s.dt.strftime('%Y/%m/%d')

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
dtype: object

In [215]:
s = pd.Series(pd.period_range('20130101', periods=4))

In [216]:
s

0   2013-01-01
1   2013-01-02
2   2013-01-03
3   2013-01-04
dtype: object

In [217]:
s.dt.strftime('%Y/%m/%d')

0    2013/01/01
1    2013/01/02
2    2013/01/03
3    2013/01/04
dtype: object

In [218]:
s = pd.Series(pd.period_range('20130101', periods=4, freq='D'))

In [219]:
s

0   2013-01-01
1   2013-01-02
2   2013-01-03
3   2013-01-04
dtype: object

In [220]:
s.dt.year

0    2013
1    2013
2    2013
3    2013
dtype: int64

In [221]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [222]:
s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s'))

In [223]:
s

0   1 days 00:00:05
1   1 days 00:00:06
2   1 days 00:00:07
3   1 days 00:00:08
dtype: timedelta64[ns]

In [224]:
s.dt.days

0    1
1    1
2    1
3    1
dtype: int64

In [225]:
s.dt.seconds

0    5
1    6
2    7
3    8
dtype: int64

In [226]:
s.dt.components

Unnamed: 0,days,hours,minutes,seconds,milliseconds,microseconds,nanoseconds
0,1,0,0,5,0,0,0
1,1,0,0,6,0,0,0
2,1,0,0,7,0,0,0
3,1,0,0,8,0,0,0


__Vectorized string methods__

In [227]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

Sorting

By Index

In [228]:
unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'],
                         columns=['three', 'two', 'one'])

In [229]:
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,,
b,,,
c,,,
d,,,


In [230]:
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,,,
c,,,
b,,,
a,,,


In [231]:
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,,,
d,,,
c,,,
b,,,


In [232]:
unsorted_df['three'].sort_index()

a   NaN
b   NaN
c   NaN
d   NaN
Name: three, dtype: float64

By Values

In [233]:
df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]})

In [234]:
df1.sort_values(by='two')

Unnamed: 0,one,three,two
0,2,5,1
2,1,3,2
1,1,4,3
3,1,2,4


In [235]:
df1[['one', 'two', 'three']].sort_values(by=['one','two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [236]:
s[2] = np.nan

In [237]:
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2     NaN
5     NaN
dtype: object

In [238]:
s.sort_values(na_position='first')

2     NaN
5     NaN
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: object

searchsorted

In [240]:
ser = pd.Series([1, 2, 3])

In [241]:
ser.searchsorted([0, 3])

array([0, 2])

In [242]:
ser.searchsorted([0, 4])

array([0, 3])

In [243]:
ser.searchsorted([1, 3], side='right')

array([1, 3])

In [244]:
ser.searchsorted([1, 3], side='left')

array([0, 2])

In [245]:
ser = pd.Series([3, 1, 2])

In [246]:
ser.searchsorted([0, 3], sorter=np.argsort(ser))

array([0, 2])

smallest / largest values

In [248]:
s = pd.Series(np.random.permutation(10))

In [249]:
s

0    1
1    2
2    3
3    9
4    8
5    7
6    0
7    5
8    4
9    6
dtype: int64

In [250]:
s.sort_values()

6    0
0    1
1    2
2    3
8    4
7    5
9    6
5    7
4    8
3    9
dtype: int64

In [251]:
s.nsmallest(3)

6    0
0    1
1    2
dtype: int64

In [252]:
s.nlargest(3)

3    9
4    8
5    7
dtype: int64

In [253]:
df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
                   'b': list('abdceff'),
                   'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})

In [254]:
df.nlargest(3, 'a')

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,


In [255]:
df.nlargest(5, ['a', 'c'])

Unnamed: 0,a,b,c
5,11,f,3.0
3,10,c,3.2
4,8,e,
2,1,d,4.0
1,-1,b,2.0


In [256]:
df.nsmallest(3, 'a')

Unnamed: 0,a,b,c
0,-2,a,1
1,-1,b,2
6,-1,f,4


In [257]:
df.nsmallest(5, ['a', 'c'])

Unnamed: 0,a,b,c
0,-2,a,1.0
1,-1,b,2.0
6,-1,f,4.0
2,1,d,4.0
4,8,e,


Sorting by a multi-index column

Copying

dtypes

In [258]:
dft = pd.DataFrame(dict(A = np.random.rand(3),
                           B = 1,
                           C = 'foo',
                       D = pd.Timestamp('20010102'),
                       E = pd.Series([1.0]*3).astype('float32'),
                                    F = False,
                                    G = pd.Series([1]*3,dtype='int8')))

In [259]:
dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.94452,1,foo,2001-01-02,1,False,1
1,0.67366,1,foo,2001-01-02,1,False,1
2,0.614796,1,foo,2001-01-02,1,False,1


In [260]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [261]:
dft['A'].dtype

dtype('float64')

In [262]:
pd.Series([1, 2, 3, 4, 5, 6.])

0    1
1    2
2    3
3    4
4    5
5    6
dtype: float64

In [263]:
pd.Series([1, 2, 3, 6., 'foo'])

0      1
1      2
2      3
3      6
4    foo
dtype: object

In [264]:
dft.get_dtype_counts()

bool              1
datetime64[ns]    1
float32           1
float64           1
int64             1
int8              1
object            1
dtype: int64

In [265]:
df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')

In [266]:
df1

Unnamed: 0,A
0,0.08455
1,1.393453
2,-0.081813
3,0.321553
4,-0.189804
5,0.289389
6,0.291065
7,-0.010302


In [267]:
df1.dtypes

A    float32
dtype: object

In [268]:
df2

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,6


In [269]:
df2.dtypes

x    int64
y    int64
dtype: object

defaults

In [270]:
pd.DataFrame([1, 2], columns=['a']).dtypes

a    int64
dtype: object

In [271]:
pd.DataFrame({'a': [1, 2]}).dtypes

a    int64
dtype: object

In [272]:
pd.DataFrame({'a': 1 }, index=list(range(2))).dtypes

a    int64
dtype: object

In [273]:
frame = pd.DataFrame(np.array([1, 2]))

In [274]:
df3 = df1.reindex_like(df2).fillna(value=0.0) + df2

In [275]:
df3

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,6


In [276]:
df3.dtypes

x    float64
y    float64
dtype: object

In [277]:
df3.values.dtype

dtype('float64')

astype

In [278]:
df3

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,6


In [279]:
df3.dtypes

x    float64
y    float64
dtype: object

In [280]:
df3.astype('float32').dtypes

x    float32
y    float32
dtype: object

In [281]:
df3['D'] = '1.'

In [282]:
df3['E'] = '1'

In [283]:
df3['D'] = df3['D'].astype('float16')

In [284]:
df3['E'] = df3['E'].astype('int32')

In [285]:
df3.dtypes

x    float64
y    float64
D    float16
E      int32
dtype: object

In [286]:
import datetime

In [287]:
s = pd.Series([datetime.datetime(2001,1,1,0,0),
             'foo', 1.0, 1, pd.Timestamp('20010104'),
              '20010105'], dtype='O')

In [288]:
s

0    2001-01-01 00:00:00
1                    foo
2                      1
3                      1
4    2001-01-04 00:00:00
5               20010105
dtype: object

In [289]:
s.convert_objects(convert_dates='coerce')

  if __name__ == '__main__':


0   2001-01-01
1          NaT
2          NaT
3          NaT
4   2001-01-04
5   2001-01-05
dtype: datetime64[ns]

gotchas

In [290]:
dfi = df3.astype('int32')

In [291]:
dfi['E'] = 1

In [292]:
dfi

Unnamed: 0,x,y,D,E
0,1,4,1,1
1,2,5,1,1
2,3,6,1,1


In [293]:
dfi.dtypes

x    int32
y    int32
D    int32
E    int64
dtype: object

In [294]:
casted = dfi[dfi>0]

In [295]:
casted

Unnamed: 0,x,y,D,E
0,1,4,1,1
1,2,5,1,1
2,3,6,1,1


In [296]:
casted.dtypes

x    int32
y    int32
D    int32
E    int64
dtype: object

In [297]:
dfa = df3.copy()

In [298]:
dfa['A'] = dfa['A'].astype('float32')

KeyError: 'A'

In [None]:
dfa.dtypes

In [None]:
casted = dfa[df2>0]

In [None]:
casted

In [None]:
casted.dtypes

In [None]:
Selecting columns based on dtype

In [None]:
df = pd.DataFrame({'string': list('abc'),
                      'int64': list(range(1, 4)),
                      'uint8': np.arange(3, 6).astype('u1'),
                      'float64': np.arange(4.0, 7.0),
                      'bool1': [True, False, True],
                      'bool2': [False, True, False],
                      'dates': pd.date_range('now', periods=3).values,
                      'category': pd.Series(list("ABC")).astype('category')})

In [None]:
df['tdeltas'] = df.dates.diff()

In [None]:
df['uint64'] = np.arange(3, 6).astype('u8')

In [None]:
df['other_dates'] = pd.date_range('20130101', periods=3).values

In [None]:
df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')

In [None]:
df

In [None]:
df.types

In [None]:
df.select_dtypes(include=[bool])

In [None]:
df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger'])

In [None]:
df.select_dtypes(include=['object'])

In [None]:
def subdtypes(dtype):
       subs = dtype.__subclasses__()
       if not subs:
           return dtype
       return [dtype, [subdtypes(dt) for dt in subs]]

In [None]:
subdtypes(np.generic)

## Grading

YOUR ANSWER HERE