# Getting Started with pandas

In [1]:
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:
import numpy as np
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)

## Introduction to pandas Data Structures  Pandas数据结构简介

### Series

In [4]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
print(pd.Series)

<class 'pandas.core.series.Series'>


In [6]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [7]:
obj.index  # like range(4)

RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [9]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [10]:
arr1 = np.arange(100).reshape((20,5))

In [11]:
arr1

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54],
       [55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64],
       [65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74],
       [75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84],
       [85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94],
       [95, 96, 97, 98, 99]])

In [12]:
arr1.astype(np.float64)

array([[ 0.,  1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.],
       [10., 11., 12., 13., 14.],
       [15., 16., 17., 18., 19.],
       [20., 21., 22., 23., 24.],
       [25., 26., 27., 28., 29.],
       [30., 31., 32., 33., 34.],
       [35., 36., 37., 38., 39.],
       [40., 41., 42., 43., 44.],
       [45., 46., 47., 48., 49.],
       [50., 51., 52., 53., 54.],
       [55., 56., 57., 58., 59.],
       [60., 61., 62., 63., 64.],
       [65., 66., 67., 68., 69.],
       [70., 71., 72., 73., 74.],
       [75., 76., 77., 78., 79.],
       [80., 81., 82., 83., 84.],
       [85., 86., 87., 88., 89.],
       [90., 91., 92., 93., 94.],
       [95., 96., 97., 98., 99.]])

In [13]:
arr1

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54],
       [55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64],
       [65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74],
       [75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84],
       [85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94],
       [95, 96, 97, 98, 99]])

In [14]:
arr2 = arr1.astype(np.float64)
arr2

array([[ 0.,  1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.],
       [10., 11., 12., 13., 14.],
       [15., 16., 17., 18., 19.],
       [20., 21., 22., 23., 24.],
       [25., 26., 27., 28., 29.],
       [30., 31., 32., 33., 34.],
       [35., 36., 37., 38., 39.],
       [40., 41., 42., 43., 44.],
       [45., 46., 47., 48., 49.],
       [50., 51., 52., 53., 54.],
       [55., 56., 57., 58., 59.],
       [60., 61., 62., 63., 64.],
       [65., 66., 67., 68., 69.],
       [70., 71., 72., 73., 74.],
       [75., 76., 77., 78., 79.],
       [80., 81., 82., 83., 84.],
       [85., 86., 87., 88., 89.],
       [90., 91., 92., 93., 94.],
       [95., 96., 97., 98., 99.]])

In [15]:
arr3 = arr1.astype(np.int64)
arr3

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54],
       [55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64],
       [65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74],
       [75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84],
       [85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94],
       [95, 96, 97, 98, 99]], dtype=int64)

In [16]:
arr3 * arr3

array([[   0,    1,    4,    9,   16],
       [  25,   36,   49,   64,   81],
       [ 100,  121,  144,  169,  196],
       [ 225,  256,  289,  324,  361],
       [ 400,  441,  484,  529,  576],
       [ 625,  676,  729,  784,  841],
       [ 900,  961, 1024, 1089, 1156],
       [1225, 1296, 1369, 1444, 1521],
       [1600, 1681, 1764, 1849, 1936],
       [2025, 2116, 2209, 2304, 2401],
       [2500, 2601, 2704, 2809, 2916],
       [3025, 3136, 3249, 3364, 3481],
       [3600, 3721, 3844, 3969, 4096],
       [4225, 4356, 4489, 4624, 4761],
       [4900, 5041, 5184, 5329, 5476],
       [5625, 5776, 5929, 6084, 6241],
       [6400, 6561, 6724, 6889, 7056],
       [7225, 7396, 7569, 7744, 7921],
       [8100, 8281, 8464, 8649, 8836],
       [9025, 9216, 9409, 9604, 9801]], dtype=int64)

In [17]:
arr3 ** arr3

array([[                   1,                    1,                    4,
                          27,                  256],
       [                3125,                46656,               823543,
                    16777216,            387420489],
       [         10000000000,         285311670611,        8916100448256,
             302875106592253,    11112006825558016],
       [  437893890380859375,                    0, -2863221430593058543,
         -497033925936021504,  6353754964178307979],
       [-2101438300051996672, -1595931050845505211,  4981753131911086080,
         8450172506621111015,                    0],
       [-6776596920136667815, -1123307876295639040, -5278486589563110205,
        -6845471433603153920,  5529354540715494413],
       [ 2565992168703393792, -4642015662142636065,                    0,
        -4053175462519618527, -5296008294479953920],
       [ 8407224849895527163,                    0, -4689350456247753643,
         6543552412563537920, -879564

In [18]:
arr3 + arr3

array([[  0,   2,   4,   6,   8],
       [ 10,  12,  14,  16,  18],
       [ 20,  22,  24,  26,  28],
       [ 30,  32,  34,  36,  38],
       [ 40,  42,  44,  46,  48],
       [ 50,  52,  54,  56,  58],
       [ 60,  62,  64,  66,  68],
       [ 70,  72,  74,  76,  78],
       [ 80,  82,  84,  86,  88],
       [ 90,  92,  94,  96,  98],
       [100, 102, 104, 106, 108],
       [110, 112, 114, 116, 118],
       [120, 122, 124, 126, 128],
       [130, 132, 134, 136, 138],
       [140, 142, 144, 146, 148],
       [150, 152, 154, 156, 158],
       [160, 162, 164, 166, 168],
       [170, 172, 174, 176, 178],
       [180, 182, 184, 186, 188],
       [190, 192, 194, 196, 198]], dtype=int64)

In [19]:
arr3 + arr3*10 - 20

array([[ -20,   -9,    2,   13,   24],
       [  35,   46,   57,   68,   79],
       [  90,  101,  112,  123,  134],
       [ 145,  156,  167,  178,  189],
       [ 200,  211,  222,  233,  244],
       [ 255,  266,  277,  288,  299],
       [ 310,  321,  332,  343,  354],
       [ 365,  376,  387,  398,  409],
       [ 420,  431,  442,  453,  464],
       [ 475,  486,  497,  508,  519],
       [ 530,  541,  552,  563,  574],
       [ 585,  596,  607,  618,  629],
       [ 640,  651,  662,  673,  684],
       [ 695,  706,  717,  728,  739],
       [ 750,  761,  772,  783,  794],
       [ 805,  816,  827,  838,  849],
       [ 860,  871,  882,  893,  904],
       [ 915,  926,  937,  948,  959],
       [ 970,  981,  992, 1003, 1014],
       [1025, 1036, 1047, 1058, 1069]], dtype=int64)

In [20]:
obj2['a']

-5

In [21]:
obj2['d'] = 6
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [22]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [23]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [28]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [25]:
'b' in obj2

True

In [26]:
'e' in obj2

False

In [27]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [29]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [30]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [31]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [32]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [33]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [34]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [35]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [36]:
obj3 * obj4

California             NaN
Ohio          1.225000e+09
Oregon        2.560000e+08
Texas         5.041000e+09
Utah                   NaN
dtype: float64

In [37]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [38]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [40]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

# Alex's Learning Codes

In [42]:
list1 = list(range(-5,6))
list1

[-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]

In [44]:
series1 = pd.Series(list1)
series1

0    -5
1    -4
2    -3
3    -2
4    -1
5     0
6     1
7     2
8     3
9     4
10    5
dtype: int64

In [46]:
print(series1.index, series1.values)

RangeIndex(start=0, stop=11, step=1) [-5 -4 -3 -2 -1  0  1  2  3  4  5]


### DataFrame

In [47]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [48]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [49]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [50]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [51]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [52]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [53]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [54]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [55]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [56]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [57]:
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [58]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [59]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [60]:
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [61]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [62]:
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [63]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [64]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [65]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [66]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [67]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [68]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

### Index Objects 索引对象

In [69]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj

a    0
b    1
c    2
dtype: int64

In [70]:
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [71]:
index[1:]

Index(['b', 'c'], dtype='object')

In [72]:
obj3 = pd.Series(['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],index=range(8))

In [73]:
obj3

0    aaaa
1    bbbb
2    cccc
3    dddd
4    eeee
5    ffff
6    gggg
7    hhhh
dtype: object

In [74]:
obj4 = pd.Series([['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']，['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']，['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']，['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']，['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']，['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']，['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']，['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']],index=range(8))

SyntaxError: invalid character in identifier (<ipython-input-74-72ce33421662>, line 1)

In [75]:
obj3 = pd.Series([['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']],index=range(8))

ValueError: Length of passed values is 2, index implies 8

In [76]:
obj3 = pd.Series([['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],,['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']],index=range(8))

SyntaxError: invalid syntax (<ipython-input-76-63dde167130d>, line 1)

In [77]:
obj3 = pd.Series({['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],,['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']},index=range(8))

SyntaxError: invalid syntax (<ipython-input-77-70c5df677d2c>, line 1)

obj3 = pd.Series([['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh'],['aaaa','bbbb','cccc','dddd','eeee','ffff','gggg','hhhh']],index=range(8))

index[1] = 'd'  # TypeError

In [78]:
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [79]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [80]:
obj2.index is labels

True

In [81]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [83]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [84]:
'Ohio' in frame3.columns

True

In [87]:
2003 in frame3.index

False

In [88]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## Essential Functionality

### Reindexing

In [89]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [90]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [92]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [93]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [94]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [95]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [96]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [97]:
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


### Dropping Entries from an Axis

In [98]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [99]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [100]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [101]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [102]:
data.T

Unnamed: 0,Ohio,Colorado,Utah,New York
one,0,4,8,12
two,1,5,9,13
three,2,6,10,14
four,3,7,11,15


In [103]:
data  * data.T

Unnamed: 0,Colorado,New York,Ohio,Utah,four,one,three,two
Colorado,,,,,,,,
New York,,,,,,,,
Ohio,,,,,,,,
Utah,,,,,,,,
four,,,,,,,,
one,,,,,,,,
three,,,,,,,,
two,,,,,,,,


In [104]:
data.T * data

Unnamed: 0,Colorado,New York,Ohio,Utah,four,one,three,two
Colorado,,,,,,,,
New York,,,,,,,,
Ohio,,,,,,,,
Utah,,,,,,,,
four,,,,,,,,
one,,,,,,,,
three,,,,,,,,
two,,,,,,,,


In [105]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [106]:
data.drop('two', axis=1)
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [107]:
obj.drop('c', inplace=True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

### Indexing, Selection, and Filtering 索引，选择，和过滤

In [108]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj


a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [109]:
obj['b']

1.0

In [110]:
obj[1]

1.0

In [111]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [112]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [113]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [None]:
obj[2:4]
obj[['b', 'a', 'd']]
obj[[1, 3]]
obj[obj < 2]

In [114]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [115]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [116]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [117]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [118]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [119]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [120]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [121]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [122]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [123]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### Selection with loc and iloc

In [None]:
data.loc['Colorado', ['two', 'three']]

In [None]:
data.iloc[2, [3, 0, 1]]
data.iloc[2]
data.iloc[[1, 2], [3, 0, 1]]

In [None]:
data.loc[:'Utah', 'two']
data.iloc[:, :3][data.three > 5]

### Integer Indexes

ser = pd.Series(np.arange(3.))
ser
ser[-1]

In [None]:
ser = pd.Series(np.arange(3.))

In [None]:
ser

In [None]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]

In [None]:
ser[:1]
ser.loc[:1]
ser.iloc[:1]

### Arithmetic and Data Alignment

In [None]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s1
s2

In [None]:
s1 + s2

In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1
df2

In [None]:
df1 + df2

In [None]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df1
df2
df1 - df2

#### Arithmetic methods with fill values

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
df1
df2

In [None]:
df1 + df2

In [None]:
df1.add(df2, fill_value=0)

In [None]:
1 / df1
df1.rdiv(1)

In [None]:
df1.reindex(columns=df2.columns, fill_value=0)

#### Operations between DataFrame and Series

In [None]:
arr = np.arange(12.).reshape((3, 4))
arr
arr[0]
arr - arr[0]

In [None]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame
series

In [None]:
frame - series

In [None]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

In [None]:
series3 = frame['d']
frame
series3
frame.sub(series3, axis='index')

### Function Application and Mapping

In [None]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
np.abs(frame)

In [None]:
f = lambda x: x.max() - x.min()
frame.apply(f)

In [None]:
frame.apply(f, axis='columns')

In [None]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

In [None]:
format = lambda x: '%.2f' % x
frame.applymap(format)

In [None]:
frame['e'].map(format)

### Sorting and Ranking

In [None]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

In [None]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()
frame.sort_index(axis=1)

In [None]:
frame.sort_index(axis=1, ascending=False)

In [None]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

In [None]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

In [None]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_values(by='b')

In [None]:
frame.sort_values(by=['a', 'b'])

In [None]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

In [None]:
obj.rank(method='first')

In [None]:
# Assign tie values the maximum rank in the group
obj.rank(ascending=False, method='max')

In [None]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame
frame.rank(axis='columns')

### Axis Indexes with Duplicate Labels

In [None]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

In [None]:
obj.index.is_unique

In [None]:
obj['a']
obj['c']

In [None]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
df.loc['b']

## Summarizing and Computing Descriptive Statistics

In [None]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

In [None]:
df.sum()

In [None]:
df.sum(axis='columns')

In [None]:
df.mean(axis='columns', skipna=False)

In [None]:
df.idxmax()

In [None]:
df.cumsum()

In [None]:
df.describe()

In [None]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

### Correlation and Covariance

conda install pandas-datareader

In [None]:
price = pd.read_pickle('examples/yahoo_price.pkl')
volume = pd.read_pickle('examples/yahoo_volume.pkl')

import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})

In [None]:
returns = price.pct_change()
returns.tail()

In [None]:
returns['MSFT'].corr(returns['IBM'])
returns['MSFT'].cov(returns['IBM'])

In [None]:
returns.MSFT.corr(returns.IBM)

In [None]:
returns.corr()
returns.cov()

In [None]:
returns.corrwith(returns.IBM)

In [None]:
returns.corrwith(volume)

### Unique Values, Value Counts, and Membership

In [None]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [None]:
uniques = obj.unique()
uniques

In [None]:
obj.value_counts()

In [None]:
pd.value_counts(obj.values, sort=False)

In [None]:
obj
mask = obj.isin(['b', 'c'])
mask
obj[mask]

In [None]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

In [None]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

In [None]:
result = data.apply(pd.value_counts).fillna(0)
result

## Conclusion

In [None]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS