# Essential basic functionality

In [38]:
import numpy as np

In [39]:
import pandas as pd

In [40]:
index = pd.date_range("1/1/2000", periods=8)

In [41]:
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [42]:
index2 = pd.date_range("1/1/2000", periods=8,freq='M')

In [43]:
index2

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-30',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31'],
              dtype='datetime64[ns]', freq='M')

In [44]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

In [45]:
s

a    1.894632
b    0.419211
c   -0.403931
d    0.622467
e    0.175273
dtype: float64

In [46]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])

In [47]:
df

Unnamed: 0,A,B,C
2000-01-01,0.658921,1.128701,0.15507
2000-01-02,0.722183,-0.414182,-0.079539
2000-01-03,-0.174725,1.704728,1.571684
2000-01-04,-0.464225,0.194052,-0.84896
2000-01-05,-0.409598,1.027104,-0.625165
2000-01-06,-1.184963,-0.279577,1.386408
2000-01-07,-1.097977,-0.26557,1.894756
2000-01-08,-0.985807,-0.849123,0.384304


In [48]:
long_series = pd.Series(np.random.randn(1000))

In [49]:
long_series.head()

0    1.154502
1   -1.489039
2   -0.478276
3    0.058297
4    0.038581
dtype: float64

In [50]:
long_series.tail(3)

997    1.058802
998    0.131188
999   -0.755543
dtype: float64

## Attributes and underlying data

In [51]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.658921,1.128701,0.15507
2000-01-02,0.722183,-0.414182,-0.079539


In [52]:
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [53]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [54]:
for x in df.columns:
    print(x.lower())

a
b
c


In [55]:
# Rewrite every column label in lower case (the labels here are plain strings).
df.columns = list(map(str.lower, df.columns))

In [56]:
df.columns

Index(['a', 'b', 'c'], dtype='object')

In [57]:
# Rewrite every column label in upper case (the labels here are plain strings).
df.columns = list(map(str.upper, df.columns))

In [58]:
df

Unnamed: 0,A,B,C
2000-01-01,0.658921,1.128701,0.15507
2000-01-02,0.722183,-0.414182,-0.079539
2000-01-03,-0.174725,1.704728,1.571684
2000-01-04,-0.464225,0.194052,-0.84896
2000-01-05,-0.409598,1.027104,-0.625165
2000-01-06,-1.184963,-0.279577,1.386408
2000-01-07,-1.097977,-0.26557,1.894756
2000-01-08,-0.985807,-0.849123,0.384304


In [59]:
s.array

<PandasArray>
[  1.894632286679628,  0.4192111345907947, -0.4039307906699935,
  0.6224666439802105,  0.1752726903520808]
Length: 5, dtype: float64

In [60]:
s.index.array

<PandasArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [61]:
s.to_numpy()

array([ 1.89463229,  0.41921113, -0.40393079,  0.62246664,  0.17527269])

In [62]:
np.asarray(s)

array([ 1.89463229,  0.41921113, -0.40393079,  0.62246664,  0.17527269])

In [63]:
ser = pd.Series(pd.date_range("2000", periods=2, tz="CET"))

In [64]:
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
      dtype=object)

In [65]:
ser

0   2000-01-01 00:00:00+01:00
1   2000-01-02 00:00:00+01:00
dtype: datetime64[ns, CET]

In [66]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [67]:
df.to_numpy()

array([[ 0.658921  ,  1.12870109,  0.15507021],
       [ 0.72218283, -0.4141818 , -0.07953886],
       [-0.1747248 ,  1.70472837,  1.57168411],
       [-0.46422534,  0.19405242, -0.8489599 ],
       [-0.40959828,  1.02710371, -0.62516514],
       [-1.18496304, -0.27957687,  1.38640844],
       [-1.09797735, -0.26556977,  1.89475576],
       [-0.98580651, -0.8491231 ,  0.38430415]])

## Accelerated operations

pandas 通过 numexpr 和 bottleneck 这两个库来加速某些类型的二元数值运算和布尔运算。下面两个选项分别控制是否启用它们（此处均设为 False，即关闭）。

In [68]:
pd.set_option("compute.use_bottleneck",False)

In [69]:
pd.set_option("compute.use_numexpr",False)

## Matching / broadcasting behavior

DataFrame 提供 add()、sub()、mul()、div() 方法以及对应的反向方法 radd()、rsub() 等，   
用于执行二元运算。就广播行为而言，最值得关注的是以 Series 作为输入的情形。   
使用这些方法时，可以通过 axis 关键字指定按索引（index）还是按列（columns）进行匹配：

In [70]:
# Build a frame from three Series whose labels only partly overlap: the
# result is indexed by the union of the labels ("a".."d") and positions a
# Series does not cover are filled with NaN.
df = pd.DataFrame(
    {
        "one": pd.Series([1, 2, 3], index=list("abc")),
        "two": pd.Series([2, 3, 4, 4], index=list("abcd")),
        "three": pd.Series([3, 2, 5], index=list("bcd")),
    }
)

In [71]:
df

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [72]:
row = df.iloc[1] # take the second row of df by position (the row labelled "b")

In [73]:
row

one      2.0
two      3.0
three    3.0
Name: b, dtype: float64

In [74]:
column = df["two"] # take the column labelled "two" (the second column)

In [75]:
column

a    2
b    3
c    4
d    4
Name: two, dtype: int64

In [76]:
# Subtract `row` from every row of df, matching on column labels.
# E.g. df's first row "a" (1.0, 2, NaN) minus row (2.0, 3.0, 3.0).
df.sub(row, axis="columns")

Unnamed: 0,one,two,three
a,-1.0,-1.0,
b,0.0,0.0,0.0
c,1.0,1.0,-1.0
d,,1.0,2.0


In [77]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,-1.0,-1.0,
b,0.0,0.0,0.0
c,1.0,1.0,-1.0
d,,1.0,2.0


In [78]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,-1.0,0,
b,-1.0,0,0.0
c,-1.0,0,-2.0
d,,0,1.0


In [79]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,-1.0,0,
b,-1.0,0,0.0
c,-1.0,0,-2.0
d,,0,1.0


DataFrame.sub(other, axis='columns', level=None, fill_value=None) 

获取 DataFrame 与 other 逐元素相减的结果（二元运算符 sub）。  
与 `dataframe - other` 等效，但支持用 fill_value 替换其中一个输入中的缺失数据。rsub 是其反向版本。  
这一组灵活的包装方法（add、sub、mul、div、floordiv、mod、pow）分别对应算术运算符：+、-、*、/、//、%、**。  

<font color='red'><strong>
axis：{0 或 'index'，1 或 'columns'}
指定按索引（0 或 'index'）还是按列（1 或 'columns'）进行匹配。
当输入为 Series 时，表示用哪个轴去匹配该 Series 的索引。
</strong></font>

level：int 或 label。沿指定层级进行广播，按传入的 MultiIndex 层级上的索引值进行匹配。

fill_value：float 或 None，默认为 None。
在计算之前，用该值填充已有的缺失值（NaN），
以及为使两个 DataFrame 成功对齐而新增的元素。
如果两个 DataFrame 对应位置上的数据都缺失，
则结果仍为缺失值。

In [80]:
dfmi = df.copy()

In [81]:
dfmi

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [82]:
dfmi.index = pd.MultiIndex.from_tuples([(1,"a"),(2,"b"),(3,"c"),(4,"a")],names=["first", "second"])

In [83]:
dfmi.index

MultiIndex([(1, 'a'),
            (2, 'b'),
            (3, 'c'),
            (4, 'a')],
           names=['first', 'second'])

In [84]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1.0,2,
2,b,2.0,3,3.0
3,c,3.0,4,2.0
4,a,,4,5.0


In [85]:
column

a    2
b    3
c    4
d    4
Name: two, dtype: int64

In [86]:
dfmi.sub(column, axis=0, level="second")

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,-1.0,0,
2,b,-1.0,0,0.0
3,c,-1.0,0,-2.0
4,a,,2,3.0


In [87]:
s = pd.Series(np.arange(10))

In [88]:
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [89]:
div, rem = divmod(s, 3)

In [90]:
div # quotient part of divmod(s, 3), i.e. the floor division s // 3

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int32

In [91]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int32

In [92]:
idx = pd.Index(np.arange(10))

In [93]:
idx

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [94]:
div, rem = divmod(idx, 3)

In [95]:
div

Int64Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int64')

In [96]:
rem

Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int64')

In [97]:
div, rem = divmod(s, [2,2,3,3,4,4,5,5,6,6])

In [98]:
div

0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    1
8    1
9    1
dtype: int32

In [99]:
rem

0    0
1    1
2    2
3    0
4    0
5    1
6    1
7    2
8    2
9    3
dtype: int32

## Missing data / operations with fill values

In [100]:
df

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [101]:
# Take an independent copy instead of binding a second name to the same
# object, so the two operands used below are separate frames (the same
# approach as `dfmi = df.copy()` earlier in this notebook).
# NOTE: df2 has NaN in exactly the same positions as df, so fill_value in the
# add() call below cannot fill anything -- when both inputs are missing at a
# position the result stays missing.
df2 = df.copy()

In [102]:
df2

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [103]:
df + df2

Unnamed: 0,one,two,three
a,2.0,4,
b,4.0,6,6.0
c,6.0,8,4.0
d,,8,10.0


In [104]:
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,2.0,4,
b,4.0,6,6.0
c,6.0,8,4.0
d,,8,10.0


## Flexible comparisons

Series 和 DataFrame 具有二元比较方法 eq、ne、lt、gt、le 和 ge，其行为与上述二元算术运算类似：

In [105]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [106]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


## Boolean reductions

可以使用 empty、any()、all() 和 bool() 这些归约操作，把数据汇总成一个布尔结果：

In [107]:
(df > 0).all()

one      False
two       True
three    False
dtype: bool

In [108]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [109]:
(df > 0).any().any()

True

In [110]:
df.empty

False

In [111]:
pd.DataFrame(columns=list("ABC")).empty

True

In [112]:
pd.Series([True]).bool()

True

In [113]:
pd.Series([False]).bool()

False

In [114]:
pd.DataFrame([True]).bool()

True

In [115]:
pd.DataFrame([False]).bool()

False

## Comparing if objects are equivalent

In [116]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [117]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [118]:
np.nan == np.nan

False

In [119]:
(df + df).equals(df * 2)

True

In [120]:
# Single column "col" mixing a string, an integer and a missing value.
df1 = pd.DataFrame(["foo", 0, np.nan], columns=["col"])

In [121]:
# Same values as df1 but stored in reverse order under a reversed index.
df2 = pd.DataFrame([np.nan, 0, "foo"], columns=["col"], index=[2, 1, 0])

In [122]:
df1.equals(df2)

False

In [123]:
df1.equals(df2.sort_index())

True

## Comparing array-like objects

In [124]:
pd.Series(["foo","bar","baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [125]:
pd.Index(["foo","bar","baz"]) == "foo"

array([ True, False, False])

In [126]:
pd.Series(["foo","bar","baz"]) == pd.Index(["foo","bar","qux"])

0     True
1     True
2    False
dtype: bool

In [127]:
pd.Series(["foo","bar","baz"]) == np.array(["foo","bar","qux"])

0     True
1     True
2    False
dtype: bool

In [128]:
# pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar'])
## Kept commented out: running it raises
## ValueError: Can only compare identically-labeled Series objects

In [129]:
# pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo'])
## Kept commented out: running it raises
## ValueError: Can only compare identically-labeled Series objects

In [130]:
np.array([1, 2, 3]) == np.array([2])

array([False,  True, False])

In [131]:
np.array([1, 2, 3]) == np.array([1, 2])

  """Entry point for launching an IPython kernel.


False

## Combining overlapping data sets

In [133]:
# First of two frames with partly overlapping rows and scattered NaN gaps.
df1 = pd.DataFrame(
    {
        "A": [1.0, np.nan, 3.0, 5.0, np.nan],
        "B": [np.nan, 2.0, 3.0, np.nan, 6.0],
    }
)

In [134]:
# Second frame: one row longer than df1, with NaN gaps in different places.
df2 = pd.DataFrame(
    {
        "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
        "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)

In [135]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [136]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0
