# Essential basic functionality

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Eight consecutive calendar days starting on 2000-01-01 (daily frequency).
index = pd.date_range(start="1/1/2000", periods=8)

In [4]:
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Eight month-end timestamps beginning with January 2000 ('M' = month-end frequency).
# NOTE(review): newer pandas releases (2.2+) rename this alias to 'ME' — confirm the target pandas version.
index2 = pd.date_range("1/1/2000", periods=8,freq='M')

In [6]:
index2

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-30',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31'],
              dtype='datetime64[ns]', freq='M')

In [7]:
# Five standard-normal draws labelled "a" through "e" (no fixed seed, so values vary per run).
s = pd.Series(
    data=np.random.randn(5),
    index=list("abcde"),
)

In [8]:
s

a   -1.064462
b    0.303048
c    0.674209
d    0.860001
e    0.355765
dtype: float64

In [9]:
# 8x3 frame of standard-normal values: one row per date in `index`, columns A/B/C.
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=index,
    columns=list("ABC"),
)

In [10]:
df

Unnamed: 0,A,B,C
2000-01-01,-0.548604,0.200052,1.697537
2000-01-02,0.029709,0.097369,-0.898586
2000-01-03,-1.139799,-0.518729,1.015617
2000-01-04,0.043525,1.162587,-1.194197
2000-01-05,-0.506127,0.293115,1.216037
2000-01-06,-0.297148,0.442957,1.239084
2000-01-07,-0.047924,-0.253873,-1.176359
2000-01-08,-0.764993,0.053302,1.339386


In [11]:
# 1000 standard-normal samples on the default integer index (used for head/tail demos).
long_series = pd.Series(data=np.random.randn(1000))

In [12]:
long_series.head()

0    1.545888
1    0.494468
2   -0.160815
3   -0.012195
4   -0.144311
dtype: float64

In [13]:
long_series.tail(3)

997    0.917896
998    0.729370
999    2.641425
dtype: float64

## Attributes and underlying data

In [14]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,-0.548604,0.200052,1.697537
2000-01-02,0.029709,0.097369,-0.898586


In [15]:
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [16]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [17]:
# Print every column label in lower case, one label per line.
for label in map(str.lower, df.columns):
    print(label)

a
b
c


In [18]:
# Rename the columns in place to their lower-case form.
df.columns = list(map(str.lower, df.columns))

In [19]:
df.columns

Index(['a', 'b', 'c'], dtype='object')

In [20]:
# Restore the upper-case column labels.
df.columns = list(map(str.upper, df.columns))

In [21]:
df

Unnamed: 0,A,B,C
2000-01-01,-0.548604,0.200052,1.697537
2000-01-02,0.029709,0.097369,-0.898586
2000-01-03,-1.139799,-0.518729,1.015617
2000-01-04,0.043525,1.162587,-1.194197
2000-01-05,-0.506127,0.293115,1.216037
2000-01-06,-0.297148,0.442957,1.239084
2000-01-07,-0.047924,-0.253873,-1.176359
2000-01-08,-0.764993,0.053302,1.339386


In [22]:
s.array

<PandasArray>
[-1.0644616284273323,  0.3030479172315779,  0.6742086975225658,
  0.8600011033005774,  0.3557652768730419]
Length: 5, dtype: float64

In [23]:
s.index.array

<PandasArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [24]:
s.to_numpy()

array([-1.06446163,  0.30304792,  0.6742087 ,  0.8600011 ,  0.35576528])

In [25]:
np.asarray(s)

array([-1.06446163,  0.30304792,  0.6742087 ,  0.8600011 ,  0.35576528])

In [26]:
# Two consecutive days starting 2000-01-01, timezone-aware in CET.
ser = pd.Series(
    pd.date_range(start="2000", periods=2, tz="CET")
)

In [27]:
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
      dtype=object)

In [28]:
ser

0   2000-01-01 00:00:00+01:00
1   2000-01-02 00:00:00+01:00
dtype: datetime64[ns, CET]

In [29]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [30]:
df.to_numpy()

array([[-0.54860387,  0.20005227,  1.69753721],
       [ 0.02970861,  0.09736905, -0.89858551],
       [-1.1397986 , -0.51872914,  1.01561704],
       [ 0.04352531,  1.16258728, -1.19419746],
       [-0.50612669,  0.29311452,  1.21603683],
       [-0.29714797,  0.44295734,  1.23908414],
       [-0.04792374, -0.25387252, -1.17635901],
       [-0.76499281,  0.0533022 ,  1.33938601]])

## Accelerated operations

pandas可以借助numexpr和bottleneck这两个库来加速某些类型的二元数值运算和布尔运算。下面的选项用于控制是否启用它们：

In [31]:
# Switch off the bottleneck-backed acceleration for this session.
pd.options.compute.use_bottleneck = False

In [32]:
# Switch off the numexpr-backed acceleration for this session.
pd.options.compute.use_numexpr = False

## Matching / broadcasting behavior

DataFrame有add()、sub()、mul()、div()方法以及相关的反向函数radd()、rsub()、…，用于执行二元运算。   
就广播行为而言，主要关注的是Series输入。   
使用这些函数时，可以通过axis关键字选择按索引（index）还是按列（columns）进行匹配：

In [33]:
# Three Series with partially overlapping labels; the DataFrame constructor
# aligns them on the union of their indexes and leaves NaN where a label is absent.
df = pd.DataFrame(
    {
        "one": pd.Series([1, 2, 3], index=["a", "b", "c"]),
        "two": pd.Series([2, 3, 4, 4], index=["a", "b", "c", "d"]),
        "three": pd.Series([3, 2, 5], index=["b", "c", "d"]),
    }
)

In [34]:
df

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [35]:
row = df.iloc[1] # take the second row (label "b")

In [36]:
row

one      2.0
two      3.0
three    3.0
Name: b, dtype: float64

In [37]:
column = df["two"] # take the second column ("two")

In [38]:
column

a    2
b    3
c    4
d    4
Name: two, dtype: int64

In [39]:
df.sub(row, axis="columns") # subtract row from every row of df, e.g. row a (1.0, 2, NaN) minus row (2.0, 3.0, 3.0)

Unnamed: 0,one,two,three
a,-1.0,-1.0,
b,0.0,0.0,0.0
c,1.0,1.0,-1.0
d,,1.0,2.0


In [40]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,-1.0,-1.0,
b,0.0,0.0,0.0
c,1.0,1.0,-1.0
d,,1.0,2.0


In [41]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,-1.0,0,
b,-1.0,0,0.0
c,-1.0,0,-2.0
d,,0,1.0


In [42]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,-1.0,0,
b,-1.0,0,0.0
c,-1.0,0,-2.0
d,,0,1.0


DataFrame.sub(other, axis='columns', level=None, fill_value=None) 

逐元素计算DataFrame与other的差（二元运算符sub）。  
与 `dataframe - other` 等效，但支持用fill_value替换其中一个输入中的缺失数据。其反向版本为rsub。  
它属于算术运算符 +、-、*、/、//、%、** 对应的灵活包装方法（add、sub、mul、div、mod、pow）之一。  

<font color='red'><strong>
axis ：{0 或‘index’, 1 或‘columns’}
是按索引（0 或 'index'）还是按列（1 或 'columns'）进行比较。
对于Series输入，指定Series的索引要与哪个轴匹配。
</strong></font>

level ：int 或 label。沿某一级别进行广播，按传入的MultiIndex级别匹配索引值。

fill_value ：float 或 None, 默认为 None
在计算之前，用这个值填充现有的NaN值，
以及成功的DataFrame对齐所需的任何新元素。
如果两个DataFrame对应位置上的数据都缺失，
则结果仍为缺失值。

In [43]:
dfmi = df.copy()

In [44]:
dfmi

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [45]:
# Replace the flat index with a two-level MultiIndex named ("first", "second").
dfmi.index = pd.MultiIndex.from_tuples(
    [(1, "a"), (2, "b"), (3, "c"), (4, "a")],
    names=["first", "second"],
)

In [46]:
dfmi.index

MultiIndex([(1, 'a'),
            (2, 'b'),
            (3, 'c'),
            (4, 'a')],
           names=['first', 'second'])

In [47]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1.0,2,
2,b,2.0,3,3.0
3,c,3.0,4,2.0
4,a,,4,5.0


In [48]:
column

a    2
b    3
c    4
d    4
Name: two, dtype: int64

In [49]:
dfmi.sub(column, axis=0, level="second")

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,-1.0,0,
2,b,-1.0,0,0.0
3,c,-1.0,0,-2.0
4,a,,2,3.0


In [50]:
s = pd.Series(np.arange(10))

In [51]:
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [52]:
div, rem = divmod(s, 3)

In [53]:
div # quotient of the floor division s // 3 (first item returned by divmod)

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int32

In [54]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int32

In [55]:
idx = pd.Index(np.arange(10))

In [56]:
idx

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [57]:
div, rem = divmod(idx, 3)

In [58]:
div

Int64Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int64')

In [59]:
rem

Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int64')

In [60]:
div, rem = divmod(s, [2,2,3,3,4,4,5,5,6,6])

In [61]:
div

0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    1
8    1
9    1
dtype: int32

In [62]:
rem

0    0
1    1
2    2
3    0
4    0
5    1
6    1
7    2
8    2
9    3
dtype: int32

## Missing data / operations with fill values

In [63]:
df

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [64]:
# Work on an independent copy: a plain `df2 = df` only binds a second name to
# the same object, so any later in-place change to df2 would also alter df.
df2 = df.copy()

In [65]:
df2

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [66]:
df + df2

Unnamed: 0,one,two,three
a,2.0,4,
b,4.0,6,6.0
c,6.0,8,4.0
d,,8,10.0


In [67]:
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,2.0,4,
b,4.0,6,6.0
c,6.0,8,4.0
d,,8,10.0


## Flexible comparisons

Series和DataFrame具有二元比较方法eq、ne、lt、gt、le和ge，其行为类似于上述二元算术运算：

In [68]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [69]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


## Boolean reductions