# Essential basic functionality

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
index = pd.date_range("1/1/2000", periods=8)

In [4]:
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [5]:
index2 = pd.date_range("1/1/2000", periods=8,freq='M')

In [6]:
index2

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-30',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31'],
              dtype='datetime64[ns]', freq='M')

In [7]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

In [8]:
s

a    0.571101
b    0.430728
c   -1.280462
d    0.107723
e    1.306298
dtype: float64

In [9]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])

In [10]:
df

Unnamed: 0,A,B,C
2000-01-01,1.437903,0.577111,-1.064504
2000-01-02,-0.12175,-0.226074,1.113063
2000-01-03,-0.162692,0.251043,0.279486
2000-01-04,0.564978,0.682652,0.061517
2000-01-05,-0.190954,1.755982,-0.648758
2000-01-06,-0.740646,0.638468,1.415802
2000-01-07,-0.751683,-2.431618,1.400235
2000-01-08,1.441184,-1.731543,-0.971492


In [11]:
long_series = pd.Series(np.random.randn(1000))

In [12]:
long_series.head()

0    1.220890
1    0.145313
2   -0.220575
3    0.009275
4   -0.128669
dtype: float64

In [13]:
long_series.tail(3)

997    0.895556
998   -1.494881
999    0.088125
dtype: float64

## Attributes and underlying data

In [14]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,1.437903,0.577111,-1.064504
2000-01-02,-0.12175,-0.226074,1.113063


In [15]:
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [16]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [17]:
for x in df.columns:
    print(x.lower())

a
b
c


In [18]:
df.columns = [x.lower() for x in df.columns ]

In [19]:
df.columns

Index(['a', 'b', 'c'], dtype='object')

In [20]:
df.columns = [x.upper() for x in df.columns ]

In [21]:
df

Unnamed: 0,A,B,C
2000-01-01,1.437903,0.577111,-1.064504
2000-01-02,-0.12175,-0.226074,1.113063
2000-01-03,-0.162692,0.251043,0.279486
2000-01-04,0.564978,0.682652,0.061517
2000-01-05,-0.190954,1.755982,-0.648758
2000-01-06,-0.740646,0.638468,1.415802
2000-01-07,-0.751683,-2.431618,1.400235
2000-01-08,1.441184,-1.731543,-0.971492


In [22]:
s.array

<PandasArray>
[ 0.5711014336533851, 0.43072752993809654, -1.2804617737671526,
 0.10772275816598725,  1.3062978832891734]
Length: 5, dtype: float64

In [23]:
s.index.array

<PandasArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [24]:
s.to_numpy()

array([ 0.57110143,  0.43072753, -1.28046177,  0.10772276,  1.30629788])

In [25]:
np.asarray(s)

array([ 0.57110143,  0.43072753, -1.28046177,  0.10772276,  1.30629788])

In [26]:
ser = pd.Series(pd.date_range("2000", periods=2, tz="CET"))

In [27]:
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
      dtype=object)

In [28]:
ser

0   2000-01-01 00:00:00+01:00
1   2000-01-02 00:00:00+01:00
dtype: datetime64[ns, CET]

In [29]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [30]:
df.to_numpy()

array([[ 1.43790266,  0.57711058, -1.06450394],
       [-0.12175047, -0.22607352,  1.11306264],
       [-0.16269224,  0.25104276,  0.27948557],
       [ 0.56497811,  0.68265227,  0.06151682],
       [-0.19095411,  1.75598208, -0.64875768],
       [-0.74064568,  0.63846826,  1.41580176],
       [-0.75168349, -2.43161808,  1.40023463],
       [ 1.44118439, -1.73154343, -0.97149171]])

## Accelerated operations

pandas通过numexpr和bottleneck两种类库来pandas的某些二进制和布尔类型

In [31]:
pd.set_option("compute.use_bottleneck",False)

In [32]:
pd.set_option("compute.use_numexpr",False)

## Matching / broadcasting behavior

DataFrame有add（）、sub（）、mul（）、div（）方法和相关函数radd（）、rsub（）、…   
用于执行二进制操作。对于广播行为，序列输入是最重要的。   
使用这些函数，您可以使用来通过axis关键字匹配索引或列：

In [33]:
df = pd.DataFrame({"one":pd.Series([1,2,3], index=["a","b","c"]),"two":pd.Series([2,3,4,4], index=["a","b","c","d"]),"three":pd.Series([3,2,5], index=["b","c","d"]),})

In [34]:
df

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [35]:
row = df.iloc[1] #取第二行

In [36]:
row

one      2.0
two      3.0
three    3.0
Name: b, dtype: float64

In [37]:
column = df["two"] #取第二列

In [38]:
column

a    2
b    3
c    4
d    4
Name: two, dtype: int64

In [39]:
df.sub(row, axis="columns") #用df逐行减去row,如df的第一行a 1.0 2 NaN 减去row的2.0 3.0 3.0

Unnamed: 0,one,two,three
a,-1.0,-1.0,
b,0.0,0.0,0.0
c,1.0,1.0,-1.0
d,,1.0,2.0


In [40]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,-1.0,-1.0,
b,0.0,0.0,0.0
c,1.0,1.0,-1.0
d,,1.0,2.0


In [41]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,-1.0,0,
b,-1.0,0,0.0
c,-1.0,0,-2.0
d,,0,1.0


In [42]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,-1.0,0,
b,-1.0,0,0.0
c,-1.0,0,-2.0
d,,0,1.0


DataFrame.sub(other, axis='columns', level=None, fill_value=None) 

获取DataFrame和其他元素的减法（二进制运算符sub）。  
与等效，但支持用fill_value替换输入之一中的丢失数据。rsub是反向版本。dataframe - other  
在灵活的包装器（add，sub，mul，div，mod，pow）中使用算术运算符：+，-，*，/，//，％，**。  

<font color='red'><strong>
axis ：{0 或‘index’, 1 或‘columns’}
是按索引(0或' index ')还是按列(1或' columns ')进行比较。
对于Series输入，轴匹配Series索引。
</strong></font>

level ：int 或 label 跨级别广播，匹配通过的多索引级别上的索引值。

fill_value ：float 或 None, 默认为 None
在计算之前，用这个值填充现有的NaN值，
以及成功的DataFrame对齐所需的任何新元素。
如果两个对应的DataFrame位置中的数据丢失，
则结果将丢失。

In [43]:
dfmi = df.copy()

In [44]:
dfmi

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [45]:
dfmi.index = pd.MultiIndex.from_tuples([(1,"a"),(2,"b"),(3,"c"),(4,"a")],names=["first", "second"])

In [46]:
dfmi.index

MultiIndex([(1, 'a'),
            (2, 'b'),
            (3, 'c'),
            (4, 'a')],
           names=['first', 'second'])

In [47]:
dfmi

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1.0,2,
2,b,2.0,3,3.0
3,c,3.0,4,2.0
4,a,,4,5.0


In [48]:
column

a    2
b    3
c    4
d    4
Name: two, dtype: int64

In [49]:
dfmi.sub(column, axis=0, level="second")

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,-1.0,0,
2,b,-1.0,0,0.0
3,c,-1.0,0,-2.0
4,a,,2,3.0


In [50]:
s = pd.Series(np.arange(10))

In [51]:
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [52]:
div, rem = divmod(s, 3)

In [53]:
div # index value

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int32

In [54]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int32

In [55]:
idx = pd.Index(np.arange(10))

In [56]:
idx

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [57]:
div, rem = divmod(idx, 3)

In [58]:
div

Int64Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int64')

In [59]:
rem

Int64Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int64')

In [60]:
div, rem = divmod(s, [2,2,3,3,4,4,5,5,6,6])

In [61]:
div

0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    1
8    1
9    1
dtype: int32

In [62]:
rem

0    0
1    1
2    2
3    0
4    0
5    1
6    1
7    2
8    2
9    3
dtype: int32

## Missing data / operations with fill values

In [63]:
df

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [64]:
df2 = df

In [65]:
df2

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [66]:
df + df2

Unnamed: 0,one,two,three
a,2.0,4,
b,4.0,6,6.0
c,6.0,8,4.0
d,,8,10.0


In [67]:
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,2.0,4,
b,4.0,6,6.0
c,6.0,8,4.0
d,,8,10.0


## Flexible comparisons

Series和DataFrame具有二进制比较方法eq、ne、lt、gt、le和ge，其行为类似于上述二进制算术运算：

In [68]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [69]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


## Boolean reductions

你可以用empty、any（）、all（）和bool（）来提供汇总布尔结果

In [70]:
(df > 0).all()

one      False
two       True
three    False
dtype: bool

In [71]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [72]:
(df > 0).any().any()

True

In [73]:
df.empty

False

In [74]:
pd.DataFrame(columns=list("ABC")).empty

True

In [75]:
pd.Series([True]).bool()

True

In [76]:
pd.Series([False]).bool()

False

In [77]:
pd.DataFrame([True]).bool()

True

In [78]:
pd.DataFrame([False]).bool()

False

## Comparing if objects are equivalent

In [79]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [80]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [81]:
np.nan == np.nan

False

In [82]:
(df + df).equals(df * 2)

True

In [83]:
df1 = pd.DataFrame({"col":["foo", 0 , np.nan]})

In [84]:
df2 = pd.DataFrame({"col":[np.nan, 0, "foo"]}, index=[2, 1, 0])

In [85]:
df1.equals(df2)

False

In [86]:
df1.equals(df2.sort_index())

True

## Comparing array-like objects

In [87]:
pd.Series(["foo","bar","baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [88]:
pd.Index(["foo","bar","baz"]) == "foo"

array([ True, False, False])

In [89]:
pd.Series(["foo","bar","baz"]) == pd.Index(["foo","bar","qux"])

0     True
1     True
2    False
dtype: bool

In [90]:
pd.Series(["foo","bar","baz"]) == np.array(["foo","bar","qux"])

0     True
1     True
2    False
dtype: bool

In [91]:
# pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar'])
## 报错! ValueError: Can only compare identically-labeled Series objects

In [92]:
# pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo'])
## 报错! ValueError: Can only compare identically-labeled Series objects

In [93]:
np.array([1, 2, 3]) == np.array([2])

array([False,  True, False])

In [94]:
np.array([1, 2, 3]) == np.array([1, 2])

  """Entry point for launching an IPython kernel.


False

## Combining overlapping data sets

In [95]:
df1 = pd.DataFrame({"A":[1.0, np.nan, 3.0, 5.0, np.nan], "B":[np.nan, 2.0, 3.0, np.nan, 6.0]})

In [96]:
df2 = pd.DataFrame({"A":[5.0, 2.0, 4.0, np.nan,3.0, 7.0], "B":[np.nan, np.nan, 3.0, 4.0, 6.0, 8.0]})

In [97]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [98]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [99]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


## Descriptive statistics

In [100]:
df

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [101]:
df.mean(0)

one      2.000000
two      3.250000
three    3.333333
dtype: float64

In [102]:
df.mean(1)

a    1.500000
b    2.666667
c    3.000000
d    4.500000
dtype: float64

In [103]:
df.sum(0, skipna=False)

one       NaN
two      13.0
three     NaN
dtype: float64

In [104]:
df.sum(axis=1,skipna=True)

a    3.0
b    8.0
c    9.0
d    9.0
dtype: float64

In [105]:
ts_stand = (df - df.mean()) / df.std()

In [106]:
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

In [107]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

In [108]:
xs_stand.std(1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

In [109]:
df.cumsum()

Unnamed: 0,one,two,three
a,1.0,2,
b,3.0,5,3.0
c,6.0,9,5.0
d,,13,10.0


In [110]:
np.mean(df["one"])

2.0

In [111]:
np.mean(df["one"].to_numpy())

nan

In [112]:
series = pd.Series(np.random.randn(50))

In [113]:
series[20:500] = np.nan

In [114]:
series[10:20] = 5

In [115]:
series.nunique()

11

## General DataFrame combin

In [116]:
def combiner(x, y):
    return np.where(pd.isna(x), y, x)

In [117]:
df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


## Descriptive statistics

In [118]:
df

Unnamed: 0,one,two,three
a,1.0,2,
b,2.0,3,3.0
c,3.0,4,2.0
d,,4,5.0


In [119]:
df.mean(0)

one      2.000000
two      3.250000
three    3.333333
dtype: float64

In [120]:
df.mean(1)

a    1.500000
b    2.666667
c    3.000000
d    4.500000
dtype: float64

In [121]:
df.sum(0, skipna=False)

one       NaN
two      13.0
three     NaN
dtype: float64

In [122]:
df.sum(axis=1,skipna=True)

a    3.0
b    8.0
c    9.0
d    9.0
dtype: float64

In [123]:
ts_stand = (df - df.mean() / df.std())

In [124]:
ts_stand.std()

one      1.000000
two      0.957427
three    1.527525
dtype: float64

In [125]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

In [126]:
xs_stand.std(1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

In [127]:
df.cumsum()

Unnamed: 0,one,two,three
a,1.0,2,
b,3.0,5,3.0
c,6.0,9,5.0
d,,13,10.0


In [128]:
np.mean(df["one"])

2.0

In [129]:
np.mean(df["one"].to_numpy())

nan

In [130]:
series = pd.Series(np.random.randn(500))

In [131]:
series[200:500] = np.nan

In [132]:
series[10:20] = 5

In [133]:
series.nunique()

191

|  函数名   | 作用说明  |
|  ----  | ----  |
| count | Number of non-NA observations |
| sum | Sum of values |
| mean | Mean of values |
| mad | Mean absolute deviation |
| median | Arithmetic median of values |
| min | Minimum |
| max | Maximum |
| mode | Mode |
| abs | Absolute Value |
| prod | Product of values |
| std | Bessel-corrected sample standard deviation |
| var | Unbiased variance |
| sem | Standard error of the mean |
| skew | Sample skewness (3rd moment) |
| kurt | Sample kurtosis (4th moment) |
| quantile | Sample quantile (value at %) |
| cumsum | Cumulative sum |
| cumprod | Cumulative product |
| cummax | Cumulative maximum |
| cummin | Cumulative minimum |

## Summarizing data: describe

describe()函数用于计算各种汇总统计信息，关于Series和columns的（当然Na的除外）

In [134]:
series = pd.Series(np.random.randn(1000))

In [135]:
series[::2] = np.nan

In [136]:
series.describe()

count    500.000000
mean       0.038951
std        0.991130
min       -3.158741
25%       -0.540140
50%        0.067559
75%        0.612740
max        4.132080
dtype: float64

In [137]:
frame = pd.DataFrame(np.random.randn(1000,5), columns=["a","b","c","d","e"])

In [138]:
frame.iloc[::2] = np.nan

In [139]:
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.016224,-0.032531,0.074322,-0.061401,-0.028249
std,1.038145,0.949242,0.99367,1.014017,0.974145
min,-2.891179,-2.963364,-2.899155,-2.548994,-2.964372
25%,-0.612858,-0.691827,-0.546612,-0.700181,-0.686367
50%,0.043746,-0.045421,0.068641,-0.012578,-0.073779
75%,0.694287,0.601172,0.678793,0.606827,0.555245
max,3.022273,2.722592,2.857138,2.805116,2.992425


In [140]:
series.describe(percentiles=[0.05,0.25,0.75,0.95])

count    500.000000
mean       0.038951
std        0.991130
min       -3.158741
5%        -1.667007
25%       -0.540140
50%        0.067559
75%        0.612740
95%        1.683303
max        4.132080
dtype: float64

In [141]:
s = pd.Series(["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"])

In [142]:
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [143]:
frame = pd.DataFrame({"a":["Yes","Yes","No","No"],"b":range(4)})

In [144]:
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [145]:
frame.describe(include=["object"])

Unnamed: 0,a
count,4
unique,2
top,No
freq,2


In [146]:
frame.describe(include=["number"])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [147]:
frame.describe(include="all")

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,No,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25
