In [None]:
"""
Pandas两大数据结构:
Series: 类似一维数组的对象,由数据和索引组成;未显式指定索引时,会自动创建从0开始的整数索引
DataFrame: 表格型数据结构(如excel),每列数据可以是不同类型,索引包括列索引和行索引

区别：NumPy的ndarray要求所有元素为同一种dtype,主要面向数值计算;Pandas在NumPy之上增加了带标签的索引,并且DataFrame的每一列可以是不同类型,更适合处理表格型和混合类型的数据
"""

In [2]:
import numpy as np
import pandas as pd

In [79]:
# 1. Build a Series from a plain sequence without passing an index:
#    pandas assigns the default auto-incrementing integer index (0, 1, 2, ...).
ser = pd.Series(data=range(1, 5))

In [80]:
ser

0    1
1    2
2    3
3    4
dtype: int64

In [81]:
type(ser), ser.dtypes  # Python type of the object itself, then the dtype of its elements

(pandas.core.series.Series, dtype('int64'))

In [82]:
ser.index, ser.values, ser[3]  # the index, the underlying values array, and the element whose index label is 3

(RangeIndex(start=0, stop=4, step=1), array([1, 2, 3, 4], dtype=int64), 4)

In [90]:
# head()/tail() return the first/last 5 rows by default; pass a number to change how many.
# NOTE(review): the output recorded below is a 3-element float Series labelled a/b/c,
# not the integer Series built above - this cell was presumably run out of order
# (execution count 90); re-run the notebook top to bottom to refresh it.
ser.head(), ser.tail(), ser.head(2)

(a    11.1
 b    22.2
 c    33.3
 dtype: float64, a    11.1
 b    22.2
 c    33.3
 dtype: float64, a    11.1
 b    22.2
 dtype: float64)

In [101]:
# 2. Build a Series from a sequence and supply explicit row labels through `index`.
ser = pd.Series(range(10, 15), index=list("abcde"))

In [102]:
ser

a    10
b    11
c    12
d    13
e    14
dtype: int64

In [103]:
ser.index, ser.values  # the explicit string labels and the underlying values array

(Index(['a', 'b', 'c', 'd', 'e'], dtype='object'),
 array([10, 11, 12, 13, 14], dtype=int64))

In [104]:
# A Series can be addressed by position or by label. With a non-integer index,
# spell out which one is meant: .iloc for positions, .loc for labels.
# The bare form ser[2] only works through pandas' positional fallback for integer
# keys, which is deprecated in recent pandas releases.
ser.iloc[2], ser.loc["c"]

(12, 12)

In [105]:
# Slicing: a positional slice excludes its end position (1:3 -> b, c), whereas
# a label slice includes its end label ("b":"d" -> b, c, d).
ser[1:3], ser["b":"d"]

(b    11
 c    12
 dtype: int64, b    11
 c    12
 d    13
 dtype: int64)

In [108]:
# Non-contiguous selection: pass a list of positions to .iloc or a list of labels to .loc.
# An integer list in plain ser[...] on a string-labelled Series relies on the
# deprecated positional fallback, so the positional intent is made explicit here.
ser.iloc[[0, 2]], ser.loc[["a", "c"]]

(a    10
 c    12
 dtype: int64, a    10
 c    12
 dtype: int64)

In [131]:
# Two integer Series of different lengths (5 and 3 elements), both on the default
# integer index, so some labels exist in only one of them.
s1 = pd.Series(data=range(10, 15))
s2 = pd.Series(data=range(20, 23))

In [132]:
s1, s2

(0    10
 1    11
 2    12
 3    13
 4    14
 dtype: int64, 0    20
 1    21
 2    22
 dtype: int64)

In [133]:
s1.add(s2)  # element-wise addition aligned on the index; labels present in only one operand yield NaN

0    30.0
1    32.0
2    34.0
3     NaN
4     NaN
dtype: float64

In [134]:
s1.add(s2, fill_value=0.)  # same addition, but a value missing from one operand is replaced by 0. before adding

0    30.0
1    32.0
2    34.0
3    13.0
4    14.0
dtype: float64

In [84]:
# Build a Series from a dict: the keys become the index labels, the values the data.
ser = pd.Series(dict(a=11.1, b=22.2, c=33.3))

In [85]:
ser

a    11.1
b    22.2
c    33.3
dtype: float64

In [86]:
type(ser), ser.dtypes  # Python type of the object itself, then the dtype of its elements

(pandas.core.series.Series, dtype('float64'))

In [87]:
# The index, the underlying values array, and the element at position 1.
# The index holds strings here, so the positional lookup is written with .iloc;
# ser[1] would depend on the deprecated positional fallback for integer keys.
ser.index, ser.values, ser.iloc[1]

(Index(['a', 'b', 'c'], dtype='object'), array([11.1, 22.2, 33.3]), 22.2)

In [88]:
ser.head(2)  # first 2 rows

a    11.1
b    22.2
dtype: float64

In [89]:
ser.name, ser.index.name  # name of the Series and name of its index (both None here, as neither was set)

(None, None)

In [5]:
# 1. Build a DataFrame from a 3x4 ndarray of random floats without passing row or
#    column labels; both axes get the default auto-incrementing integer index.
#    NOTE(review): no random seed is set in the visible cells, so the values
#    differ on every run.
df = pd.DataFrame(np.random.rand(3, 4))

In [6]:
df

Unnamed: 0,0,1,2,3
0,0.816031,0.666818,0.886574,0.666571
1,0.318502,0.127803,0.249043,0.853823
2,0.597471,0.792534,0.919694,0.322454


In [7]:
type(df), df.dtypes  # Python type of the object, then the dtype of each column

(pandas.core.frame.DataFrame, 0    float64
 1    float64
 2    float64
 3    float64
 dtype: object)

In [8]:
df.index, df.columns, df.values  # row index, column index, and the values as a 2-D ndarray

(RangeIndex(start=0, stop=3, step=1),
 RangeIndex(start=0, stop=4, step=1),
 array([[0.81603146, 0.66681766, 0.88657357, 0.66657145],
        [0.31850164, 0.12780307, 0.24904269, 0.85382271],
        [0.59747093, 0.792534  , 0.91969362, 0.32245407]]))

In [9]:
df.info()  # summary of the frame: index range, per-column non-null count and dtype, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
0    3 non-null float64
1    3 non-null float64
2    3 non-null float64
3    3 non-null float64
dtypes: float64(4)
memory usage: 176.0 bytes


In [10]:
df.head(2)  # first 2 rows

Unnamed: 0,0,1,2,3
0,0.816031,0.666818,0.886574,0.666571
1,0.318502,0.127803,0.249043,0.853823


In [11]:
df[2]  # plain [...] on a DataFrame selects by column label: here the column labelled 2

0    0.886574
1    0.249043
2    0.919694
Name: 2, dtype: float64

In [12]:
print("=" * 100)



In [27]:
# 2. Build a DataFrame from an ndarray with explicit row labels (index) and column labels (columns).
df = pd.DataFrame(np.random.rand(3, 4), index=["A", "B", "C"], columns=["a", "b", "c", "d"])

In [28]:
df

Unnamed: 0,a,b,c,d
A,0.69869,0.788412,0.612029,0.167614
B,0.370737,0.137027,0.75986,0.472538
C,0.713939,0.421626,0.129105,0.232239


In [29]:
df.index, df.columns, df.values

(Index(['A', 'B', 'C'], dtype='object'),
 Index(['a', 'b', 'c', 'd'], dtype='object'),
 array([[0.6986899 , 0.78841245, 0.61202938, 0.16761448],
        [0.37073698, 0.13702741, 0.75985993, 0.47253753],
        [0.71393935, 0.42162566, 0.12910479, 0.23223859]]))

In [30]:
# Column selection: df[...] takes a column label, not a position. A positional
# integer such as 1 raises KeyError here because no column carries that label.
df["c"]

A    0.612029
B    0.759860
C    0.129105
Name: c, dtype: float64

In [31]:
df["c"].values

array([0.61202938, 0.75985993, 0.12910479])

In [32]:
# Contiguous (slice) selection over rows and columns goes through .loc (labels) or
# .iloc (positions); plain df[...] cannot express a row-and-column slice.
# .loc slices are label-based and include both end labels.
df.loc["A": "B", "c": "d"]

Unnamed: 0,c,d
A,0.612029,0.167614
B,0.75986,0.472538


In [33]:
df.iloc[0:2, 2:4]  # .loc/.iloc take a row selector, then a column selector; .iloc slices are positional and exclude the end

Unnamed: 0,c,d
A,0.612029,0.167614
B,0.75986,0.472538


In [35]:
df[["a", "c"]]  # non-contiguous selection: a list of column labels

Unnamed: 0,a,c
A,0.69869,0.612029
B,0.370737,0.75986
C,0.713939,0.129105


In [135]:
# Create two DataFrames of different shapes (2x3 and 3x4), both on default integer labels.
df1 = pd.DataFrame(np.random.rand(2, 3))
df2 = pd.DataFrame(np.random.rand(3, 4))

In [137]:
df1

Unnamed: 0,0,1,2
0,0.361629,0.605671,0.98393
1,0.042402,0.770929,0.886052


In [138]:
df2

Unnamed: 0,0,1,2,3
0,0.491429,0.923294,0.19436,0.610488
1,0.840963,0.311186,0.472345,0.81492
2,0.47672,0.856272,0.369501,0.8188


In [139]:
df1.add(df2)  # element-wise addition aligned on row and column labels; cells missing from either frame become NaN

Unnamed: 0,0,1,2,3
0,0.853058,1.528965,1.17829,
1,0.883365,1.082115,1.358397,
2,,,,


In [140]:
df1.add(df2, fill_value=0.)  # same addition, but a cell missing from one frame is treated as 0. before adding

Unnamed: 0,0,1,2,3
0,0.853058,1.528965,1.17829,0.610488
1,0.883365,1.082115,1.358397,0.81492
2,0.47672,0.856272,0.369501,0.8188


In [15]:
# 2. Build a DataFrame from a dict: each key becomes a column label. Scalars are
#    repeated down the 4 rows; sequences supply one value per row.
df = pd.DataFrame(dict(
    A=1.0,                             # float scalar
    B=pd.Timestamp("20170625"),        # Timestamp scalar
    C=pd.Series(range(10, 14)),        # Series
    D=["python", "C", "C++", "Java"],  # Python list
    E=np.array([10] * 4),              # ndarray
    F="orc",                           # str scalar
))

In [16]:
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2017-06-25,10,python,10,orc
1,1.0,2017-06-25,11,C,10,orc
2,1.0,2017-06-25,12,C++,10,orc
3,1.0,2017-06-25,13,Java,10,orc


In [17]:
df["D"]  # select a single column by label

0    python
1         C
2       C++
3      Java
Name: D, dtype: object

In [18]:
# Look up one cell: row label 3 of column "D". A single .loc[row, column] lookup
# replaces the chained form df["D"][3], which selects the whole column first and
# then indexes into it.
df.loc[3, "D"]

'Java'

In [19]:
df["G"] = df["C"] * 2  # add a new column computed from an existing one

In [20]:
df

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2017-06-25,10,python,10,orc,20
1,1.0,2017-06-25,11,C,10,orc,22
2,1.0,2017-06-25,12,C++,10,orc,24
3,1.0,2017-06-25,13,Java,10,orc,26


In [21]:
# Remove column "F" from df in place. Re-running this cell raises KeyError,
# because the column is already gone.
del df["F"]

In [22]:
df

Unnamed: 0,A,B,C,D,E,G
0,1.0,2017-06-25,10,python,10,20
1,1.0,2017-06-25,11,C,10,22
2,1.0,2017-06-25,12,C++,10,24
3,1.0,2017-06-25,13,Java,10,26


In [141]:
# Create an integer Series (10..14) on the default index for the statistics examples.
ser = pd.Series(data=range(10, 15))

In [142]:
ser

0    10
1    11
2    12
3    13
4    14
dtype: int64

In [157]:
np.sum(ser), ser.sum()  # sum of all elements, via the NumPy function and via the Series method

(60, 60)

In [158]:
ser.describe()  # descriptive statistics: count, mean, std, min, quartiles, max

count     5.000000
mean     12.000000
std       1.581139
min      10.000000
25%      11.000000
50%      12.000000
75%      13.000000
max      14.000000
dtype: float64

In [3]:
print("=" * 100)



In [144]:
# Create a 3x4 DataFrame of random floats for the statistics examples.
df = pd.DataFrame(np.random.rand(3, 4))

In [145]:
df

Unnamed: 0,0,1,2,3
0,0.589701,0.588823,0.695835,0.29822
1,0.186835,0.90995,0.170842,0.434151
2,0.34648,0.809867,0.034011,0.726161


In [154]:
np.sum(df), df.sum()  # both default to axis=0: one sum per column

(0    1.123016
 1    2.308640
 2    0.900688
 3    1.458532
 dtype: float64, 0    1.123016
 1    2.308640
 2    0.900688
 3    1.458532
 dtype: float64)

In [155]:
np.sum(df, axis=1), df.sum(axis=1, skipna=False)  # axis=1 gives one sum per row; skipna controls whether missing values are excluded (default True)

(0    2.172579
 1    1.701779
 2    1.916518
 dtype: float64, 0    2.172579
 1    1.701779
 2    1.916518
 dtype: float64)

In [159]:
df.describe()  # descriptive statistics computed per column

Unnamed: 0,0,1,2,3
count,3.0,3.0,3.0,3.0
mean,0.374339,0.769547,0.300229,0.486177
std,0.202873,0.164317,0.349369,0.218663
min,0.186835,0.588823,0.034011,0.29822
25%,0.266657,0.699345,0.102427,0.366186
50%,0.34648,0.809867,0.170842,0.434151
75%,0.468091,0.859909,0.433339,0.580156
max,0.589701,0.90995,0.695835,0.726161


In [161]:
# df.apply(func) calls func once per column (or once per row with axis=1); func may
# be a built-in or a user-defined function.
df.max(), df.apply(lambda x: x.max())  # maximum of each column, computed two ways

(0    0.589701
 1    0.909950
 2    0.695835
 3    0.726161
 dtype: float64, 0    0.589701
 1    0.909950
 2    0.695835
 3    0.726161
 dtype: float64)

In [162]:
df.max(axis=1), df.apply(lambda x: x.max(), axis=1)  # maximum of each row, computed two ways

(0    0.695835
 1    0.909950
 2    0.809867
 dtype: float64, 0    0.695835
 1    0.909950
 2    0.809867
 dtype: float64)

In [163]:
# df.applymap(func) calls func on every individual element.
# NOTE(review): the "%.2f" format turns each element into a str, so the result is a
# frame of strings rather than rounded floats. Recent pandas releases also deprecate
# applymap in favour of DataFrame.map - confirm against the pandas version in use.
df.applymap(lambda x: "%.2f" % x)  # format each element with two decimal places

Unnamed: 0,0,1,2,3
0,0.59,0.59,0.7,0.3
1,0.19,0.91,0.17,0.43
2,0.35,0.81,0.03,0.73


In [167]:
# 1. Create a Series whose index labels are 5 random integers drawn from [1, 20),
#    giving the sorting examples an unordered index to work on (labels may repeat).
#    The array is passed to `index` directly: wrapping it in a list, as in
#    index=[array], makes pandas treat it as a list of level arrays and build a
#    one-level MultiIndex instead of a flat index.
ser = pd.Series(range(10, 15), index=np.random.randint(low=1, high=20, size=5))

In [168]:
ser

14    10
15    11
19    12
10    13
18    14
dtype: int64

In [175]:
ser.sort_index(), ser.sort_index(ascending=False)  # sort by index label: ascending by default, then descending

(10    13
 14    10
 15    11
 18    14
 19    12
 dtype: int64, 19    12
 18    14
 15    11
 14    10
 10    13
 dtype: int64)

In [176]:
ser.sort_values(), ser.sort_values(ascending=False)  # sort by value: ascending by default, then descending

(14    10
 15    11
 19    12
 10    13
 18    14
 dtype: int64, 18    14
 10    13
 19    12
 15    11
 14    10
 dtype: int64)

In [1]:
print("=" * 100)



In [3]:
# 2. Create a 3x4 DataFrame of random floats with row labels A-C and column labels a-d.
df = pd.DataFrame(np.random.rand(3, 4), index=["A", "B", "C"], columns=["a", "b", "c", "d"])

In [4]:
df

Unnamed: 0,a,b,c,d
A,0.960296,0.252056,0.573528,0.206489
B,0.534834,0.669807,0.070086,0.267235
C,0.490745,0.531456,0.937128,0.945812


In [7]:
df.sort_index()   # sort by index: the default axis=0 sorts the row labels, ascending

Unnamed: 0,a,b,c,d
A,0.960296,0.252056,0.573528,0.206489
B,0.534834,0.669807,0.070086,0.267235
C,0.490745,0.531456,0.937128,0.945812


In [8]:
df.sort_index(axis=1, ascending=False)  # axis=1 sorts the column labels instead (descending here)

Unnamed: 0,d,c,b,a
A,0.206489,0.573528,0.252056,0.960296
B,0.267235,0.070086,0.669807,0.534834
C,0.945812,0.937128,0.531456,0.490745


In [13]:
df.sort_values(by="c")  # sort rows by the values in column "c"; `by` names a column here, ascending by default

Unnamed: 0,a,b,c,d
B,0.534834,0.669807,0.070086,0.267235
A,0.960296,0.252056,0.573528,0.206489
C,0.490745,0.531456,0.937128,0.945812


In [15]:
df.sort_values(by="c", ascending=False)  # same sort on column "c", descending

Unnamed: 0,a,b,c,d
C,0.490745,0.531456,0.937128,0.945812
A,0.960296,0.252056,0.573528,0.206489
B,0.534834,0.669807,0.070086,0.267235


In [26]:
# Create a DataFrame with missing values: one row of random normals and two rows that contain NaN.
df = pd.DataFrame([np.random.randn(4), [10., np.nan, 20., np.nan], [30., np.nan, np.nan, 40.]])

In [27]:
df

Unnamed: 0,0,1,2,3
0,-0.378872,-1.269862,1.424187,-0.076506
1,10.0,,20.0,
2,30.0,,,40.0


In [28]:
df.isna()  # boolean mask: True where a value is missing

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,True,False,True
2,False,True,True,False


In [29]:
df.dropna()  # drop rows/columns with missing data (defaults: axis=0 works on rows, how="any" drops a row as soon as it holds one NaN)

Unnamed: 0,0,1,2,3
0,-0.378872,-1.269862,1.424187,-0.076506


In [34]:
df.dropna(axis=1, how="all")  # how="all" drops a column (axis=1) only when every value in it is NaN; no column qualifies here

Unnamed: 0,0,1,2,3
0,-0.378872,-1.269862,1.424187,-0.076506
1,10.0,,20.0,
2,30.0,,,40.0


In [33]:
df.fillna(50.)  # replace every missing value with 50.

Unnamed: 0,0,1,2,3
0,-0.378872,-1.269862,1.424187,-0.076506
1,10.0,50.0,20.0,50.0
2,30.0,50.0,50.0,40.0


In [16]:
# Create a Series with a two-level (hierarchical) index: the outer level holds the
# letters, the inner level holds the numbers.
ser = pd.Series(
    range(10, 15),
    index=pd.MultiIndex.from_arrays([list("aabcc"), [10, 20, 30, 10, 20]]),
)

In [17]:
ser

a  10    10
   20    11
b  30    12
c  10    13
   20    14
dtype: int64

In [18]:
type(ser.index)  # inspect the index type (a MultiIndex)

pandas.core.indexes.multi.MultiIndex

In [19]:
ser.index  # levels: the distinct labels available at each level; labels: for each position, the integer position of its label within that level

MultiIndex(levels=[['a', 'b', 'c'], [10, 20, 30]],
           labels=[[0, 0, 1, 2, 2], [0, 1, 2, 0, 1]])

In [20]:
ser["b"]  # select by outer-level label

30    12
dtype: int64

In [21]:
ser["a", 10]  # the value whose outer label is "a" and inner label is 10

10

In [22]:
ser[:, 20]  # every outer label, restricted to inner label 20

a    11
c    14
dtype: int64

In [23]:
ser.swaplevel()  # swap index levels (0 = outermost, 1 = next, ...); with only two levels no arguments are needed - outer and inner are exchanged

10  a    10
20  a    11
30  b    12
10  c    13
20  c    14
dtype: int64

In [24]:
ser.sort_index()  # sort by the hierarchical index, outermost level first; pass level=N (0 = outermost, 1 = next, ...) to sort on one level

a  10    10
   20    11
b  30    12
c  10    13
   20    14
dtype: int64

In [25]:
ser.sort_index(level=1)  # sort on the inner level

a  10    10
c  10    13
a  20    11
c  20    14
b  30    12
dtype: int64

In [26]:
ser.swaplevel().sort_index()  # swap the levels first, then sort by the resulting index

10  a    10
    c    13
20  a    11
    c    14
30  b    12
dtype: int64

In [18]:
# unstack(): reshape a Series with a multi-level index into a DataFrame.
ser.unstack()  # default level=-1 moves the innermost level to the columns; combinations with no value become NaN

Unnamed: 0,10,20,30
a,10.0,11.0,
b,,,12.0
c,13.0,14.0,


In [19]:
ser.unstack(level=0)  # level=0 moves the outer level to the columns instead

Unnamed: 0,a,b,c
10,10.0,,13.0
20,11.0,,14.0
30,,12.0,


In [20]:
# stack(): reshape a DataFrame back into a Series with a multi-level index. NaN cells
# are dropped by default, so the original index comes back (values are float64 now).
ser.unstack().stack()

a  10    10.0
   20    11.0
b  30    12.0
c  10    13.0
   20    14.0
dtype: float64