In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

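# A DataFrame is a tabular data structure holding an ordered collection of columns; each column can have a different value type (numeric, string, boolean, ...).
# A DataFrame has both a row index and a column index, and can be thought of as a dict of Series that all share the same index.
# Its data is stored as one or more two-dimensional blocks (rather than as a list, a dict, or some other collection of one-dimensional structures).
# Although a DataFrame is physically two-dimensional, hierarchical indexing lets it represent higher-dimensional data in tabular form.
# Such hierarchically indexed tables are a key ingredient of many of pandas' advanced data-handling features.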

In [2]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a column.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [3]:
# Like `head` in a shell: show only the first rows (5 is the default, spelled out here).
frame.head(n=5)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [4]:
# Passing `columns` fixes the order in which the columns are laid out.
pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop'],
)

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [5]:
# 'debt' is not a key of `data`, so that column is created and filled with NaN.
frame2 = pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop', 'debt'],
    index='one two three four five six'.split(),
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [6]:
# Retrieve a single column as a Series; `frame2['state']` and `frame2.state` select the same column.
frame2.loc[:, 'state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [7]:
# Fetch the third row; note that .loc looks rows up by the DataFrame's index label.
frame2.loc['three', :]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [8]:
# Assigning a scalar to a column sets that value on every row.
frame2['debt'] = 16.5
# Assigning an array requires its length to match the number of rows, so the
# array is sized from the frame instead of hard-coding the row count.
frame2['debt'] = np.arange(len(frame2), dtype=float)
# A cell renders only its final expression, so the frame is displayed once,
# after both assignments.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [9]:
# Assigning a Series to a column aligns it to the frame's index by label.
# Rows whose label is absent from the Series (one, three, six) become NaN,
# so the whole column is replaced rather than only the matching rows.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [10]:
# Assigning to a column name that does not exist yet creates the column;
# here it holds a boolean flag marking the rows whose state is 'Ohio'.
frame2['eastern'] = frame2['state'].eq('Ohio')
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [11]:
# Remove the column in place (same effect as `del frame2['eastern']`),
# then list the columns that remain.
frame2.pop('eastern')
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [12]:
# Nested dict -> DataFrame: outer keys become the columns, inner keys the row labels.
# A label missing from one inner dict (2000 for Nevada) shows up as NaN.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
frame3 = pd.DataFrame(data=pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [13]:
# Transpose the DataFrame (swap rows and columns); `frame3.T` is the shorthand for this call.
frame3.transpose()

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [14]:
# With an explicit `index`, only the listed row labels are used;
# a label with no data in the nested dict (2003) yields a row of NaN.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
pd.DataFrame(data=pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [15]:
# Build a DataFrame from a dict of Series.
# frame3's index holds integer labels, so the slices are taken with .iloc to make
# it explicit that they are positional (all but the last row / the first two rows)
# and not label lookups.
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],
    'Nevada': frame3['Nevada'].iloc[:2],
}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [16]:
# Set the `name` attribute of the row index and of the column index.
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [17]:
# The frame's data as a two-dimensional NumPy array (what the `values` attribute returns).
frame3.to_numpy()

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])