In [1]:
import numpy as np
import pandas as pd

# 基本数据类型

## pd.Series

- 一维数组，由index和value组成
- index默认自动生成，也可以手动指定

In [6]:
# Build a Series from a plain list; with no index given, pandas generates 0..n-1.
s = pd.Series(data=[5, 4, 3])
s

0    5
1    4
2    3
dtype: int64

In [8]:
# Show the two parts of the Series: its index (labels) and its backing value array.
s.index,s.array

(RangeIndex(start=0, stop=3, step=1),
 <PandasArray>
 [5, 4, 3]
 Length: 3, dtype: int64)

In [9]:
# Supply explicit labels through `index` instead of the default integer range.
# (The original `s=s=...` chained assignment bound the same name twice.)
s = pd.Series([5, 4, 3], index=['aa', 'bb', 'cc'])
s

aa    5
bb    4
cc    3
dtype: int64

In [10]:
# Show the index and the underlying value array of `s`.
s.index,s.array

(Index(['aa', 'bb', 'cc'], dtype='object'),
 <PandasArray>
 [5, 4, 3]
 Length: 3, dtype: int64)

### Series元素访问
- 单元素取值
- 赋值
- 区间取值
- 根据索引取值
- 和numpy 1-rank array一样的操作

In [50]:
# Rebuild the Series so this cell does not depend on earlier mutations.
# (The original `s=s=...` chained assignment bound the same name twice.)
s = pd.Series([5, 4, 3], index=['aa', 'bb', 'cc'])
print(s['aa'])                  # read a single element by label
s['aa'] = 11                    # assign by label (mutates `s` in place)
print(s['aa']**2)
print()
print(s[['aa', 'cc', 'cc']])    # a list of labels; a repeated label repeats the row
print()
# Positional slice: use .iloc explicitly so an integer slice on a label index
# is never mistaken for a label lookup. The end position is excluded.
print(s.iloc[0:2])

5
121

aa    11
cc     3
cc     3
dtype: int64

aa    11
bb     4
dtype: int64


In [51]:
obj = pd.Series([3, 1, 4])
# A boolean mask keeps only the entries greater than 1.
print(obj.loc[obj > 1], end="\n\n")
# NumPy ufuncs are applied element-wise; the index is preserved.
print(np.exp(obj))

0    3
2    4
dtype: int64

0    20.085537
1     2.718282
2    54.598150
dtype: float64


### Series与dict的关系

In [52]:
# A dict converts directly to a Series: keys become the index, values the data.
d = dict(zhangsan=20, lisi=50, wangwu=80)
obj = pd.Series(data=d)
obj

zhangsan    20
lisi        50
wangwu      80
dtype: int64

In [53]:
# Pass an explicit index to control the label order. A label that is not a key
# of the dict ('unk') gets NaN.
obj=pd.Series(d,index=['wangwu','lisi','zhangsan','unk'])
obj

wangwu      80.0
lisi        50.0
zhangsan    20.0
unk          NaN
dtype: float64

### isna,notna
- isna表示索引对应的值是否缺失（没有设置）

In [54]:
# isna() flags the missing entries; notna() flags the present ones.
obj.isna(),obj.notna()

(wangwu      False
 lisi        False
 zhangsan    False
 unk          True
 dtype: bool,
 wangwu       True
 lisi         True
 zhangsan     True
 unk         False
 dtype: bool)

### 高级用法
- 数据标签对齐
- name属性
- 更改index

In [57]:
salary = dict(zhangsan=22, lisi=55, wangwu=30)
bounds = dict(lisi=55, zhangsan=22)

s_salary = pd.Series(data=salary)
s_bounds = pd.Series(data=bounds)
# Arithmetic aligns on labels, not on position; a label present on only one
# side ('wangwu') yields NaN.
s_salary.add(s_bounds)

lisi        110.0
wangwu        NaN
zhangsan     44.0
dtype: float64

In [62]:
# Name the Series itself and its index; both names appear when it is displayed.
s_salary.name='salary'
s_salary.index.name='employmentName'
s_salary

employmentName
zhangsan    22
lisi        55
wangwu      30
Name: salary, dtype: int64

In [66]:
# Assigning to .index replaces all labels in place; the values stay the same.
s_salary.index=["a",'b','c']
s_salary

a    22
b    55
c    30
Name: salary, dtype: int64

## DataFrame
DataFrame是由多个Series组成，每个Series关联一个Key(column name),所有的Series共享相同的index

- 创建方法
- 访问：行访问，列访问
- 列整体赋值，创建新列，删除列
- describe,dtypes
- columns,index属性

In [133]:
# Column-oriented input: each key is a column name, each list holds that column's values.
ds = dict(
    id=[0, 1, 2],
    name=['zhang', 'li', 'wang'],
    age=[33, 22, 44],
    sex=['M', 'F', 'M'],
)

df = pd.DataFrame(data=ds)
df

Unnamed: 0,id,name,age,sex
0,0,zhang,33,M
1,1,li,22,F
2,2,wang,44,M


In [134]:
# First and last rows of the frame; it has only 3 rows, so both calls show all of them.
df.head(),df.tail()

(   id   name  age sex
 0   0  zhang   33   M
 1   1     li   22   F
 2   2   wang   44   M,
    id   name  age sex
 0   0  zhang   33   M
 1   1     li   22   F
 2   2   wang   44   M)

In [135]:
# Choose the column order explicitly; 'pclass' is not a key of `ds`, so that
# column is created with no values.
df=pd.DataFrame(ds,columns=["name",'age','sex','id','pclass'])
df

Unnamed: 0,name,age,sex,id,pclass
0,zhang,33,M,0,
1,li,22,F,1,
2,wang,44,M,2,


In [148]:
# Per-column dtypes.
# NOTE(review): the stored output lists a `ticket` column that is only added in
# a later cell, so this cell was executed out of order. Re-run top to bottom.
print(df.dtypes)

# Summary statistics restricted to the object-dtype columns.
df.describe(include='object')

name      object
age        int64
sex       object
id         int64
pclass    object
ticket     int64
dtype: object


Unnamed: 0,name,sex,pclass
count,3,3,0.0
unique,3,2,0.0
top,zhang,M,
freq,1,2,


访问：行访问，列访问

In [136]:
# Access a single column (attribute style); the result is a Series.

df.sex

0    M
1    F
2    M
Name: sex, dtype: object

In [137]:
# Access rows by slice. .loc slices by label.

df.loc[0:1]
# .iloc slices by position and excludes the end position.
# NOTE: only this last expression is displayed; the .loc result above is discarded.
df.iloc[0:1]

Unnamed: 0,name,age,sex,id,pclass
0,zhang,33,M,0,


列整体赋值，创建新列，删除列

In [138]:
# Assigning to a new column name creates the column; here a boolean flag for age > 30.
df['isOrd']=df.age>30
df

Unnamed: 0,name,age,sex,id,pclass,isOrd
0,zhang,33,M,0,,True
1,li,22,F,1,,False
2,wang,44,M,2,,True


In [139]:
# `del` removes the column from the frame in place.
del df['isOrd']
df

Unnamed: 0,name,age,sex,id,pclass
0,zhang,33,M,0,
1,li,22,F,1,
2,wang,44,M,2,


In [140]:
# Column assignment

# If the assigned value is a Series, the alignment rule still applies
# df['ticket']=pd.Series([1000,200,333],index=['s1','s2','s3'])
# A plain list is assigned row by row, in order.
df['ticket']=[1000,200,333]
df

Unnamed: 0,name,age,sex,id,pclass,ticket
0,zhang,33,M,0,,1000
1,li,22,F,1,,200
2,wang,44,M,2,,333


In [143]:
# Note: only an explicit copy guarantees that later modifications do not affect the DataFrame.
names=df.name.copy()
names[0]='ss'
# `df` still shows the original name in row 0.
df

Unnamed: 0,name,age,sex,id,pclass,ticket
0,zhang,33,M,0,,1000
1,li,22,F,1,,200
2,wang,44,M,2,,333


### 多种构建方法，与numpy的转换,保存为csv文件

In [152]:
# Nested dict: outer keys become columns, inner keys become row labels.
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}
# An explicit index fixes the row order; (2000, "Nevada") has no entry and becomes NaN.
df = pd.DataFrame(data=populations, index=[2001, 2002, 2000])
df

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2000,1.5,


In [153]:
# Convert the frame's values to a 2-D NumPy array (row/column labels are not included).
df.to_numpy()

array([[1.7, 2.4],
       [3.6, 2.9],
       [1.5, nan]])

In [154]:
# Write the frame to 'a.csv' (relative path, so it lands in the current working directory).
df.to_csv('a.csv')

### 重要的属性

In [155]:
# Row labels of the frame.
df.index

Int64Index([2001, 2002, 2000], dtype='int64')

In [158]:
# Column labels of the frame.
df.columns

Index(['Ohio', 'Nevada'], dtype='object')

## 索引对象

In [164]:
# Three integers labelled 'a', 'b', 'c'.
obj = pd.Series(data=np.arange(3), index=list("abc"))

In [170]:
# Keep a reference to the Series' Index object.
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [171]:
# Index objects cannot be modified: item assignment raises TypeError.
# The error is caught and printed so the demonstration does not abort
# "Restart & Run All".
try:
    index[1] = 'd'
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Index does not support mutable operations

In [174]:
# Membership test against the index labels.
'a' in index

True

# 重要的方法
- reindex

## reindex:索引排列
- 可以重新排列行(index)或者列（columns）
- reindex可以引入新的行或列

In [176]:
# 3x3 frame holding 0..8; note that row label 'b' is deliberately absent.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list("acd"),
    columns=["Ohio", "Texas", "California"],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [179]:
# Re-order the rows; label 'b' does not exist in `frame`, so its row is all NaN.
frame.reindex(index=['a','b','c'])
# or
# frame.reindex(['a','b','c'],axis=0)

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0


In [182]:
# Re-order the columns.
frame.reindex(columns=['California','Texas','Ohio'])
# or
# frame.reindex(['California','Texas','Ohio'],axis=1)

Unnamed: 0,California,Texas,Ohio
a,2,1,0
c,5,4,3
d,8,7,6


In [188]:
# A column that does not exist yet ('NewCol') is created and filled with fill_value.
frame.reindex(['California','Texas','Ohio','NewCol'],axis=1,fill_value="missing")

Unnamed: 0,California,Texas,Ohio,NewCol
a,2,1,0,missing
c,5,4,3,missing
d,8,7,6,missing


## drop:删除行或者列
- 可以删除行(index)或者列（columns）
- 一次可以删除多个

In [196]:
# 4x4 frame holding 0..15, used for the drop examples.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [199]:
# Drop several rows by label at once; with inplace=False a new frame is
# returned and `data` itself is left unchanged.
data.drop(index=['Utah','New York'],inplace=False)
# or
# data.drop(['Utah','New York'],axis=0,inplace=False)

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [204]:
# Drop several columns by label. The two spellings below do the same thing;
# only the last expression of the cell is displayed.
data.drop(columns=['one','four'],inplace=False)
# or
data.drop(['one','four'],axis=1,inplace=False)

Unnamed: 0,two,three
Ohio,1,2
Colorado,5,6
Utah,9,10
New York,13,14


## index
- loc方法索引是label, iloc方法索引是integer index
- dataframe的index用法

In [213]:
# Values 3.0, 2.0, 1.0, 0.0 (a reversed float range) labelled 'a'..'d'.
obj = pd.Series(data=np.arange(4.)[::-1], index=list("abcd"))
obj

a    3.0
b    2.0
c    1.0
d    0.0
dtype: float64

In [214]:
# .iloc selects by integer position: the first two entries.
obj.iloc[[0,1]]

a    3.0
b    2.0
dtype: float64

In [215]:
# .loc is label-based: 0 and 1 are positions, not labels of this index, so the
# lookup raises KeyError. The error is caught and printed so the demonstration
# does not abort "Restart & Run All".
try:
    obj.loc[[0, 1]]
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: "None of [Int64Index([0, 1], dtype='int64')] are in the [index]"

In [218]:
# For a Series, the following two are equivalent
# (only the last expression of the cell is displayed).
obj[0:1]
obj.iloc[0:1]

a    3.0
dtype: float64

dataframe的index用法

In [220]:
# Rebuild the 4x4 frame (0..15) so the indexing examples start from a known state.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns="one two three four".split(),
    index=["Ohio", "Colorado", "Utah", "New York"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [227]:
# Column selection: a list of column names returns those columns.
data[['one','four']]

Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
New York,12,15


In [238]:
# Row selection: slicing the frame directly slices its rows (here positions 1 and 2).
data[1:3]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


In [232]:
# Boolean selection with a Series mask: keeps the rows where the mask is True.
flag=data.three>3
data[flag]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [239]:
# Boolean selection with a DataFrame mask: the shape is kept and cells where
# the mask is False become NaN.
flag=data>3
data[flag]

Unnamed: 0,one,two,three,four
Ohio,,,,
Colorado,4.0,5.0,6.0,7.0
Utah,8.0,9.0,10.0,11.0
New York,12.0,13.0,14.0,15.0


In [240]:
### iloc,loc

In [255]:
# Select rows (a label may be listed more than once)
data.loc[["Colorado","Colorado"]]
# equivalent to
data.iloc[[1,1]]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Colorado,4,5,6,7


In [256]:
# Label slices with .loc include the end label on both axes.
data.loc['Colorado':'Utah','two':'two']
# equivalent to (positional .iloc slices exclude the end)
data.iloc[1:3,1:2]

Unnamed: 0,two
Colorado,5
Utah,9


### integer index容易造成的bug

In [258]:
# A Series whose index labels are themselves the integers 0, 1, 2.
s = pd.Series(data=range(3))
s

0    0
1    1
2    2
dtype: int64

In [260]:
# With an integer index, is -1 a label or a position? Plain [] indexing is ambiguous here.
# s[-1]

In [267]:
# For positional access, use .iloc explicitly
s.iloc[-1]

2

## apply方法

In [3]:
# Draw from a locally seeded generator so this frame, and every result derived
# from it below, is reproducible under "Restart & Run All".
# NOTE: outputs stored from earlier unseeded runs will differ from a fresh run.
rng = np.random.default_rng(0)
df = pd.DataFrame(
    rng.standard_normal((4, 3)),
    index=['Us', 'Zh', 'Uk', 'Jp'],
    columns=['a', 'b', 'c'],
)
df

Unnamed: 0,a,b,c
Us,-2.325166,1.025218,-1.631096
Zh,-0.371103,1.633793,1.202372
Uk,-0.030218,2.317793,-0.101848
Jp,1.561648,0.039601,0.409692


In [4]:
def diff(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods (for example a pandas
    Series or a NumPy array).
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

In [7]:
# Call the helper directly on a single column.
diff(df.a)

3.8868135144517013

In [11]:
# `axis` names the axis that gets reduced: 'index' collapses the rows, giving
# one value per column.
df.apply(diff,axis='index')

a    3.886814
b    2.278192
c    2.833468
dtype: float64

In [12]:
# 'columns' collapses the columns, giving one value per row.
df.apply(diff,axis='columns')

Us    3.350384
Zh    2.004896
Uk    2.419641
Jp    1.522046
dtype: float64

In [16]:
def minmax(x):
    """Summarise ``x`` as a Series with the entries 'min', 'max' and 'std'.

    ``x`` must provide ``min()``, ``max()`` and ``std()`` methods (for example
    a pandas Series).
    """
    labels = ['min', 'max', 'std']
    values = [x.min(), x.max(), x.std()]
    return pd.Series(values, index=labels)
# Applied per column; because minmax returns a Series, the result is a
# DataFrame with one row per statistic.
df.apply(minmax)

Unnamed: 0,a,b,c
min,-2.325166,0.039601,-1.631096
max,1.561648,2.317793,1.202372
std,1.596304,0.96661,1.194535
