### Series

In [25]:
import pandas as pd

from pandas import DataFrame,Series

obj = Series([1, 2, 3, 4, 5])
print(obj)
print(obj.values)
print(obj.index)

0    1
1    2
2    3
3    4
4    5
dtype: int64
[1 2 3 4 5]
RangeIndex(start=0, stop=5, step=1)


In [15]:
obj = Series(['a', 'b', 'c', 'd', 'e'], index=[1, 2, 3, 4, 5])
print(obj)

# 可以通过索引访问值
print(obj[1])
print(obj[2])

# 把series当作字典使用，这个时候，字典中的键就是series的索引
data = {'a':10000, 'b':20000, 'c':30000}
obj = Series(data)
print(obj)

keys = ['a', 'c']
obj_1 = Series(data=data, index=keys)
print(obj_1)

1    a
2    b
3    c
4    d
5    e
dtype: object
a
b
a    10000
b    20000
c    30000
dtype: int64
a    10000
c    30000
dtype: int64


In [16]:
# 缺失数据的处理
data = {'a': None, 'b':20000, 'c':30000}
obj = Series(data)
print(obj)

a        NaN
b    20000.0
c    30000.0
dtype: float64


In [17]:
pd.isnull(obj)

a     True
b    False
c    False
dtype: bool

In [18]:
obj.isnull()

a     True
b    False
c    False
dtype: bool

In [19]:
obj.notnull()

a    False
b     True
c     True
dtype: bool

In [24]:
data = {'Jack':20, 'Alice':13, 'Bob':None, 'Rick':None}
obj = Series(data)
print(obj)

obj.name = 'NameAndAge'
obj.index.name = 'name'
print(obj)

Alice    13.0
Bob       NaN
Jack     20.0
Rick      NaN
dtype: float64
name
Alice    13.0
Bob       NaN
Jack     20.0
Rick      NaN
Name: NameAndAge, dtype: float64


### Dataframe

In [27]:
data = {
    '姓名':['李雷', '韩梅梅', '刘杰'],
    '年龄':['17', '20', '22'],
    '身高':['170','165','180']
}
frame_data = DataFrame(data)
print(frame_data)
# 获取的就是一个Series
print(frame_data['姓名'])

    姓名  年龄   身高
0   李雷  17  170
1  韩梅梅  20  165
2   刘杰  22  180
0     李雷
1    韩梅梅
2     刘杰
Name: 姓名, dtype: object


In [34]:
import numpy as np

dates = pd.date_range('20200229', periods=6)
print(dates)

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

print(df.T)

DatetimeIndex(['2020-02-29', '2020-03-01', '2020-03-02', '2020-03-03',
               '2020-03-04', '2020-03-05'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2020-02-29  3.207279  0.657534  1.014012 -1.490475
2020-03-01  2.112011  1.607727 -0.356179  0.458387
2020-03-02 -0.183542 -1.244763  0.293693 -0.498699
2020-03-03  0.998262  1.206722  0.048553  1.212402
2020-03-04  0.640979  2.193805 -0.658450  0.497745
2020-03-05  0.522214 -1.257766 -1.243947 -1.523981
   2020-02-29  2020-03-01  2020-03-02  2020-03-03  2020-03-04  2020-03-05
A    3.207279    2.112011   -0.183542    0.998262    0.640979    0.522214
B    0.657534    1.607727   -1.244763    1.206722    2.193805   -1.257766
C    1.014012   -0.356179    0.293693    0.048553   -0.658450   -1.243947
D   -1.490475    0.458387   -0.498699    1.212402    0.497745   -1.523981


In [36]:
df['20200229':'20200302']

Unnamed: 0,A,B,C,D
2020-02-29,3.207279,0.657534,1.014012,-1.490475
2020-03-01,2.112011,1.607727,-0.356179,0.458387
2020-03-02,-0.183542,-1.244763,0.293693,-0.498699


In [37]:
print(df.loc['20200229':'20200302', ['A', 'B']])

                   A         B
2020-02-29  3.207279  0.657534
2020-03-01  2.112011  1.607727
2020-03-02 -0.183542 -1.244763


In [43]:
print(df.at[dates[0],'A'])
print(dates[0])

# 还有一些dataframe的操作
# 取前3行
df.head(3)

# 取末尾2行
df.tail(2)

3.2072785081016186
2020-02-29 00:00:00


Unnamed: 0,A,B,C,D
2020-03-04,0.640979,2.193805,-0.65845,0.497745
2020-03-05,0.522214,-1.257766,-1.243947,-1.523981


In [48]:
data = {'Jack':[20, 30], 'Alice':[13, 14], 'Bob':None, 'Rick':None}
obj = Series(data)
obj_2 = obj
df = pd.DataFrame([obj,obj_2])
df

Unnamed: 0,Alice,Bob,Jack,Rick
0,"[13, 14]",,"[20, 30]",
1,"[13, 14]",,"[20, 30]",


### pandas的重新索引

In [50]:
obj = Series([1.2, 3.2, 4.7], index=['a','b','c'])
print(obj)
obj_2 = obj.reindex(['a','b','c', 'd', 'e'])
print(obj_2)

a    1.2
b    3.2
c    4.7
dtype: float64
a    1.2
b    3.2
c    4.7
d    NaN
e    NaN
dtype: float64


In [51]:
obj_2 = obj.reindex(['a','b','c', 'd', 'e'], fill_value = 1)
print(obj_2)

a    1.2
b    3.2
c    4.7
d    1.0
e    1.0
dtype: float64


In [55]:
obj = Series([1.2, 2.2, 3.6], index=[0,2,4])
print(obj)
# ffill是指前向填充，对于索引到为空的值，我们填充为前一个有数值的数
print(obj.reindex(range(6), method='ffill')) # 后向填充bfill

0    1.2
2    2.2
4    3.6
dtype: float64
0    1.2
1    1.2
2    2.2
3    2.2
4    3.6
5    3.6
dtype: float64


### 算数运算和数据对齐

In [56]:
d1 = Series([1.2, 2.4, 3.6, 4.8], index=['a', 'b', 'c', 'd'])
d2 = Series([-1.2, -2.4, -3.6, -4.8, 0.2, 0.8], index=['a', 'b', 'c', 'd', 'e', 'f'])
d1+d2


a    0.0
b    0.0
c    0.0
d    0.0
e    NaN
f    NaN
dtype: float64

In [58]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'), index=[1, 2, 3])
print(df1)

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8


In [59]:
df2 = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('cde'), index=[1, 2, 3, 4])
df2

Unnamed: 0,c,d,e
1,0,1,2
2,3,4,5
3,6,7,8
4,9,10,11


In [60]:
df1 + df2

Unnamed: 0,a,b,c,d,e
1,,,2.0,,
2,,,8.0,,
3,,,14.0,,
4,,,,,


In [62]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
1,0.0,1.0,2.0,1.0,2.0
2,3.0,4.0,8.0,4.0,5.0
3,6.0,7.0,14.0,7.0,8.0
4,,,9.0,10.0,11.0


### DataFrame和Series之间的运算

In [64]:
frame = pd.DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=[1, 2, 3, 4])
series = frame.loc[1]
print(frame)
print(series)

   b   d   e
1  0   1   2
2  3   4   5
3  6   7   8
4  9  10  11
b    0
d    1
e    2
Name: 1, dtype: int32


In [65]:
# 广播相减
frame - series

Unnamed: 0,b,d,e
1,0,0,0
2,3,3,3
3,6,6,6
4,9,9,9


In [66]:
series = Series(range(3), index=list('bef'))
series + frame

Unnamed: 0,b,d,e,f
1,0.0,,3.0,
2,3.0,,6.0,
3,6.0,,9.0,
4,9.0,,12.0,


### 排序

In [69]:
obj = Series(range(4), index=list('deba'))
# 按照索引排序
print(obj.sort_index())
# 按照值来排序
print(obj.sort_values())


a    3
b    2
d    0
e    1
dtype: int32
d    0
e    1
b    2
a    3
dtype: int32


In [72]:
# 根据一个轴上的索引进行排序
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=[2, 1], columns=list('cdab'))
print(frame)
# 指定轴来进行排序
print(frame.sort_index(axis=1))


   c  d  a  b
2  0  1  2  3
1  4  5  6  7
   a  b  c  d
2  2  3  0  1
1  6  7  4  5


In [75]:
frame = pd.DataFrame({'b':[4, -7, 2, 1], 'a':[1, 4, 3, 2]})
print(frame)
# 按照b的数值排列
print(frame.sort_values(by='b'))

   a  b
0  1  4
1  4 -7
2  3  2
3  2  1
   a  b
1  4 -7
3  2  1
2  3  2
0  1  4


### pandas的层次化索引

In [91]:
data = Series(np.random.randn(10), index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'], [1,2,3,4,5,6,7,8,1,2]])
print(data)

a  1    0.165958
   2    0.150672
   3    0.132544
b  4    0.082239
   5    0.121213
   6   -0.516890
c  7   -0.937649
   8    1.920713
d  1    0.321530
   2   -1.665471
dtype: float64


In [92]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3, 4, 5, 6, 7, 8]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 3, 4, 5, 6, 7, 0, 1]])

In [93]:
# 选取子集操作
print(data['b'])
print(data['b'][4])
print(data['a'][2])

4    0.082239
5    0.121213
6   -0.516890
dtype: float64
0.08223946748638968
0.15067181363484483


In [96]:
# 内层选取
data[:,2]  # 第一层索引选取所有元素，第二层索引中选取所有索引值为2的元素

a    0.150672
d   -1.665471
dtype: float64

In [95]:
data.unstack()   # 可以再stack回去 data.unstack().stack()

Unnamed: 0,1,2,3,4,5,6,7,8
a,0.165958,0.150672,0.132544,,,,,
b,,,,0.082239,0.121213,-0.51689,,
c,,,,,,,-0.937649,1.920713
d,0.32153,-1.665471,,,,,,


### 列索引

In [126]:
frame_data = pd.DataFrame(np.arange(12).reshape((4, 3)), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                         columns = [['lj', 'ly', 'lj'], ['aa', 'bb', 'aa']])
# 给行和列索引取名
frame_data.index.names = ['key1', 'key2']
frame_data.columns.names = ['n1', 'n2']
frame_data

Unnamed: 0_level_0,n1,lj,ly,lj
Unnamed: 0_level_1,n2,aa,bb,aa
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [144]:
frame_data[['lj','ly']]
frame_data['lj']['aa']
frame_data

Unnamed: 0_level_0,n1,lj,ly,lj
Unnamed: 0_level_1,n2,aa,bb,aa
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [128]:
# 行和列同时进行
frame_data.loc[['a'], ['lj']]

Unnamed: 0_level_0,n1,lj,lj
Unnamed: 0_level_1,n2,aa,aa
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,1,0,2
a,2,3,5


In [140]:
frame_data.sum(level='key1', axis=0)

n1,lj,ly,lj
n2,aa,bb,aa
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [141]:
frame_data.sum(level='n1', axis=1)

Unnamed: 0_level_0,n1,lj,ly
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### pandas对文件的读取

In [145]:
pd.read_csv('./test.csv')  # test.csv是用逗号进行分隔的

Unnamed: 0,1,2,3,4,5
0,a,b,c,d,e
1,f,g,h,i,j


In [147]:
pd.read_table('./test.csv', sep=',')   # read_table默认为制表符，所以读取的时候需要指定分隔符

Unnamed: 0,1,2,3,4,5
0,a,b,c,d,e
1,f,g,h,i,j


In [148]:
pd.read_csv('./test.csv', header=None)  # 声明没有头，会自动生成头部

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,a,b,c,d,e
2,f,g,h,i,j


In [149]:
# 指定索引（例如指定3列为索引）
pd.read_csv('./test.csv', index_col='3')

Unnamed: 0_level_0,1,2,4,5
3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,a,b,d,e
h,f,g,i,j


In [150]:
# 指定多列为层次化索引
pd.read_csv('./test.csv', index_col=['3', '4'])

Unnamed: 0_level_0,Unnamed: 1_level_0,1,2,5
3,4,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,d,a,b,e
h,i,f,g,j


### 读取exel