# Pandas Note

## 创建、存储和读取

### 创建

In [1]:
import pandas as pd
df1 = pd.DataFrame({'工资': [5000, 7000, 8500, 9000], '绩效分': [60, 84, 98, 91], '备注': ['不及格', '良好', '最佳', '优秀']}, index=['老王', '老牛', '老张', '老刘'])
print(df1)

      工资  绩效分   备注
老王  5000   60  不及格
老牛  7000   84   良好
老张  8500   98   最佳
老刘  9000   91   优秀


### 读取

In [5]:
import pandas as pd
df2 = pd.read_csv('00001.vcf')
df2.head()
print(df2)

                                           BEGIN:VCARD
0                                          VERSION:2.1
1    N;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:;=E6...
2    FN;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:=E6...
3                                TEL;VOICE:15996352969
4                                            END:VCARD
..                                                 ...
256                                        VERSION:2.1
257  N;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:;=E6...
258  FN;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:=E6...
259                             TEL;CELL:135 9870 4804
260                                          END:VCARD

[261 rows x 1 columns]


### 存储

In [7]:
df1 = pd.DataFrame({'工资': [5000, 7000, 8500, 9000], '绩效分': [60, 84, 98, 91], '备注': ['不及格', '良好', '最佳', '优秀']}, index=['老王', '老牛', '老张', '老刘'])
df1.to_csv('df1.csv')

## 数据概览

### 掐头看尾

In [9]:
df2.head(10)
df2.tail(10)

Unnamed: 0,BEGIN:VCARD
251,N;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:;=E6...
252,FN;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:=E6...
253,TEL;CELL:156 7058 1861
254,END:VCARD
255,BEGIN:VCARD
256,VERSION:2.1
257,N;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:;=E6...
258,FN;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:=E6...
259,TEL;CELL:135 9870 4804
260,END:VCARD


### 格式查看

In [11]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   BEGIN:VCARD  261 non-null    object
dtypes: object(1)
memory usage: 2.2+ KB


### 统计信息概览

In [15]:
df2.describe()
df1.describe()     #　只针对数值类型的列

Unnamed: 0,工资,绩效分
count,4.0,4.0
mean,7375.0,83.25
std,1796.988221,16.52019
min,5000.0,60.0
25%,6500.0,78.0
50%,7750.0,87.5
75%,8625.0,92.75
max,9000.0,98.0


## 列的简单操作

### 增

In [18]:
import numpy as np
df3 = pd.DataFrame({'first':np.arange(10)}, index=np.arange(10))
df3

Unnamed: 0,first
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [22]:
df3['second'] = np.zeros(10)
df3

Unnamed: 0,first,second
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
5,5,0.0
6,6,0.0
7,7,0.0
8,8,0.0
9,9,0.0


### 删

In [23]:
df3.drop('second', axis = 1, inplace = True)
df3

Unnamed: 0,first
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


### 查

In [25]:
df3['second'] = np.zeros(10)
l2 = df3[['first', 'second']]
print(l2)

   first  second
0      0     0.0
1      1     0.0
2      2     0.0
3      3     0.0
4      4     0.0
5      5     0.0
6      6     0.0
7      7     0.0
8      8     0.0
9      9     0.0


### 改

In [26]:
df3['second'] = np.ones(10)
df3

Unnamed: 0,first,second
0,0,1.0
1,1,1.0
2,2,1.0
3,3,1.0
4,4,1.0
5,5,1.0
6,6,1.0
7,7,1.0
8,8,1.0
9,9,1.0


## 常用的数据类型及操作

### 字符串

In [29]:
df3['third'] = '第三列'
df3

Unnamed: 0,first,second,third
0,0,1.0,第三列
1,1,1.0,第三列
2,2,1.0,第三列
3,3,1.0,第三列
4,4,1.0,第三列
5,5,1.0,第三列
6,6,1.0,第三列
7,7,1.0,第三列
8,8,1.0,第三列
9,9,1.0,第三列


In [30]:
df3['third'] = df3['third'].str.replace('第', '')
df3

Unnamed: 0,first,second,third
0,0,1.0,三列
1,1,1.0,三列
2,2,1.0,三列
3,3,1.0,三列
4,4,1.0,三列
5,5,1.0,三列
6,6,1.0,三列
7,7,1.0,三列
8,8,1.0,三列
9,9,1.0,三列


### 数值型

In [32]:
df3['forth'] = df3['first'] + df3['second']
df3

Unnamed: 0,first,second,third,forth
0,0,1.0,三列,1.0
1,1,1.0,三列,2.0
2,2,1.0,三列,3.0
3,3,1.0,三列,4.0
4,4,1.0,三列,5.0
5,5,1.0,三列,6.0
6,6,1.0,三列,7.0
7,7,1.0,三列,8.0
8,8,1.0,三列,9.0
9,9,1.0,三列,10.0


### 时间类型

In [34]:
df3['fifth'] = pd.to_datetime('2020-8-2')
df3['fifth'].head(5)

0   2020-08-02
1   2020-08-02
2   2020-08-02
3   2020-08-02
4   2020-08-02
Name: fifth, dtype: datetime64[ns]

In [35]:
pd.to_datetime('2020-12-31') - df3['fifth']

0   151 days
1   151 days
2   151 days
3   151 days
4   151 days
5   151 days
6   151 days
7   151 days
8   151 days
9   151 days
Name: fifth, dtype: timedelta64[ns]

## 索引

### 行选取

选取所有的一级流量来源，索引号是0到12，由于左闭右开所以选到13.用iloc属性

In [39]:
df4 = pd.read_excel('流量练习数据.xls')
print(df4)
df4.iloc[:13, :]

   流量来源 来源明细    访客数   支付转化率     客单价
0    一级   -A  35188  0.0998   54.30
1    一级   -B  28467  0.1127   99.93
2    一级   -C  13747  0.0254    0.08
3    一级   -D   5183  0.0247   37.15
4    一级   -E   4361  0.0431   91.73
5    一级   -F   4063  0.1157   65.09
6    一级   -G   2122  0.1027   86.45
7    一级   -H   2041  0.0706   44.07
8    一级   -I   1991  0.1652  104.57
9    一级   -J   1981  0.0575   75.93
10   一级   -K   1958  0.1471   85.03
11   一级   -L   1780  0.1315   98.87
12   一级   -M   1447  0.0104   80.07
13   二级   -A  39048  0.1160   91.91
14   二级   -B   3316  0.0709   66.28
15   二级   -C   2043  0.0504   41.91
16   三级   -A  23140  0.0969   83.75
17   三级   -B  14813  0.2014   82.97
18   四级   -A    216  0.0185   94.25
19   四级   -B     31  0.0000     NaN
20   四级   -C     17  0.0000     NaN
21   四级   -D      3  0.0000     NaN


Unnamed: 0,流量来源,来源明细,访客数,支付转化率,客单价
0,一级,-A,35188,0.0998,54.3
1,一级,-B,28467,0.1127,99.93
2,一级,-C,13747,0.0254,0.08
3,一级,-D,5183,0.0247,37.15
4,一级,-E,4361,0.0431,91.73
5,一级,-F,4063,0.1157,65.09
6,一级,-G,2122,0.1027,86.45
7,一级,-H,2041,0.0706,44.07
8,一级,-I,1991,0.1652,104.57
9,一级,-J,1981,0.0575,75.93


### 列选取

选取流量来源和客单价,若连续跨列选取，则直接[0: 4]

In [40]:
df4.iloc[:, [0,4]]

Unnamed: 0,流量来源,客单价
0,一级,54.3
1,一级,99.93
2,一级,0.08
3,一级,37.15
4,一级,91.73
5,一级,65.09
6,一级,86.45
7,一级,44.07
8,一级,104.57
9,一级,75.93


### 行列交叉选择

目标：我们想要看一看二级、三级流量来源、来源明细对应的访客和支付转化率  
思路：先看行，二级三级渠道对应行索引是13:17，再次强调索引含首不含尾的原则，我们传入的行参数是13:18；列的话我们需要流量来源、来源明细、访客和转化，也就是前4列，传入参数0:4。

In [44]:
df4.iloc[13:18, 0:4]

Unnamed: 0,流量来源,来源明细,访客数,支付转化率
13,二级,-A,39048,0.116
14,二级,-B,3316,0.0709
15,二级,-C,2043,0.0504
16,三级,-A,23140,0.0969
17,三级,-B,14813,0.2014


### 基于名称的索引

筛选出具有特定名称的几行,用loc属性

In [46]:
df4['流量来源'] == '二级'

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13     True
14     True
15     True
16    False
17    False
18    False
19    False
20    False
21    False
Name: 流量来源, dtype: bool

In [47]:
df4.loc[df4['流量来源'] == '二级', :]

Unnamed: 0,流量来源,来源明细,访客数,支付转化率,客单价
13,二级,-A,39048,0.116,91.91
14,二级,-B,3316,0.0709,66.28
15,二级,-C,2043,0.0504,41.91


筛选出特定的几列

In [49]:
df4.loc[df4['流量来源'] == '一级', ['流量来源', '访客数', '客单价']]

Unnamed: 0,流量来源,访客数,客单价
0,一级,35188,54.3
1,一级,28467,99.93
2,一级,13747,0.08
3,一级,5183,37.15
4,一级,4361,91.73
5,一级,4063,65.09
6,一级,2122,86.45
7,一级,2041,44.07
8,一级,1991,104.57
9,一级,1981,75.93


我们想要提取二级、三级流量来源、来源明细对应的访客和支付转化率。

In [52]:
df4.loc[df4['流量来源'].isin(['二级', '三级']), ['流量来源', '来源明细', '访客数', '支付转化率']]

Unnamed: 0,流量来源,来源明细,访客数,支付转化率
13,二级,-A,39048,0.116
14,二级,-B,3316,0.0709
15,二级,-C,2043,0.0504
16,三级,-A,23140,0.0969
17,三级,-B,14813,0.2014


### 统计数据

In [58]:
mean = df4['访客数'].mean()
std = df4['访客数'].std()
m = df4['访客数'].median()
ma = df4['访客数'].max()
mi = df4['访客数'].min()
print(mean, '\n', std, '\n', m,'\n', ma, '\n', mi)

8498.0 
 12015.237559825757 
 2082.5 
 39048 
 3


问题：对于流量渠道数据，我们真正应该关注的是优质渠道，假如这里我们定义访客数、转化率、客单价都高于平均值渠道是优质渠道，那怎么找到这些渠道呢？  
思路：优质渠道，得同时满足访客、转化、客单高于平均值这三个条件，这是解题的关键。  
要三个条件同时满足，他们之间是一个“且”的关系（同时满足），在pandas中，要表示同时满足，各条件之间要用"&"符号连接，条件内部最好用括号区分；如果是“或”的关系（满足一个即可），则用“|”符号连接：

In [60]:
df4.loc[(df4['访客数']>df4['访客数'].mean()) & (df4['支付转化率']>df4['支付转化率'].mean()) & (df4['客单价']>df4['客单价'].mean()), :]

Unnamed: 0,流量来源,来源明细,访客数,支付转化率,客单价
1,一级,-B,28467,0.1127,99.93
13,二级,-A,39048,0.116,91.91
16,三级,-A,23140,0.0969,83.75
17,三级,-B,14813,0.2014,82.97
