## Pandas-离散化和面元划分

内容介绍:
 * 离散化：通过数据处理，得到数据在整体中的相对大小
 * 面元划分：分阶段

In [3]:
import numpy as np
import pandas as pd

In [18]:
# 示例数据
s0 = pd.Series(np.random.randint(18,70,size=(10)))
print(s0)
df0 = pd.DataFrame(np.random.randint(-9,9,size=(4,3)),index=['d','b','c','a'],columns=['B','A','C'])
df0

0    58
1    26
2    23
3    35
4    41
5    22
6    42
7    26
8    40
9    32
dtype: int32


Unnamed: 0,B,A,C
d,-2,8,-1
b,5,6,6
c,6,0,-1
a,2,5,-9


In [24]:
arr = np.random.randint(18,100,size=(20))
arr

array([21, 83, 40, 48, 80, 62, 61, 70, 87, 82, 58, 42, 72, 25, 32, 27, 91,
       68, 29, 51])

### 1.1面元划分-cut函数

面元划分及相关操作

In [8]:
# 划分数据的阶段
bins = [18,25,35,60,100]

In [25]:
#获取对应数据的面元。理解为返回的是表示不同面元名称的字符串。
cats = pd.cut(arr,bins)
cats

[(18, 25], (60, 100], (35, 60], (35, 60], (60, 100], ..., (25, 35], (60, 100], (60, 100], (25, 35], (35, 60]]
Length: 20
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [26]:
#针对Series不能显示此属性。python数组和numpy ndarry数组可以使用此属性。
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [28]:
#显示数组对应的阶段
cats.codes

array([0, 3, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 3, 0, 1, 1, 3, 3, 1, 2],
      dtype=int8)

In [30]:
# 分类数据统计
pd.value_counts(cats)

(60, 100]    10
(35, 60]      5
(25, 35]      3
(18, 25]      2
dtype: int64

In [33]:
#默认的左开右闭。修改这种方式如下：
cat2 = pd.cut(arr,bins,right=False)
cat2

[[18, 25), [60, 100), [35, 60), [35, 60), [60, 100), ..., [25, 35), [60, 100), [60, 100), [25, 35), [35, 60)]
Length: 20
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [35]:
# 分类数据统计
pd.value_counts(cat2)

[60, 100)    10
[35, 60)      5
[25, 35)      4
[18, 25)      1
dtype: int64

In [38]:
#自定义面元名称。让面元(数据分区)有可理解的意义。
names=['青少年','青年','中年','老年']
cat3 = pd.cut(arr,bins,labels=names)
cat3

['青少年', '老年', '中年', '中年', '老年', ..., '青年', '老年', '老年', '青年', '中年']
Length: 20
Categories (4, object): ['青少年' < '青年' < '中年' < '老年']

In [40]:
#统计数据出现时，按照标签显示各阶段的数量
pd.value_counts(cat3)

老年     10
中年      5
青年      3
青少年     2
dtype: int64

In [41]:
#不输入设定的阶段值，按照数量划分阶段
arr2 = np.random.rand(20)
arr2

array([0.87767343, 0.04761865, 0.83973173, 0.36743851, 0.57763083,
       0.48788535, 0.51063946, 0.60210982, 0.82046727, 0.36311687,
       0.18892736, 0.04363943, 0.79841758, 0.90671213, 0.15947717,
       0.77079391, 0.58197522, 0.38258206, 0.9495849 , 0.31636671])

In [48]:
#在cat函数中传入阶段数量，函数会根据传入的数据自动划分。precision=2是返回结果为两位小数。
# 数值所在的区间四等分。即数值区间在0-100，四块均等分为25.
cat4 = pd.cut(arr2,4,precision=2)
cat4

[(0.72, 0.95], (0.043, 0.27], (0.72, 0.95], (0.27, 0.5], (0.5, 0.72], ..., (0.72, 0.95], (0.5, 0.72], (0.27, 0.5], (0.72, 0.95], (0.27, 0.5]]
Length: 20
Categories (4, interval[float64, right]): [(0.043, 0.27] < (0.27, 0.5] < (0.5, 0.72] < (0.72, 0.95]]

In [49]:
pd.value_counts(cat4)

(0.72, 0.95]     7
(0.27, 0.5]      5
(0.043, 0.27]    4
(0.5, 0.72]      4
dtype: int64

### 1.2面元划分qcut函数

得到每个面元数据量相等的划分

In [50]:
arr12 = np.random.rand(1000)

In [52]:
# 可以实现按照数量的四等分。即总体为100个数据，那么每份分为25个数据。
cat12 = pd.qcut(arr12,4)
cat12

[(-0.00042200000000000007, 0.268], (0.268, 0.514], (-0.00042200000000000007, 0.268], (0.514, 0.752], (-0.00042200000000000007, 0.268], ..., (0.514, 0.752], (0.268, 0.514], (0.268, 0.514], (0.514, 0.752], (0.514, 0.752]]
Length: 1000
Categories (4, interval[float64, right]): [(-0.00042200000000000007, 0.268] < (0.268, 0.514] < (0.514, 0.752] < (0.752, 0.999]]

In [53]:
pd.value_counts(cat12)

(-0.00042200000000000007, 0.268]    250
(0.268, 0.514]                      250
(0.514, 0.752]                      250
(0.752, 0.999]                      250
dtype: int64

In [54]:
#qcut自己设定范围的划分
cat121 = pd.qcut(arr12,[0,0.1,0.5,0.9])
cat121

[(-0.00042200000000000007, 0.113], (0.113, 0.514], (0.113, 0.514], (0.514, 0.896], (0.113, 0.514], ..., (0.514, 0.896], (0.113, 0.514], (0.113, 0.514], (0.514, 0.896], (0.514, 0.896]]
Length: 1000
Categories (3, interval[float64, right]): [(-0.00042200000000000007, 0.113] < (0.113, 0.514] < (0.514, 0.896]]

In [55]:
pd.value_counts(cat121)

(0.113, 0.514]                      400
(0.514, 0.896]                      400
(-0.00042200000000000007, 0.113]    100
dtype: int64

### 2.检测和过滤异常值

In [4]:
df2 = pd.DataFrame(np.random.randn(1000,4))
df2

Unnamed: 0,0,1,2,3
0,1.951637,-0.009127,-0.294744,-0.796159
1,1.293682,-0.001784,0.001199,-0.009707
2,0.441607,0.512919,0.979075,0.323896
3,0.159724,0.473924,-2.286506,0.765562
4,1.671371,-1.180175,-1.133407,1.856926
...,...,...,...,...
995,-0.622098,-0.497169,-0.073101,1.388692
996,-0.062709,0.337784,-1.364190,0.628116
997,-0.107963,-0.130047,0.497731,-1.143566
998,1.219360,-0.483168,-0.993379,-1.207324


In [5]:
np.abs(df2)>2

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


In [10]:
#找到绝对值大于三的数值。筛选异常值。
df2[(np.abs(df2)>2).any(1)]

Unnamed: 0,0,1,2,3
3,0.159724,0.473924,-2.286506,0.765562
12,-1.190548,1.514092,0.635842,2.421032
14,-0.657061,2.075509,0.521587,0.744975
17,-0.012271,0.832700,0.552373,3.003729
36,-1.670941,-2.461845,-1.258139,-0.935025
...,...,...,...,...
986,-0.427006,2.654343,0.478391,0.624780
989,-2.219238,-0.011853,-0.025186,0.817374
993,0.034468,1.213977,0.792636,2.092112
994,0.404317,0.186653,-0.732448,2.489930


In [11]:
#针对某些异常值的设置
df2[(np.abs(df2)>3)] = 3

In [12]:
df2.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.037085,0.035339,0.020592,0.079983
std,1.01467,0.994967,0.952063,0.996232
min,-2.641156,-2.617328,-2.682684,-2.768796
25%,-0.685504,-0.604792,-0.649864,-0.584327
50%,0.030673,-0.009623,0.02843,0.049411
75%,0.707349,0.658782,0.63578,0.733608
max,3.0,3.0,3.0,3.0


### 3.排列和随即采样

In [13]:
df3 = pd.DataFrame(np.arange(5*4).reshape(5,4))
df3

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [17]:
#获取包含随机数的数组
sam = np.random.permutation(5)
sam

array([2, 0, 3, 4, 1])

In [18]:
#使用随即数组，获取行随机排列的数据
#随即数组的生成是根据np.random.permutation函数进行
df3.take(sam)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
1,4,5,6,7


In [20]:
#随机选取数行
df3.sample(3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
2,8,9,10,11
4,16,17,18,19


In [21]:
#随机选取数行
df3.sample(n=1)

Unnamed: 0,0,1,2,3
2,8,9,10,11


In [22]:
ch = pd.Series([5,8,1,3,0])
ch

0    5
1    8
2    1
3    3
4    0
dtype: int64

In [24]:
#选取大于数据量本身的随机数据
ch.sample(n=10,replace=True)

2    1
0    5
3    3
4    0
4    0
3    3
1    8
3    3
1    8
2    1
dtype: int64