# 抽样(Sampling)

## 简单抽样(不放回,Sampling Without Replacement)

In [10]:
import random 
import numpy as np

# The integers 1..10 as native Python ints. tolist() converts the NumPy scalars,
# so the sample below prints as plain numbers (e.g. [1, 7, 9, 4, 10]) rather than
# NumPy scalar reprs such as np.int64(1) on NumPy >= 2.
list1 = np.arange(1, 10 + 1).tolist()

# Randomly draw 5 distinct elements (sampling without replacement)
print(random.sample(list1, 5))

[1, 7, 9, 4, 10]


## 放回抽样(Sampling With Replacement)

In [11]:
import random 
import numpy as np

# The integers 1..10 as native Python ints. tolist() converts the NumPy scalars,
# so the sample below prints as plain numbers (e.g. [9, 1, 1, 5, 8]) rather than
# NumPy scalar reprs such as np.int64(9) on NumPy >= 2.
list1 = np.arange(1, 10 + 1).tolist()

# Randomly draw 5 elements with replacement (the same value may appear repeatedly)
print(random.choices(list1, k=5))

[9, 1, 1, 5, 8]


## Pandas 抽样

In [12]:
from sklearn import datasets
import pandas as pd

# Load the iris dataset
ds = datasets.load_iris()

# Build a single DataFrame: the feature columns plus the target as column 'y'
df = pd.DataFrame(ds.data, columns=ds.feature_names).assign(y=ds.target)

# Randomly draw 5 rows
df.sample(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),y
117,7.7,3.8,6.7,2.2,2
19,5.1,3.8,1.5,0.3,0
97,6.2,2.9,4.3,1.3,1
131,7.9,3.8,6.4,2.0,2
53,5.5,2.3,4.0,1.3,1


## Pandas 分层抽样

In [13]:
from sklearn import datasets
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd

# Load the iris dataset
ds = datasets.load_iris()

# Build a single DataFrame: the feature columns plus the target as column 'y'
df = pd.DataFrame(ds.data, columns=ds.feature_names)
df['y'] = ds.target

# Draw 6 rows, stratified on the class label 'y'
stratified = StratifiedShuffleSplit(n_splits=1, test_size=6)
x = list(stratified.split(df, df['y']))

# x holds one (train indices, test indices) pair because n_splits=1
remaining_idx, drawn_idx = x[0]

# NOTE: despite the label, this prints the shuffled indices of the rows that
# were NOT drawn (the training part), not the entire dataset.
print('重新洗牌的全部资料:', remaining_idx, sep='\n')

# Positional indices of the 6 drawn rows
print('\n抽出的索引值:', drawn_idx, sep='\n')

重新洗牌的全部资料:
[ 16  98  31 105  49  24  75 144 145 115  36  84  58  65  42  29  96  51
  85  82  79  81 131  94   1  41 138   8  67  33  90  68  39 103  25  64
 100  55 118 111  69 139 106  28  11 146 102  53 113  61  62  73  56  71
 135  89  19 129  18  15 147  26  70 143  10   9   7  21 140 127 148  14
 110  86  12  77   3 149 109 125 117 142  17  52  99   2  35  43   0 112
 122  93 101  92  63  40 128  20  66  46 132 119 141 116  72  78  57  95
 114   5  60 126  76  80 108  32  50  13  34  44  22  48 137   6 107 130
  47 104  45 134 124 120  38  54 121  30  91 136  88  74  83  23  59   4]

抽出的索引值:
[ 37 123  87  27  97 133]


In [14]:
# Positions of the drawn rows: the second array of the first (and only) split
drawn_positions = x[0][1]

print('\n抽出的资料:')
# Look the drawn rows up by position; the last expression is rendered as the cell output
df.iloc[drawn_positions]


抽出的资料:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),y
37,4.9,3.6,1.4,0.1,0
123,6.3,2.7,4.9,1.8,2
87,6.3,2.3,4.4,1.3,1
27,5.2,3.5,1.5,0.2,0
97,6.2,2.9,4.3,1.3,1
133,6.3,2.8,5.1,1.5,2


In [15]:
# Number of rows per class label in the full dataset
class_counts = df['y'].value_counts()
class_counts

2    50
1    50
0    50
Name: y, dtype: int64

## Pandas 不分层抽样

In [16]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the iris dataset
ds = datasets.load_iris()

# Build a single DataFrame: the feature columns plus the target as column 'y'
df = pd.DataFrame(data=ds.data, columns=ds.feature_names)
df['y'] = ds.target

# Randomly draw 6 rows without stratification: no `stratify` argument is passed,
# so the class mix in `test` is left to chance.
train, test = train_test_split(df, test_size=6)

print('\n抽出的资料:')
# The last expression is rendered as the cell output
test


抽出的资料:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),y
37,4.9,3.6,1.4,0.1,0
85,6.0,3.4,4.5,1.6,1
32,5.2,4.1,1.5,0.1,0
47,4.6,3.2,1.4,0.2,0
98,5.1,2.5,3.0,1.1,1
69,5.6,2.5,3.9,1.1,1
