# 抽樣(Sampling)

## 簡單抽樣

In [63]:
import random 
import numpy as np

# Population: the integers 1..10 as native Python ints.
# .tolist() converts the NumPy scalars to plain ints, so the printed sample
# reads [1, 6, 2, ...] on every NumPy version (NumPy >= 2 would otherwise
# print each element as np.int64(...)).
list1 = np.arange(1, 10 + 1).tolist()

# Draw 5 distinct elements (sampling without replacement).
print(random.sample(list1, 5))

[1, 6, 2, 8, 5]


## 放回抽樣(Sampling With Replacement)

In [67]:
import random 
import numpy as np

# Population: the integers 1..10 as native Python ints.
# .tolist() converts the NumPy scalars to plain ints, so the printed sample
# reads [8, 4, 7, ...] on every NumPy version (NumPy >= 2 would otherwise
# print each element as np.int64(...)).
list1 = np.arange(1, 10 + 1).tolist()

# Draw 5 elements with replacement: the same value may appear more than once.
print(random.choices(list1, k=5))

[8, 4, 7, 4, 6]


## Pandas 抽樣

In [88]:
from sklearn import datasets
import pandas as pd

# Load the iris dataset bundled with scikit-learn.
ds = datasets.load_iris()

# Build a single DataFrame: the feature columns plus the target in column 'y'.
df = pd.DataFrame(ds.data, columns=ds.feature_names).assign(y=ds.target)

# Draw 5 random rows; as the cell's last expression the result is displayed.
df.sample(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),y
102,7.1,3.0,5.9,2.1,2
40,5.0,3.5,1.3,0.3,0
32,5.2,4.1,1.5,0.1,0
48,5.3,3.7,1.5,0.2,0
77,6.7,3.0,5.0,1.7,1


## 分層抽樣(Stratified Sampling):以 scikit-learn 抽出索引,再用 Pandas 取出資料

In [85]:
from sklearn import datasets
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd

# Load the iris dataset bundled with scikit-learn.
ds = datasets.load_iris()

# Build a single DataFrame: the feature columns plus the target in column 'y'.
df = pd.DataFrame(ds.data, columns=ds.feature_names).assign(y=ds.target)

# One stratified split that sets aside 6 rows, stratified on the label column 'y'.
stratified = StratifiedShuffleSplit(n_splits=1, test_size=6)

# x is a one-element list holding a pair of index arrays:
# x[0][0] = the shuffled remaining rows, x[0][1] = the 6 sampled rows.
x = list(stratified.split(df, df['y']))

print('重新洗牌的全部資料:', x[0][0], sep='\n')

print('\n抽出的索引值:', x[0][1], sep='\n')

重新洗牌的全部資料:
[ 38 115 136  80 111  94   1   0  48 100 108 104   8  51 131  78   9 142
 112  11 126  79  95   2  46 128 125  65  55  10  72 145 130  56 138  96
  88  19   7  43   4  82  32  91 127  87 133  73  85  62 129  42  57  84
  40 105  49  75 113 147  99  27   6 135  58  35  26 124  92  70  69 139
  66 101  74  60 110  15  39  59   3  53  89 107  61 143 118  86  71  98
  50  41  34  12 149  77  23  21 117 121  97  54 119  64 120  45  81 141
 122 114  20 144 134 132  17  24  13  22  44 123  31 116  76  18  47 137
  63  83  29  25  36 102  28  37  33  93 148  14 146  16 103  68  90 140]

抽出的索引值:
[ 52   5  30 109 106  67]


In [87]:
# Look up the rows at the sampled positions produced by the stratified split
# (x[0][1] is the array of sampled row positions).
sampled_positions = x[0][1]
print('\n抽出的資料:')
df.iloc[sampled_positions]


抽出的資料:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),y
52,6.9,3.1,4.9,1.5,1
5,5.4,3.9,1.7,0.4,0
30,4.8,3.1,1.6,0.2,0
109,7.2,3.6,6.1,2.5,2
106,4.9,2.5,4.5,1.7,2
67,5.8,2.7,4.1,1.0,1


In [94]:
# Count how many rows belong to each class label in column 'y'.
df.loc[:, 'y'].value_counts()

2    50
1    50
0    50
Name: y, dtype: int64

## 不分層抽樣:以 scikit-learn 的 train_test_split 隨機切分 Pandas 資料

In [98]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the iris dataset bundled with scikit-learn.
ds = datasets.load_iris()

# Build a single DataFrame: the feature columns plus the target in column 'y'.
df = pd.DataFrame(data=ds.data, columns=ds.feature_names)
df['y'] = ds.target

# Randomly set aside 6 rows WITHOUT stratification: no `stratify` argument is
# passed, so the class labels are not taken into account when choosing the
# 6 rows. This cell is self-contained and does not depend on objects created
# by the stratified-sampling cell.
train, test = train_test_split(df, test_size=6)

print('\n抽出的資料:')
print(test)


抽出的資料:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),y
141,6.9,3.1,5.1,2.3,2
147,6.5,3.0,5.2,2.0,2
97,6.2,2.9,4.3,1.3,1
80,5.5,2.4,3.8,1.1,1
22,4.6,3.6,1.0,0.2,0
61,5.9,3.0,4.2,1.5,1
