# 抽样处理示例
## 常用的抽样方法
1）简单随机抽样；

2）等距抽样；

3）分层抽样：将总体按照某些特征划分为若干类别（层），然后在各层中分别采用简单随机抽样或等距抽样抽取个体，组成样本；

4）整群抽样：将总体划分为若干个小群体，随机选取其中几个小群体，并以被选中群体内的全部个体作为样本来代表总体。

In [1]:
import numpy as np
import random

In [11]:
# Load the whitespace-delimited numeric table and keep two views of it:
# the NumPy array (`origin_data`) and a plain list of rows (`data_list`),
# the latter being what the `random.sample` calls below operate on.
origin_data = np.loadtxt('./data/data_sample_processing.txt')
data_list = origin_data.tolist()

# Sanity checks: array shape, number of rows, and one example row.
print(origin_data.shape)
print(len(data_list))
print(data_list[1])

(10000, 5)
10000
[-4.0330736206686115, 8.586240064022087, 3.897471999108929, 1.2698258822333246, -9.01924784722751]


In [4]:
# Print the built-in documentation of random.sample, which picks k unique
# elements from a population and leaves the population unchanged.
help(random.sample)

Help on method sample in module random:

sample(population, k) method of random.Random instance
    Chooses k unique random elements from a population sequence or set.
    
    Returns a new list containing elements from the population while
    leaving the original population unchanged.  The resulting list is
    in selection order so that all sub-slices will also be valid random
    samples.  This allows raffle winners (the sample) to be partitioned
    into grand prize and second place winners (the subslices).
    
    Members of the population need not be hashable or unique.  If the
    population contains repeats, then each occurrence is a possible
    selection in the sample.
    
    To choose a sample in a range of integers, use range as an argument.
    This is especially fast and space efficient for sampling from a
    large population:   sample(range(10000000), 60)



In [13]:
# Simple random sampling: draw 2000 distinct rows without replacement.
data_random_sample = random.sample(data_list, k=2000)
# Confirm the number of rows actually drawn.
print(len(data_random_sample))

2000


In [16]:
# Systematic (equal-interval) sampling: walk the records with a fixed step
# of `width` positions and keep the record found at each step.
sample_count = 2000
record_count = len(data_list)
# Reject sizes that cannot be honoured instead of dividing by zero or
# silently returning duplicated records; random.sample likewise raises
# ValueError when asked for more items than the population holds.
if not 0 < sample_count <= record_count:
    raise ValueError(
        'sample_count must be between 1 and the number of records'
    )
# Sampling interval (may be fractional when the counts do not divide evenly).
width = record_count / sample_count
# i * record_count // sample_count is floor(i * width) computed with exact
# integer arithmetic, so no float rounding can shift an index. Iterating
# over range(sample_count) yields exactly sample_count records and keeps
# every index below record_count.
data_equal_sample = [
    data_list[i * record_count // sample_count] for i in range(sample_count)
]
print(len(data_equal_sample))

2000


In [24]:
# Stratified sampling: split the rows into strata by the label stored in
# the last column, then draw a simple random sample from each stratum.
data_with_label = np.loadtxt('./data/data_imbalance.txt')
# Target number of rows to draw from each stratum.
each_sample_count = 200
label_data_unique = np.unique(data_with_label[:, -1])
# Rows of the stratum currently being processed.
sample_list = []
# Final sampled rows from all strata combined.
sample_data = []
# Maps each label to the rows sampled from its stratum.
sample_dict = {}
for label_data in label_data_unique:
    # Build the stratum from scratch for every label. Accumulating rows
    # across iterations would let later labels be sampled from rows that
    # belong to earlier labels.
    sample_list = data_with_label[data_with_label[:, -1] == label_data].tolist()
    # A stratum may hold fewer rows than requested; cap the draw at the
    # stratum size because random.sample raises ValueError otherwise.
    stratum_sample_count = min(each_sample_count, len(sample_list))
    each_sample_data = random.sample(sample_list, stratum_sample_count)
    sample_dict[label_data] = each_sample_data
    sample_data.extend(each_sample_data)
print(sample_dict)

{0.0: [[0.0, -1.0, -2.0, 2.0, -1.0, 0.0], [0.0, 0.0, 1.0, -1.0, -1.0, 0.0], [0.0, 1.0, 2.0, -2.0, 1.0, 0.0], [0.0, 1.0, 1.0, -1.0, 1.0, 0.0], [0.0, -1.0, 1.0, -1.0, -2.0, 0.0], [1.0, 1.0, -1.0, 1.0, 2.0, 0.0], [-2.0, 0.0, 1.0, -1.0, -1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [-1.0, 0.0, 1.0, -1.0, -1.0, 0.0], [0.0, 1.0, 1.0, -1.0, 1.0, 0.0], [0.0, -1.0, 1.0, -1.0, -1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [2.0, 1.0, 2.0, -2.0, 1.0, 0.0], [0.0, 1.0, 2.0, -2.0, 1.0, 0.0], [0.0, -1.0, 1.0, -1.0, -1.0, 0.0], [-1.0, 1.0, -1.0, 1.0, 1.0, 0.0], [2.0, 1.0, 1.0, -1.0, 1.0, 0.0], [0.0, 1.0, 2.0, -2.0, 1.0, 0.0], [1.0, 0.0, 1.0, -1.0, -1.0, 0.0], [0.0, -1.0, 1.0, -1.0, -1.0, 0.0], [1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [0.0, -1.0, 1.0, -1.0, -2.0, 0.0], [0.0, 0.0, 1.0, -1.0, -1.0, 0.0], [0.0, 0.0, 1.0, -1.0, -1.0, 0.0], [0.0, 0.0, 1.0, -1.0, -1.0, 0.0], [-1.0, 1.0, 1.0, -1.0, 1.0, 0.0], [0.0, 0.0, 1.0, -1.0, -1.0, 0.0], [0.0, 0.0, 1.0, -1.0, -1.0, 0.0], [0.0, 1.0, 0.0, 0.0, 1.0, 0.0], [-2.0, -1.0, 1.

In [32]:
# Cluster sampling: treat each distinct value of the last column as a
# cluster, pick two clusters at random and keep every row belonging to them.
origin_data = np.loadtxt('./data/data_sample_processing_1.txt')
label_data_unique = np.unique(origin_data[:, -1]).tolist()
print(type(label_data_unique))
# Labels of the two randomly chosen clusters.
sample_label = random.sample(label_data_unique, 2)
# Keep the rows whose label is one of the chosen clusters, in file order.
data_cluster_sample = [
    row.tolist() for row in origin_data if row[-1] in sample_label
]

print(data_cluster_sample)

<class 'list'>
[[-1.4703337402272054, 0.686988332719648, 0.3715240067696685, 1.0501597659918624, 0.5507503562967921, 2.0], [-1.2800628667664489, -0.4009069321933828, -0.37182889740783803, 1.323295998059595, 0.14438850855781304, 2.0], [-1.3088346252748462, -0.7371268979501195, -0.3053647614130207, 0.9140263209904719, -0.5202690621668642, 2.0], [0.9300210143934775, -0.8059332799801517, -0.07148352745787079, -1.3113855317964394, -1.2514428712937418, 0.0], [0.430728955388364, -1.9450543411921912, -1.507191075038501, 0.75616712781897, -0.4055062860667502, 2.0], [0.7064760561590295, -0.6581463445554587, 0.23829828282387933, -1.5920497295405545, -1.5648090721817653, 0.0], [-0.39810069994877906, -0.6491115800796431, -0.6949778012381655, 1.032217361610982, 0.26442902304047056, 2.0], [-2.056547948476653, -0.5139760348369573, 0.07918423202230068, 0.9629235503824591, -0.805442335114946, 2.0], [-0.7114180252582694, 1.0002717915165862, 2.3505111547139133, -2.9961169751787917, -2.6566008237661602, 0.