# Amostragem

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
base_census = pd.read_csv('Bases de dados/census.csv')
print(base_census.shape)
base_census.head()

(32561, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
print(base_census.info())
base_census.describe(include='all').T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   final-weight    32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loos    32561 non-null  int64 
 12  hour-per-week   32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,32561,,,,38.5816,13.6404,17.0,28.0,37.0,48.0,90.0
workclass,32561,9.0,Private,22696.0,,,,,,,
final-weight,32561,,,,189778.0,105550.0,12285.0,117827.0,178356.0,237051.0,1484700.0
education,32561,16.0,HS-grad,10501.0,,,,,,,
education-num,32561,,,,10.0807,2.57272,1.0,9.0,10.0,12.0,16.0
marital-status,32561,7.0,Married-civ-spouse,14976.0,,,,,,,
occupation,32561,15.0,Prof-specialty,4140.0,,,,,,,
relationship,32561,6.0,Husband,13193.0,,,,,,,
race,32561,5.0,White,27816.0,,,,,,,
sex,32561,2.0,Male,21790.0,,,,,,,


## Amostragem aleatória simples

In [4]:
df_amostra_aleat_simples = base_census.sample(n=100, replace=False, random_state=1)  # replace permite que o mesmo registro nao seja selecionado mais de uma vez (já pe dado como False)
print(df_amostra_aleat_simples.shape)   # 100 registros completos da base
#list(df_amostra_aleat_simples.index)

(100, 15)


In [5]:
def amostragem_aleat_simples(dataset, amostras):
    return dataset.sample(n=amostras)

In [6]:
df_amostragem_aleat_simples = amostragem_aleat_simples(base_census, 2)
df_amostragem_aleat_simples

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
16493,23,Private,42706,Some-college,10,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,45,United-States,<=50K
6950,38,Private,107630,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K


## Amostragem sistemática

In [7]:
base_census.shape

(32561, 15)

In [8]:
# amostra de 100 pessoas
len(base_census) // 100

# selecionar um número aleatório entre 0 e 325
# o próximo será do número escolhido até o numero + 325

325

In [9]:
random.seed(1)  # retornar sempre o mesmo
random.randint(0, 325)

68

In [10]:
print(68 + 325)
random.seed(1)  
random.randint(68, 68+325)

393


136

Com a biblioteca numpy:

In [11]:
np.arange(68, len(base_census), step=325)  # retorna 100 numeros

array([   68,   393,   718,  1043,  1368,  1693,  2018,  2343,  2668,
        2993,  3318,  3643,  3968,  4293,  4618,  4943,  5268,  5593,
        5918,  6243,  6568,  6893,  7218,  7543,  7868,  8193,  8518,
        8843,  9168,  9493,  9818, 10143, 10468, 10793, 11118, 11443,
       11768, 12093, 12418, 12743, 13068, 13393, 13718, 14043, 14368,
       14693, 15018, 15343, 15668, 15993, 16318, 16643, 16968, 17293,
       17618, 17943, 18268, 18593, 18918, 19243, 19568, 19893, 20218,
       20543, 20868, 21193, 21518, 21843, 22168, 22493, 22818, 23143,
       23468, 23793, 24118, 24443, 24768, 25093, 25418, 25743, 26068,
       26393, 26718, 27043, 27368, 27693, 28018, 28343, 28668, 28993,
       29318, 29643, 29968, 30293, 30618, 30943, 31268, 31593, 31918,
       32243])

In [12]:
def amostragem_sistematica(dataset, amostras):
    intervalo = len(dataset) // amostras
    random.seed(1)
    inicio = random.randint(0, intervalo)
    indices = np.arange(inicio, len(dataset), step=intervalo)
    amostra_sist = dataset.iloc[indices]
    return amostra_sist

In [13]:
df_amostragem_sistematica = amostragem_sistematica(base_census, 100)
df_amostragem_sistematica.shape

(100, 15)

## Amostragem por Grupos

Define-se um numero de grupos e pega-se registros de cada grupo (aleatoriamente)

In [14]:
len(base_census) / 10  # numero de pessoas por grupo

3256.1

In [15]:
grupos = []
id_grupo = 0
contagem = 0

for _ in base_census.iterrows():
    grupos.append(id_grupo)
    contagem +=1
    if contagem > (len(base_census) // 10):
        contagem = 0
        id_grupo +=1

In [16]:
len(grupos)  # para cada registro

32561

In [17]:
np.unique(grupos, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([3257, 3257, 3257, 3257, 3257, 3257, 3257, 3257, 3257, 3248],
       dtype=int64))

In [18]:
base_census['grupo'] = grupos
base_census.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


In [19]:
random.randint(0, 9)  # escolhendo o grupo

9

In [20]:
df_agrupamento = base_census[base_census['grupo']==1]
df_agrupamento.shape

(3257, 16)

In [21]:
def amostragem_agrup(dataset, numero_grupos):
    intervalo = len(dataset) / numero_grupos
    grupos = []
    id_grupo = 0
    contagem = 0

    for _ in dataset.iterrows():
        grupos.append(id_grupo)
        contagem +=1
        if contagem > intervalo:
            contagem = 0
            id_grupo +=1
    
    dataset['grupo'] = grupos
    random.seed(1)
    grupo_selecionado = random.randint(0, numero_grupos)
    return dataset[dataset['grupo'] == grupo_selecionado]

In [22]:
df_amostragem_agrup = amostragem_agrup(base_census, 100)
df_amostragem_agrup.shape

(326, 16)

In [23]:
df_amostragem_agrup['grupo'].value_counts()

17    326
Name: grupo, dtype: int64

## Amostragem estratificada

In [24]:
base_census['income'].value_counts(normalize=True)

 <=50K    0.75919
 >50K     0.24081
Name: income, dtype: float64

In [25]:
split = StratifiedShuffleSplit(test_size=0.1)
for x, y in split.split(base_census, base_census['income']):
    df_x = base_census.iloc[x]
    df_y = base_census.iloc[y]

In [26]:
df_x.shape, df_y.shape # 90% e 10% da base de dados

((29304, 16), (3257, 16))

In [27]:
df_x['income'].value_counts(normalize=True)

 <=50K    0.75918
 >50K     0.24082
Name: income, dtype: float64

In [28]:
def amostragem_estratificada(dataset, percentual: float, classe: str):
    split = StratifiedShuffleSplit(test_size=percentual, random_state=1)
    
    for _, y in split.split(dataset, dataset[classe]):
        df_y = dataset.iloc[y]
        
    return df_y

In [29]:
df_amostragem_estratificada = amostragem_estratificada(base_census, 0.1, 'income')
df_amostragem_estratificada.shape

(3257, 16)

## Amostragem de reservatório

- Data stream
- Teste validado

In [30]:
stream = [i for i in range(len(base_census))]
    
#print(stream)

In [31]:
def amostragem_reservatorio(dataset, amostras):
    stream = [i for i in range(len(dataset))]
    tamanho = len(dataset)
    i = 0
    
    reservatorio = [0] * amostras
    for i in range(amostras):
        reservatorio[i] = stream[i]
    
    while i < tamanho:
        j = random.randrange(i+1)
        if j < amostras:
            reservatorio[j] = stream[i]
        i +=1
        
    return dataset.iloc[reservatorio]

In [32]:
df_amostragem_reservatorio = amostragem_reservatorio(base_census, 100)
print(df_amostragem_reservatorio.shape)
df_amostragem_reservatorio.head()

(100, 16)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
29608,41,Self-emp-inc,114580,Prof-school,15,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,2415,55,United-States,>50K,90
21696,37,Federal-gov,329088,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,Black,Male,0,0,40,United-States,<=50K,66
30676,42,Private,355728,Assoc-voc,11,Never-married,Craft-repair,Not-in-family,White,Male,0,0,44,United-States,<=50K,94
28550,43,Private,110970,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,40,United-States,>50K,87
8768,52,Federal-gov,221532,Bachelors,13,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,45,United-States,>50K,26


## Comparativo

In [33]:
base_census['age'].mean()

38.58164675532078

In [34]:
df_amostragem_aleat_simples['age'].mean()

30.5

In [35]:
df_amostragem_sistematica['age'].mean()

37.57

In [36]:
df_amostragem_agrup['age'].mean()

39.23312883435583

In [37]:
df_amostragem_estratificada['age'].mean()

38.58213079521032

In [38]:
df_amostragem_reservatorio['age'].mean()

37.85

A amostragem estratificada foi a única que deu o mesmo valor!

## EXERCICIO