In [34]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedShuffleSplit

## Loading dataset

In [35]:
df = pd.read_csv('../Bases de dados/census.csv')
df.shape

(32561, 15)

In [36]:
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [37]:
df.tail()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


## Simple random sampling

In [38]:
df_simple_random_sample = df.sample(n=100, random_state=42)
df_simple_random_sample.shape

(100, 15)

In [39]:
df_simple_random_sample.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
14160,27,Private,160178,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,38,United-States,<=50K
27048,45,State-gov,50567,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
28868,29,Private,185908,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,55,United-States,>50K
5667,30,Private,190040,Bachelors,13,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K
7827,29,Self-emp-not-inc,189346,Some-college,10,Divorced,Craft-repair,Not-in-family,White,Male,2202,0,50,United-States,<=50K


In [40]:
def simple_random_sampling(df:pd.DataFrame, n_sample:int)->pd.DataFrame:
    return df.sample(n=n_sample, random_state=42)

simple_random_sampling(df=df, n_sample=100)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
14160,27,Private,160178,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,38,United-States,<=50K
27048,45,State-gov,50567,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
28868,29,Private,185908,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,55,United-States,>50K
5667,30,Private,190040,Bachelors,13,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K
7827,29,Self-emp-not-inc,189346,Some-college,10,Divorced,Craft-repair,Not-in-family,White,Male,2202,0,50,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7251,18,Private,236262,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,12,United-States,<=50K
6039,55,?,103654,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,20,United-States,<=50K
196,36,Private,99374,Some-college,10,Divorced,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
5803,27,?,224421,Some-college,10,Divorced,?,Own-child,White,Male,0,0,40,United-States,<=50K


## Sistematic sampling

In [41]:
def sistematic_sampling(df:pd.DataFrame, n_sample:int)->pd.DataFrame:
    random.seed(42)
    population = len(df)
    interval = population // n_sample
    start_index = random.randint(0, interval)
    sistematic_sample_indexes = np.arange(start_index, population, step=interval)
    return df.iloc[sistematic_sample_indexes]

In [42]:
df_sistematic_sample = sistematic_sampling(df=df, n_sample=100)
df_sistematic_sample.shape

(101, 15)

In [43]:
df_sistematic_sample.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
57,35,Private,56352,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,Puerto-Rico,<=50K
382,25,Private,344991,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
707,45,Private,242552,Some-college,10,Never-married,Sales,Not-in-family,Black,Male,0,0,40,United-States,<=50K
1032,27,Private,389713,Some-college,10,Never-married,Sales,Not-in-family,White,Male,0,0,40,United-States,<=50K
1357,31,Private,42900,Bachelors,13,Never-married,Tech-support,Not-in-family,White,Male,0,0,37,United-States,<=50K


## Grouped sampling

In [108]:
def grouped_sampling(df:pd.DataFrame, n_sample:int)->pd.DataFrame:
    population = len(df)
    n_groups = population // n_sample
    interval = population // n_groups
    
    groups = []
    id_group = 0
    count = 0

    for _ in df.iterrows():
        groups.append(id_group)
        if count > interval:
            count = 0
            id_group += 1
        count += 1

    random.seed(42)
    df_grouped_sample = df.copy()
    df_grouped_sample['groups'] = groups
    random_selected_group = random.randint(0, n_groups - 1)
    grouped_sample_indexes = df_grouped_sample[df_grouped_sample['groups'] == random_selected_group].index
    return df.iloc[grouped_sample_indexes]

In [45]:
df_grouped_sample = grouped_sampling(df=df, n_sample=100)
df_grouped_sample.shape

(101, 15)

In [46]:
df_grouped_sample.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
5758,20,Private,165804,Some-college,10,Never-married,Adm-clerical,Own-child,Other,Female,0,0,40,United-States,<=50K
5759,46,Private,318259,Assoc-voc,11,Divorced,Tech-support,Other-relative,White,Female,0,0,36,United-States,<=50K
5760,21,Private,117606,Some-college,10,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K
5761,37,Private,170718,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K
5762,42,Private,413297,Some-college,10,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,<=50K


## Stratified sampling

In [47]:
df['income'].value_counts()

income
 <=50K    24720
 >50K      7841
Name: count, dtype: int64

In [48]:
print(f'{24720 / len(df)} %, {7841 / len(df)} %')

0.7591904425539756 %, 0.2408095574460244 %


In [49]:
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [65]:
def stratified_sampling(df:pd.DataFrame, n_sample:int, column:str)->pd.DataFrame:
    population = len(df)
    sample_size = n_sample / population
    stratified_sampling = StratifiedShuffleSplit(test_size=sample_size)
    stratified_sample = stratified_sampling.split(X=df, y=df[column])
    for _, y in stratified_sample:
        df_stratified_sample = df.iloc[y]
    return df_stratified_sample

In [66]:
df_stratified_sample = stratified_sampling(df=df, n_sample=100, column='income')
df_stratified_sample.shape

(100, 15)

In [67]:
df_stratified_sample.income.value_counts()

income
 <=50K    76
 >50K     24
Name: count, dtype: int64

## Reservoir sampling

In [54]:
def reservoir_sampling(df:pd.DataFrame, n_sample:int)->pd.DataFrame:
    # it gets the dataset indexes
    stream = [i for i in range(len(df))]

    i = 0
    population = len(df)

    # Inicializing the reservoir list
    reservoir = [stream[i] for i in range(n_sample)]

    random.seed(42)
    while i < population:
        j = random.randrange(i + 1)
        if j < n_sample:
            reservoir[j] = stream[i]
        i += 1

    return df.iloc[reservoir]
    

In [55]:
df_reservoir_sample = reservoir_sampling(df=df, n_sample=100)
df_reservoir_sample.shape

(100, 15)

In [56]:
df_reservoir_sample.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
18151,41,State-gov,108945,HS-grad,9,Never-married,Transport-moving,Not-in-family,White,Female,14344,0,40,United-States,>50K
22771,40,Self-emp-inc,209344,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,15,?,<=50K
29551,33,Private,288840,HS-grad,9,Married-spouse-absent,Other-service,Unmarried,Black,Female,0,0,38,United-States,<=50K
19794,32,Self-emp-not-inc,129497,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,55,United-States,>50K
30278,38,Private,219902,HS-grad,9,Separated,Transport-moving,Unmarried,Black,Female,0,0,30,United-States,<=50K


## Testing sampling methods

Defining evaluate sampling function over age column

In [124]:
def evaluate_sampling_method(df:pd.DataFrame, sampling_func, column:str, stratified_column:str=None)->float:
    df_sampling_method = sampling_func(df=df, n_sample=100) if stratified_column == None else sampling_func(df=df, n_sample=100, column=stratified_column)

    overall_mean = df['age'].mean()
    sampling_method_mean = df_sampling_method[column].mean()

    error = abs(overall_mean - sampling_method_mean)

    return error

Evaluating simple random sampling

In [119]:
evaluate_sampling_method(df=df, column='age', sampling_func=simple_random_sampling)

0.26164675532078263

Evaluating sistematic sampling

In [120]:
evaluate_sampling_method(df=df, column='age', sampling_func=sistematic_sampling)

0.9182804186871181

Evaluating grouped sampling

In [121]:
evaluate_sampling_method(df=df, column='age', sampling_func=grouped_sampling)

1.1658051711623685

Evaluating stratified sampling

In [122]:
evaluate_sampling_method(df=df, column='age', sampling_func=stratified_sampling, stratified_column='income')

2.0083532446792205

Evaluating reservoir sampling

In [123]:
evaluate_sampling_method(df=df, column='age', sampling_func=reservoir_sampling)

4.148353244679214

## Testing on another database

In [63]:
df_credit = pd.read_csv('../Bases de dados/credit_data.csv')
df_credit.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [80]:
df_credit.tail()

Unnamed: 0,i#clientid,income,age,loan,c#default
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0
1999,2000,69436.579552,56.152617,7378.833599,0


## Simple random sampling

In [82]:
df_credit_simple_random_sample = simple_random_sampling(df=df_credit, n_sample=1000)
df_credit_simple_random_sample.shape

(1000, 5)

In [83]:
df_credit_simple_random_sample.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
1860,1861,40240.727556,26.959005,7498.630447,1
353,354,46706.458861,18.830336,7084.263509,1
1333,1334,51211.654039,45.628568,4093.360006,0
905,906,67675.804771,37.740396,4396.076877,0
1289,1290,36965.742479,53.762359,6333.391588,0


## Sistematic sampling

In [92]:
df_credit_sistematic_sample = sistematic_sampling(df=df_credit, n_sample=1000)
df_credit_sistematic_sample.shape

(999, 5)

In [85]:
df_credit_sistematic_sample.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
2,3,57317.170063,63.108049,8020.953296,0
4,5,66952.688845,18.584336,8770.099235,1
6,7,48430.359613,26.809132,5722.581981,0
8,9,40654.892537,55.496853,4755.82528,0
10,11,64131.415372,25.679575,4351.028971,0


## Grouped sampling

In [109]:
df_credit_grouped_sample = grouped_sampling(df=df_credit, n_sample=1000)
df_credit_grouped_sample.shape

(1002, 5)

In [110]:
df_credit_grouped_sample.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


## Stradified sample

In [111]:
df_stradified_sample = stratified_sampling(df=df_credit, n_sample=1000, column='c#default')
df_stradified_sample.shape

(1000, 5)

In [112]:
df_stradified_sample.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
1083,1084,62724.636116,39.835377,9255.137755,0
521,522,25789.209798,26.49417,2410.277414,0
876,877,46283.067465,48.288585,2166.1231,0
1854,1855,54957.449674,59.506367,6976.463275,0
1164,1165,64162.649615,53.855563,6938.01252,0


In [114]:
df_credit['c#default'].value_counts()

c#default
0    1717
1     283
Name: count, dtype: int64

In [113]:
df_stradified_sample['c#default'].value_counts()

c#default
0    859
1    141
Name: count, dtype: int64

## Reservoir sampling

In [115]:
df_credit_reservoir_sample = reservoir_sampling(df=df_credit, n_sample=1000)
df_credit_reservoir_sample.shape

(1000, 5)

In [116]:
df_credit_reservoir_sample.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
559,560,53741.371019,49.729433,6513.150125,0
536,537,51121.656869,58.527181,6471.628202,0
1077,1078,58910.291775,31.013962,3671.923094,0
3,4,42709.534201,45.751972,6103.64226,0
428,429,62678.645451,25.839394,333.441398,0


## Testing sampling methods

In [125]:
for col in ['income', 'age', 'loan']:
    res = evaluate_sampling_method(df=df_credit, sampling_func=simple_random_sampling, column=col)
    print(f'Error in col {col} : {res}')

Error in col income : 45950.165255470245
Error in col age : 2.4007753395162936
Error in col loan : 4778.2516680482595


In [126]:
for col in ['income', 'age', 'loan']:
    res = evaluate_sampling_method(df=df_credit, sampling_func=grouped_sampling, column=col)
    print(f'Error in col {col} : {res}')

Error in col income : 44956.546356548955
Error in col age : 2.2955330226487405
Error in col loan : 4348.069821398954


In [128]:
for col in ['income', 'age', 'loan']:
    res = evaluate_sampling_method(df=df_credit, sampling_func=stratified_sampling, column=col, stratified_column='c#default')
    print(f'Error in col {col} : {res}')

Error in col income : 44901.149369319945
Error in col age : 2.606345031543917
Error in col loan : 4590.096428376535


In [129]:
for col in ['income', 'age', 'loan']:
    res = evaluate_sampling_method(df=df_credit, sampling_func=reservoir_sampling, column=col)
    print(f'Error in col {col} : {res}')

Error in col income : 44166.10318772499
Error in col age : 2.445209433369861
Error in col loan : 4557.352212362099
