In [1]:
import pandas as pd
import numpy as np
import random
import os

In [2]:
FILE_PATH = os.path.join('..', 'datasets', 'econdata.csv')

In [3]:
econdata = pd.read_csv(FILE_PATH)

### Simple Random Sampling 

In [4]:
aleat_8 = econdata.sample(n=8)
aleat_8

Unnamed: 0,id,geo_point_2d,geo_shape,clave_cat,delegacion,perimetro,tipo,nom_id
166,166,"19.4394422606,-99.1501035229","{""type"": ""MultiPoint"", ""coordinates"": [[-99.15...",012_153_08,Cuautémoc,B,Hotel,Polly
131,131,"19.4389531509,-99.1479144659","{""type"": ""MultiPoint"", ""coordinates"": [[-99.14...",003_080_06,Cuautémoc,A,Museo,Panteón de San Fernando
202,202,"19.43916542,-99.1331749","{""type"": ""MultiPoint"", ""coordinates"": [[-99.13...",004_081_23,Cuautémoc,A,Hotel,Río de Janeiro
209,209,"19.4236610108,-99.1437572917","{""type"": ""MultiPoint"", ""coordinates"": [[-99.14...",002_087_02,Cuautémoc,B,Hotel,Catalina
69,69,"19.43558625,-99.12965746","{""type"": ""MultiPoint"", ""coordinates"": [[-99.12...",005_129_08,Cuautémoc,A,Hotel,Templo Mayor
223,223,"19.4285106481,-99.1367967407","{""type"": ""MultiPoint"", ""coordinates"": [[-99.13...",001_055_10,Cuautémoc,A,Hotel,Niza
36,36,"19.4425777264,-99.1292760518","{""type"": ""Polygon"", ""coordinates"": [[[-99.1288...",005_077_01,Cuauhtémoc,B,Mercado,Granaditas
225,225,"19.43094655,-99.12455418","{""type"": ""MultiPoint"", ""coordinates"": [[-99.12...",323_027_04,Venustiano Carranza,B,Hotel,Gran Veracruz


In [5]:
prop_25 = econdata.sample(frac=0.25)
prop_25.shape

(58, 8)

### Systematic Sampling

In [6]:
def systematic_sampling(econdata, step):
    indexes = np.arange(0, len(econdata), step=step)
    systematic_sample = econdata.iloc[indexes]
    return systematic_sample

In [7]:
systematic_sample = systematic_sampling(econdata, 3)
systematic_sample

Unnamed: 0,id,geo_point_2d,geo_shape,clave_cat,delegacion,perimetro,tipo,nom_id
0,0,"19.424781053,-99.1327537959","{""type"": ""Polygon"", ""coordinates"": [[[-99.1332...",307_130_11,Cuauhtémoc,B,Mercado,Pino Suárez
3,3,"19.42489472,-99.12073393","{""type"": ""MultiPoint"", ""coordinates"": [[-99.12...",323_102_06,Venustiano Carranza,B,Hotel,Balbuena
6,6,"19.43553422,-99.12324801","{""type"": ""MultiPoint"", ""coordinates"": [[-99.12...",318_116_11,Venustiano Carranza,B,Hotel,San Antonio Tomatlan
9,9,"19.4407152937,-99.1498060057","{""type"": ""MultiPoint"", ""coordinates"": [[-99.14...",012_146_22,Cuautémoc,B,Hotel,Detroit
12,12,"19.43990186,-99.14813347","{""type"": ""MultiPoint"", ""coordinates"": [[-99.14...",003_079_16,Cuautémoc,B,Hotel,La Paz
...,...,...,...,...,...,...,...,...
216,216,"19.4247697438,-99.1249707246","{""type"": ""Polygon"", ""coordinates"": [[[-99.1250...",323_118_37,Venustiano Carranza,B,Mercado,Lamininas de La Merced
219,219,"19.4234096295,-99.1438351156","{""type"": ""MultiPoint"", ""coordinates"": [[-99.14...",002_118_03,Cuautémoc,B,Hotel,Prado Floresta
222,222,"19.4397905042,-99.134628735","{""type"": ""MultiPoint"", ""coordinates"": [[-99.13...",004_063_10,Cuautémoc,A,Hotel,Ladero
225,225,"19.43094655,-99.12455418","{""type"": ""MultiPoint"", ""coordinates"": [[-99.12...",323_027_04,Venustiano Carranza,B,Hotel,Gran Veracruz


### Stratified Sampling

In [9]:
econdata['stratified'] = econdata['delegacion'] + ',' + econdata['tipo']
((econdata['stratified'].value_counts()/len(econdata)) * 100).sort_values(ascending=False)

Cuautémoc,Hotel                64.347826
Cuautémoc,Museo                15.652174
Venustiano Carranza,Hotel       7.826087
Cuauhtémoc,Mercado              7.391304
Venustiano Carranza,Mercado     4.782609
Name: stratified, dtype: float64

Our data states that the proportion is the following:

- Hoteles in Cuauhtemoc 0.5
- Museos in Cuauhtemoc 0.2
- Hoteles in Venustiana Carranza 0.1
- Mercados in Cuauhtemoc 0.1
- Mercados in Venustiana Carranza 0.1

In [12]:
def stratified_data(data, column_names, strat_values, strat_prop, random_state=None):
    df_strat = pd.DataFrame(columns=data.columns)
    position = -1
    
    for i in range(len(strat_values)):
        position += 1
        if position == len(strat_values) - 1:
            ratio_len = len(data) - len(df_strat)
        else:
            ratio_len = int(len(data) * strat_prop[i])
    
    df_filtered = data[data[column_names] == strat_values[i]]
    df_tmp = df_filtered.sample(replace=True, n=ratio_len, random_state=random_state)

    df_strat = pd.concat([df_strat, df_tmp])

    return df_strat

In [13]:
stratified_values = [
    "Cuautémoc,Hotel",
    "Cuautémoc,Museo",
    "Venustiano Carranza,Hotel",
    "Cuauhtémoc,Mercado",
    "Venustiano Carranza,Mercado",
]
prop_strat = [0.5, 0.2, 0.1, 0.1, 0.1]

df_strat = stratified_data(econdata, 'stratified', stratified_values, prop_strat)

In [14]:
df_strat.head()

Unnamed: 0,id,geo_point_2d,geo_shape,clave_cat,delegacion,perimetro,tipo,nom_id,stratified
216,216,"19.4247697438,-99.1249707246","{""type"": ""Polygon"", ""coordinates"": [[[-99.1250...",323_118_37,Venustiano Carranza,B,Mercado,Lamininas de La Merced,"Venustiano Carranza,Mercado"
216,216,"19.4247697438,-99.1249707246","{""type"": ""Polygon"", ""coordinates"": [[[-99.1250...",323_118_37,Venustiano Carranza,B,Mercado,Lamininas de La Merced,"Venustiano Carranza,Mercado"
156,156,"19.4255480371,-99.1249308096","{""type"": ""Polygon"", ""coordinates"": [[[-99.1253...",323_138_04 (3),Venustiano Carranza,B,Mercado,Mariscos,"Venustiano Carranza,Mercado"
163,163,"19.4265454033,-99.1224859032","{""type"": ""Polygon"", ""coordinates"": [[[-99.1231...",323_063_05,Venustiano Carranza,B,Mercado,,"Venustiano Carranza,Mercado"
216,216,"19.4247697438,-99.1249707246","{""type"": ""Polygon"", ""coordinates"": [[[-99.1250...",323_118_37,Venustiano Carranza,B,Mercado,Lamininas de La Merced,"Venustiano Carranza,Mercado"
