In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

##### Simple Random Sampling

In [6]:
np.random.randint(1,100)

6

In [68]:
#np.random.seed(3)
np.random.randint(1,100)

57

In [10]:
df = pd.DataFrame({
    'id': range(1,101),
    'value': np.random.randint(1,100, 100)
})

In [11]:
df

Unnamed: 0,id,value
0,1,4
1,2,57
2,3,73
3,4,1
4,5,22
...,...,...
95,96,61
96,97,89
97,98,56
98,99,12


In [13]:
sample_data = df.sample(n=10, random_state=0)

In [14]:
sample_data

Unnamed: 0,id,value
26,27,52
86,87,92
2,3,73
55,56,47
75,76,34
93,94,10
16,17,15
73,74,49
54,55,38
95,96,61


In [15]:
df['value'].mean()

47.65

In [16]:
sample_data['value'].mean()

47.1

In [17]:
df['value'].std(), sample_data['value'].std()

(27.67246197061533, 24.87502093801464)

##### Simple Stratified Sampling

In [21]:
np.random.choice(['A','B','C'], 10)

array(['B', 'C', 'A', 'B', 'B', 'A', 'A', 'A', 'A', 'A'], dtype='<U1')

In [22]:
df = pd.DataFrame({
    'id': range(1,101),
    'value': np.random.randint(1,100, 100),
    'category': np.random.choice(['A','B','C'], 100)
})

In [23]:
df

Unnamed: 0,id,value,category
0,1,28,C
1,2,54,C
2,3,9,A
3,4,21,C
4,5,9,B
...,...,...,...
95,96,99,A
96,97,22,C
97,98,35,B
98,99,30,C


In [24]:
df['category'].value_counts()

category
C    36
B    33
A    31
Name: count, dtype: int64

In [25]:
gr = df.groupby('category')

In [26]:
gr.groups

{'A': [2, 7, 8, 12, 16, 17, 19, 21, 24, 26, 27, 31, 34, 41, 44, 47, 50, 54, 55, 56, 57, 59, 62, 67, 68, 71, 80, 82, 89, 95, 99], 'B': [4, 5, 6, 9, 11, 15, 20, 22, 33, 38, 39, 43, 51, 53, 58, 61, 63, 66, 69, 70, 73, 74, 75, 76, 77, 79, 81, 83, 84, 87, 91, 92, 97], 'C': [0, 1, 3, 10, 13, 14, 18, 23, 25, 28, 29, 30, 32, 35, 36, 37, 40, 42, 45, 46, 48, 49, 52, 60, 64, 65, 72, 78, 85, 86, 88, 90, 93, 94, 96, 98]}

In [27]:
gr.ngroups

3

In [33]:
def stratified_sample(df, strata_col, sample_size):
    # Group by the strata column
    grouped = df.groupby(strata_col)
    # Calculate sample size per group
    n_per_group = sample_size // grouped.ngroups
    # Perform sampling
    stratified_sample = grouped.apply(lambda x: x.sample(
        n=n_per_group, random_state=1)).reset_index(drop=True)
    return stratified_sample

In [35]:
stratified_sample = stratified_sample(df, 'category',10)

In [36]:
stratified_sample

Unnamed: 0,id,value,category
0,83,12,A
1,13,61,A
2,57,64,A
3,59,14,B
4,71,93,B
5,10,10,B
6,89,94,C
7,97,22,C
8,86,60,C


In [38]:
def propotional_stratified_sample(df, strata_col, sample_size):
    # Group by the strata column
    grouped = df.groupby(strata_col)
    # Perform propotional sampling
    stratified_sample = grouped.apply(lambda x: x.sample(
        frac = sample_size/len(df), random_state=1)).reset_index(drop=True)
    return stratified_sample

In [41]:
stratified_sample = propotional_stratified_sample(df, 'category',10)

In [42]:
stratified_sample

Unnamed: 0,id,value,category
0,83,12,A
1,13,61,A
2,57,64,A
3,59,14,B
4,71,93,B
5,10,10,B
6,89,94,C
7,97,22,C
8,86,60,C
9,11,27,C


##### Cluster Sampling

In [44]:
df = pd.DataFrame({
    'id': range(1,101),
    'value': np.random.randint(1,100, 100),
    'cluster': np.random.choice(['cluster1','cluster2','cluster3',
                                 'cluster4','cluster5'], 100)
})

In [45]:
df

Unnamed: 0,id,value,cluster
0,1,61,cluster5
1,2,35,cluster4
2,3,66,cluster2
3,4,39,cluster2
4,5,51,cluster1
...,...,...,...
95,96,39,cluster5
96,97,84,cluster5
97,98,97,cluster1
98,99,84,cluster1


In [46]:
def cluster_sample(df, cluster_col, num_clusters):
    # Get unique clusters
    clusters = df[cluster_col].unique()
    # Randomly select the desired number of clusters
    selected_clusters = np.random.choice(clusters, num_clusters, replace=False)
    # Select all rows from the chosen clusters
    cluster_sampled_data = df[df[cluster_col].isin(selected_clusters)]
    return cluster_sampled_data

In [51]:
cl = cluster_sample(df, 'cluster', 2)

In [52]:
cl

Unnamed: 0,id,value,cluster
1,2,35,cluster4
2,3,66,cluster2
3,4,39,cluster2
5,6,60,cluster2
6,7,19,cluster4
9,10,22,cluster2
12,13,24,cluster4
15,16,37,cluster4
20,21,99,cluster4
21,22,52,cluster4


##### Systematic Sampling

In [54]:
df = pd.DataFrame({
    'id': range(1,101),
    'value': np.random.randint(1,100, 100)
})

In [55]:
def systematic_sample(df, sample_size):
    # Calculate the sampling interval
    k = len(df) // sample_size
    # Generate a random start
    start = np.random.randint(0, k)
    # Select every k-th element starting from the random start
    systematic_sample = df.iloc[start::k].reset_index(drop=True)
    return systematic_sample

In [64]:
sample = systematic_sample(df, 15)

In [65]:
sample

Unnamed: 0,id,value
0,3,51
1,9,83
2,15,2
3,21,97
4,27,30
5,33,6
6,39,37
7,45,34
8,51,65
9,57,60
