# Sampling Assignment
Implementing Probability Sampling Methods in Python

## Instructions
Upload your dataset (minimum 200 rows), then complete all parts A–F.


In [4]:
import pandas as pd
import numpy as np

# Read the assignment dataset from a CSV file located relative to the
# notebook's working directory, then preview it as the cell's output.
# NOTE(review): the preview shows an 'Unnamed: 0' column - presumably a row
# index that was saved into the CSV; confirm before treating it as a feature.
csv_path = 'dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Gender,Age,NS1,IgG,IgM,Area,AreaType,HouseType,District,Outcome
0,Female,45,0,0,0,Mirpur,Undeveloped,Building,Dhaka,0
1,Male,17,0,0,1,Chawkbazar,Developed,Building,Dhaka,0
2,Female,29,0,0,0,Paltan,Undeveloped,Other,Dhaka,0
3,Female,63,1,1,0,Motijheel,Developed,Other,Dhaka,1
4,Male,22,0,0,0,Gendaria,Undeveloped,Building,Dhaka,0


## Part A — Setup
- Report dataset size (rows, columns)

In [5]:
# Part A: report the dataset dimensions as a (rows, columns) tuple.
dataset_shape = df.shape
print("Dataset size:", dataset_shape)

Dataset size: (1000, 10)


## Part B — Simple Random Sampling

In [7]:
# Part B: simple random sampling.
# Compute everything first, then report, so the numbers and the preview
# come from the same draw.
sample_size = 50

# Full-data mean of 'Age'; this is the benchmark the sample mean is compared to.
population_mean = df['Age'].mean()

# Draw `sample_size` rows; the fixed random_state makes the draw repeatable.
srs = df.sample(n=sample_size, random_state=42)
srs_mean = srs['Age'].mean()

display(srs.head())
print("Population mean:", population_mean)
print("Sample mean:", srs_mean)

Unnamed: 0,Gender,Age,NS1,IgG,IgM,Area,AreaType,HouseType,District,Outcome
521,Male,23,1,1,0,Kamrangirchar,Developed,Tinshed,Dhaka,1
737,Male,56,1,1,0,Kalabagan,Developed,Building,Dhaka,1
740,Female,32,0,0,1,Shahbagh,Undeveloped,Tinshed,Dhaka,0
660,Male,18,0,0,0,New Market,Undeveloped,Building,Dhaka,0
411,Male,24,0,0,1,Hazaribagh,Developed,Other,Dhaka,0


Population mean: 35.924
Sample mean: 36.64


## Part C — Systematic Sampling

In [8]:
# Part C: systematic sampling - pick a random starting row, then take every
# k-th row after it until n rows have been collected.
n = 50                # target sample size
k = len(df) // n      # sampling interval between selected rows
if k < 1:
    # With fewer rows than the requested sample size the interval would be 0,
    # which makes both the random start and the slice step invalid.
    raise ValueError(
        f"Systematic sampling needs at least n={n} rows, got {len(df)}"
    )

# Use a locally seeded generator so the random start (and therefore the whole
# sample) is identical on every Restart & Run All, like the seeded cells.
start_rng = np.random.default_rng(42)
start = int(start_rng.integers(0, k))  # start position in [0, k)

# Positional slicing: rows start, start+k, start+2k, ... capped at n rows.
sys_sample = df.iloc[start::k].head(n)
display(sys_sample.head())
sys_mean = sys_sample['Age'].mean()
print("Sample mean:", sys_mean)

Unnamed: 0,Gender,Age,NS1,IgG,IgM,Area,AreaType,HouseType,District,Outcome
7,Male,26,0,0,0,New Market,Developed,Other,Dhaka,0
27,Female,60,0,0,0,Pallabi,Developed,Other,Dhaka,0
47,Male,17,0,0,1,Ramna,Developed,Building,Dhaka,0
67,Female,38,1,1,0,Kadamtali,Developed,Tinshed,Dhaka,1
87,Female,54,1,1,1,Demra,Developed,Building,Dhaka,1


Sample mean: 36.0


## Part D — Stratified Sampling

In [9]:
# Part D: stratified sampling with proportional allocation.
strata_col = "Area"  # column whose values define the strata
sample_size = 50

# Every stratum is sampled at the same overall rate (target size / population).
frac = sample_size / len(df)

# Sample within each stratum separately; group_keys=False keeps the original
# row index instead of adding the stratum label as an extra index level.
# NOTE(review): the fraction is applied per group, so the total number of rows
# drawn may not equal sample_size exactly - confirm with len(stratified_sample).
strata = df.groupby(strata_col, group_keys=False)
stratified_sample = strata.sample(frac=frac, random_state=42)
strat_mean = stratified_sample['Age'].mean()

display(stratified_sample.head())
print("Sample mean:", strat_mean)

Unnamed: 0,Gender,Age,NS1,IgG,IgM,Area,AreaType,HouseType,District,Outcome
21,Female,23,0,0,0,Adabor,Developed,Other,Dhaka,0
631,Male,54,1,1,1,Badda,Developed,Building,Dhaka,1
365,Female,50,1,1,1,Badda,Developed,Tinshed,Dhaka,1
45,Female,13,0,0,0,Banasree,Developed,Tinshed,Dhaka,0
42,Male,51,1,1,0,Bangshal,Undeveloped,Tinshed,Dhaka,1


Sample mean: 35.82692307692308


## Part E — Cluster Sampling

In [10]:
# Part E: cluster sampling - split the rows into contiguous clusters, pick a
# few clusters at random, and keep every row of the chosen clusters.
n_clusters = 10   # number of contiguous clusters to form
n_selected = 2    # number of clusters to draw

# Assign cluster ids by row position rather than index label, so the split
# does not depend on the index being 0..N-1. Scaling the position by
# n_clusters / len(df) yields exactly n_clusters ids (0..n_clusters-1) even
# when len(df) is not a multiple of n_clusters; for a 1000-row frame this is
# identical to position // 100.
df['cluster_id'] = np.arange(len(df)) * n_clusters // len(df)

# Locally seeded generator so the selected clusters are reproducible on
# every Restart & Run All.
cluster_rng = np.random.default_rng(42)
selected_clusters = cluster_rng.choice(
    df['cluster_id'].unique(), size=n_selected, replace=False
)

cluster_sample = df[df['cluster_id'].isin(selected_clusters)]
print("Selected clusters:", selected_clusters)
display(cluster_sample.head())
cluster_mean = cluster_sample['Age'].mean()
print("Sample mean:", cluster_mean)

Selected clusters: [1 2]


Unnamed: 0,Gender,Age,NS1,IgG,IgM,Area,AreaType,HouseType,District,Outcome,cluster_id
100,Female,34,0,0,0,Sutrapur,Undeveloped,Other,Dhaka,0,1
101,Female,58,0,0,0,Dhanmondi,Developed,Other,Dhaka,0,1
102,Female,51,0,0,1,Shyampur,Undeveloped,Building,Dhaka,0,1
103,Female,50,1,1,0,Bangshal,Developed,Tinshed,Dhaka,1,1
104,Female,9,0,0,0,Chawkbazar,Undeveloped,Tinshed,Dhaka,0,1


Sample mean: 36.81


## Part F — Comparison & Reflection

In [11]:
# Part F: one row per sampling method, comparing its sample mean of 'Age'
# with the population mean.
sample_means = {
    'Simple Random': srs_mean,
    'Systematic': sys_mean,
    'Stratified': strat_mean,
    'Cluster': cluster_mean,
}

comparison = pd.DataFrame({
    'Method': list(sample_means.keys()),
    'Sample Mean': list(sample_means.values()),
})
# The same population benchmark is repeated on every row.
comparison['Population Mean'] = population_mean
# Absolute gap between each sample mean and the population mean.
comparison['Difference'] = (
    comparison['Sample Mean'] - comparison['Population Mean']
).abs()
print(comparison)

          Method  Sample Mean  Population Mean  Difference
0  Simple Random    36.640000           35.924    0.716000
1     Systematic    36.000000           35.924    0.076000
2     Stratified    35.826923           35.924    0.097077
3        Cluster    36.810000           35.924    0.886000
