# Sampling Assignment
Implementing Probability Sampling Methods in Python

## Instructions
Upload your dataset (minimum 200 rows), then complete all parts A–F.


In [1]:
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
0,0,19,female,27.9,0,yes,southwest,16884.924
1,1,18,male,33.77,1,no,southeast,1725.5523
2,2,28,male,33.0,3,no,southeast,4449.462
3,3,33,male,22.705,0,no,northwest,21984.47061
4,4,32,male,28.88,0,no,northwest,3866.8552


## Part A — Setup
- Report dataset size (rows, columns)

In [2]:
print("Dataset size:", df.shape)

Dataset size: (1338, 8)


## Part B — Simple Random Sampling

In [3]:
sample_size = 50
srs = df.sample(n=sample_size, random_state=42)
display(srs.head())
population_mean = df['age'].mean()
print("Population mean:", population_mean)
srs_mean = srs['age'].mean()
print("Sample mean:", srs_mean)

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
764,764,45,female,25.175,2,no,northeast,9095.06825
887,887,36,female,30.02,0,no,northwest,5272.1758
890,890,64,female,26.885,0,yes,northwest,29330.98315
1293,1293,46,male,25.745,3,no,northwest,9301.89355
259,259,19,male,31.92,0,yes,northwest,33750.2918


Population mean: 39.20702541106129
Sample mean: 38.92


## Part C — Systematic Sampling

In [4]:
n = 50
k = len(df) // n
start = np.random.randint(0, k)
sys_sample = df.iloc[start::k][:n]
display(sys_sample.head())
sys_mean = sys_sample['age'].mean()
print("Sample mean:", sys_mean)

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
5,5,31,female,25.74,0,no,southeast,3756.6216
31,31,18,female,26.315,0,no,northeast,2198.18985
57,57,18,male,31.68,2,yes,southeast,34303.1672
83,83,48,female,41.23,4,no,northwest,11033.6617
109,109,63,male,35.09,0,yes,southeast,47055.5321


Sample mean: 38.98


## Part D — Stratified Sampling

In [6]:
strata_col = "age"  # your column
sample_size = 50

# proportional fraction for each group
frac = sample_size / len(df)

# stratified sample
stratified_sample = df.groupby(strata_col, group_keys=False).sample(frac=frac, random_state=42)

display(stratified_sample.head())
strat_mean = stratified_sample['age'].mean()
print("Sample mean:", strat_mean)

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
442,442,18,male,43.01,0,no,southeast,1149.3959
1,1,18,male,33.77,1,no,southeast,1725.5523
940,940,18,male,23.21,0,no,southeast,1121.8739
960,960,19,female,39.615,1,no,northwest,2730.10785
1129,1129,19,female,18.6,0,no,southwest,1728.897


Sample mean: 39.23529411764706


## Part E — Cluster Sampling

In [7]:
df['cluster_id'] = df.index // (len(df)//10)  # 10 clusters
selected_clusters = np.random.choice(df['cluster_id'].unique(), size=2, replace=False)
cluster_sample = df[df['cluster_id'].isin(selected_clusters)]
print("Selected clusters:", selected_clusters)
display(cluster_sample.head())
cluster_mean = cluster_sample['age'].mean()
print("Sample mean:", cluster_mean)

Selected clusters: [2 6]


Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges,cluster_id
266,266,40,male,19.8,1,yes,southeast,17179.522,2
267,267,59,female,32.395,3,no,northeast,14590.63205,2
268,268,45,male,30.2,1,no,southwest,7441.053,2
269,269,49,male,25.84,1,no,northeast,9282.4806,2
270,270,18,male,29.37,1,no,southeast,1719.4363,2


Sample mean: 40.87218045112782


# Part F — Comparison & Reflection

In [8]:
comparison = pd.DataFrame({
    'Method': ['Simple Random', 'Systematic', 'Stratified', 'Cluster'],
    'Sample Mean': [srs_mean, sys_mean, strat_mean, cluster_mean],
    'Population Mean': [population_mean]*4,
    'Difference': [abs(srs_mean - population_mean),
                   abs(sys_mean - population_mean),
                   abs(strat_mean - population_mean),
                   abs(cluster_mean - population_mean)]
})
print(comparison)

          Method  Sample Mean  Population Mean  Difference
0  Simple Random    38.920000        39.207025    0.287025
1     Systematic    38.980000        39.207025    0.227025
2     Stratified    39.235294        39.207025    0.028269
3        Cluster    40.872180        39.207025    1.665155


In this section, the mean value is calculated of age column in 4 ways of sampling method. In part A I load my dataset and get the size of the dataset. In part B Simple Random Sampling I calculated the population mean of the column which is 39.2070 and also calculated the sample mean which is 38.92. In part C I did Systematic Sampling and the value of mean from this sampling method is 36.2 .At next Stratified Sampling the mean value is 39.2352 and in Cluster sampling I got 39.1616.

Analyzing all those results from four different sampling method one thing is clear that every method give slightly upper or lower than the population mean. In my dataset Stratified Sampling gives me the closest value compared to all other sampling methods the difference from sample mean is 0.028 which is very low. After that Cluster Sampling is also close to sample mean value the difference is 0.045. Then I got another slightly difference of 0.287 from Simple Random Sampling. At last i got a big difference compared to all other sampling is Systematic Sampling the difference is 3.127 which is upper or not perfectly accurate compared to Sample mean. So for my dataset Stratified and Cluster sampling is perfect and acurate compared to other sampling methods.