## Part 3

In [4]:
import pandas as pd
import matplotlib as plt
from scipy import stats
import numpy as np

In [5]:
df = pd.read_csv('rainfall_data/rainfall_india.csv')

df.head()

Unnamed: 0,SUBDIVISION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec
0,ANDAMAN & NICOBAR ISLANDS,1901,49.2,87.1,29.2,2.3,528.8,517.5,365.1,481.1,332.6,388.5,558.2,33.6,3373.2,136.3,560.3,1696.3,980.3
1,ANDAMAN & NICOBAR ISLANDS,1902,0.0,159.8,12.2,0.0,446.1,537.1,228.9,753.7,666.2,197.2,359.0,160.5,3520.7,159.8,458.3,2185.9,716.7
2,ANDAMAN & NICOBAR ISLANDS,1903,12.7,144.0,0.0,1.0,235.1,479.9,728.4,326.7,339.0,181.2,284.4,225.0,2957.4,156.7,236.1,1874.0,690.6
3,ANDAMAN & NICOBAR ISLANDS,1904,9.4,14.7,0.0,202.4,304.5,495.1,502.0,160.1,820.4,222.2,308.7,40.1,3079.6,24.1,506.9,1977.6,571.0
4,ANDAMAN & NICOBAR ISLANDS,1905,1.3,0.0,3.3,26.9,279.5,628.7,368.7,330.5,297.0,260.7,25.4,344.7,2566.7,1.3,309.7,1624.9,630.8


In [6]:
from scipy.stats import t


population1 = df[df['SUBDIVISION'] == 'LAKSHADWEEP']['ANNUAL'].dropna()
population2 = df[df['SUBDIVISION'] == 'ANDAMAN & NICOBAR ISLANDS']['ANNUAL'].dropna()


## Calculate population means 
mean1 = population1.mean()
mean2 = population2.mean()
actual_population_diff = mean1 - mean2
print(f"Mean rainfall in Lakshadweep: {mean1:.4f}")  
print(f"Mean rainfall in Andaman&Nicobar Islands: {mean2:.4f}")
print(f"Population mean difference in rainfall: {mean1 - mean2:.4f}")

## TAKE A SAMPLE from each population
sample_size = 100
sample1 = population1.sample(n=sample_size, random_state=1)
sample2 = population2.sample(n=sample_size, random_state=1)

## Calculate sample means
n1, n2 = len(sample1), len(sample2)

sample_mean1 = sample1.mean()
sample_mean2 = sample2.mean()
print(f"Sample mean rainfall in Lakshadweep: {sample_mean1:.4f}")
print(f"Sample mean rainfall in Andaman&Nicobar Islands: {sample_mean2:.4f}")

var1, var2 = np.var(sample1, ddof=1), np.var(sample2, ddof=1)

# Compute degrees of freedom (Welch-Satterthwaite approximation)
numerator = ((var1/n1 + var2/n2))**2
denominator = ((var1/n1)**2/ (n1 - 1)) + ((var2/n2)**2 / (n2 - 1))
degrees_of_freedom = numerator / denominator




std_error = np.sqrt(var1/n1 + var2/n2)
t_critical = t.ppf(0.975, degrees_of_freedom)  # Two-tailed test at 95% confidence level

mean_diff = sample_mean1 - sample_mean2
ci_lower = mean_diff - t_critical * std_error
ci_upper = mean_diff + t_critical * std_error

print(f"Mean Difference: {mean_diff:.4f}")
print(f"95% Confidence Interval: ({ci_lower:.4f}, {ci_upper:.4f})")


Mean rainfall in Lakshadweep: 1590.8864
Mean rainfall in Andaman&Nicobar Islands: 2927.4394
Population mean difference in rainfall: -1336.5530
Sample mean rainfall in Lakshadweep: 1590.2980
Sample mean rainfall in Andaman&Nicobar Islands: 2924.9880
Mean Difference: -1334.6900
95% Confidence Interval: (-1432.0621, -1237.3179)


In [7]:

alphas = [0.75,0.80,0.90,0.95,0.975]
n_samples = 100
sample_size = 50
sample_sizes =[30,50,100]

for sample_size in sample_sizes:
    print(f'Sample Size {sample_size}')
    for alpha in alphas:
        count = 0
        for i in range(n_samples):
            sample1 = population1.sample(n=sample_size)
            sample2 = population2.sample(n=sample_size)

            sample_mean1 = sample1.mean()
            sample_mean2 = sample2.mean()

            n1, n2 = len(sample1), len(sample2)


            var1, var2 = np.var(sample1, ddof=1), np.var(sample2, ddof=1)

            # Compute degrees of freedom (Welch-Satterthwaite approximation)
            numerator = ((var1/n1 + var2/n2))**2
            denominator = ((var1/n1)**2/ (n1 - 1)) + ((var2/n2)**2 / (n2 - 1))
            degrees_of_freedom = numerator / denominator

            std_error = np.sqrt(var1/n1 + var2/n2)
            t_critical = t.ppf(alpha, degrees_of_freedom)  # Two-tailed test at 95% confidence level

            mean_diff = sample_mean1 - sample_mean2
            ci_lower = mean_diff - t_critical * std_error
            ci_upper = mean_diff + t_critical * std_error


            if actual_population_diff > ci_lower and actual_population_diff < ci_upper:
                count += 1


     
        print(f'Percentage of Samples that contained the population mean difference for confidence level {alpha *100} : {count/n_samples*100:.2f}%')

Sample Size 30
Percentage of Samples that contained the population mean difference for confidence level 75.0 : 51.00%
Percentage of Samples that contained the population mean difference for confidence level 80.0 : 67.00%
Percentage of Samples that contained the population mean difference for confidence level 90.0 : 83.00%
Percentage of Samples that contained the population mean difference for confidence level 95.0 : 97.00%
Percentage of Samples that contained the population mean difference for confidence level 97.5 : 95.00%
Sample Size 50
Percentage of Samples that contained the population mean difference for confidence level 75.0 : 67.00%
Percentage of Samples that contained the population mean difference for confidence level 80.0 : 75.00%
Percentage of Samples that contained the population mean difference for confidence level 90.0 : 94.00%
Percentage of Samples that contained the population mean difference for confidence level 95.0 : 97.00%
Percentage of Samples that contained the po

In [8]:
from scipy.stats import t

region1 = 'LAKSHADWEEP'
region2= 'ANDAMAN & NICOBAR ISLANDS'

population1 = df[df['SUBDIVISION'] == region1]['ANNUAL'].dropna()
population2 = df[df['SUBDIVISION'] == region2]['ANNUAL'].dropna()


## Calculate population means 
mean1 = population1.mean()
mean2 = population2.mean()
actual_population_diff = mean1 - mean2
print(f"Mean rainfall in {region1}: {mean1:.4f}")  
print(f"Mean rainfall in {region2}: {mean2:.4f}")
print(f"Population mean difference in rainfall: {mean1 - mean2:.4f}")

## TAKE A SAMPLE from each population
sample_size = 50
sample1 = population1.sample(n=sample_size, random_state=1)
sample2 = population2.sample(n=sample_size, random_state=1)

## Calculate sample means
n1, n2 = len(sample1), len(sample2)

sample_mean1 = sample1.mean()
sample_mean2 = sample2.mean()
print(f"Sample mean rainfall in {region1}: {sample_mean1:.4f}")
print(f"Sample mean rainfall in {region2}: {sample_mean2:.4f}")

var1, var2 = np.var(sample1, ddof=1), np.var(sample2, ddof=1)

# Compute degrees of freedom (Welch-Satterthwaite approximation)
numerator = ((var1/n1 + var2/n2))**2
denominator = ((var1/n1)**2/ (n1 - 1)) + ((var2/n2)**2 / (n2 - 1))
degrees_of_freedom = numerator / denominator




std_error = np.sqrt(var1/n1 + var2/n2)
t_critical = t.ppf(0.975, degrees_of_freedom)  # Two-tailed test at 95% confidence level

mean_diff = sample_mean1 - sample_mean2
ci_lower = mean_diff - t_critical * std_error
ci_upper = mean_diff + t_critical * std_error

print(f"Mean Difference: {mean_diff:.4f}")
print(f"95% Confidence Interval: ({ci_lower:.4f}, {ci_upper:.4f})")

Mean rainfall in LAKSHADWEEP: 1590.8864
Mean rainfall in ANDAMAN & NICOBAR ISLANDS: 2927.4394
Population mean difference in rainfall: -1336.5530
Sample mean rainfall in LAKSHADWEEP: 1581.6360
Sample mean rainfall in ANDAMAN & NICOBAR ISLANDS: 2842.0400
Mean Difference: -1260.4040
95% Confidence Interval: (-1404.3736, -1116.4344)


In [9]:

alphas = [0.75,0.80,0.90,0.95,0.975,1]
n_samples = 1000
sample_sizes =[30,50,100]

for sample_size in sample_sizes:
    print(f'Sample Size {sample_size}')
    for alpha in alphas:
        count = 0
        for i in range(n_samples):
            sample1 = population1.sample(n=sample_size)
            sample2 = population2.sample(n=sample_size)

            sample_mean1 = sample1.mean()
            sample_mean2 = sample2.mean()

            n1, n2 = len(sample1), len(sample2)


            var1, var2 = np.var(sample1, ddof=1), np.var(sample2, ddof=1)

            # Compute degrees of freedom (Welch-Satterthwaite approximation)
            numerator = ((var1/n1 + var2/n2))**2
            denominator = ((var1/n1)**2/ (n1 - 1)) + ((var2/n2)**2 / (n2 - 1))
            degrees_of_freedom = numerator / denominator

            std_error = np.sqrt(var1/n1 + var2/n2)
            t_critical = t.ppf(alpha, degrees_of_freedom)  # Two-tailed test at 95% confidence level

            mean_diff = sample_mean1 - sample_mean2
            ci_lower = mean_diff - t_critical * std_error
            ci_upper = mean_diff + t_critical * std_error

            # print(ci_lower , " " , ci_upper)


            if actual_population_diff > ci_lower and actual_population_diff < ci_upper:
                count += 1


     
        print(f'Percentage of Samples that contained the population mean difference for confidence level {alpha *100} : {count/n_samples*100:.2f}%')

Sample Size 30
Percentage of Samples that contained the population mean difference for confidence level 75.0 : 54.70%
Percentage of Samples that contained the population mean difference for confidence level 80.0 : 67.60%
Percentage of Samples that contained the population mean difference for confidence level 90.0 : 87.90%
Percentage of Samples that contained the population mean difference for confidence level 95.0 : 95.10%
Percentage of Samples that contained the population mean difference for confidence level 97.5 : 97.80%
Percentage of Samples that contained the population mean difference for confidence level 100 : 100.00%
Sample Size 50
Percentage of Samples that contained the population mean difference for confidence level 75.0 : 66.00%
Percentage of Samples that contained the population mean difference for confidence level 80.0 : 75.30%
Percentage of Samples that contained the population mean difference for confidence level 90.0 : 92.20%
Percentage of Samples that contained the po