In [None]:
# Mount Google Drive inside the Colab runtime; the dataset is read from this
# mount point further down in the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from scipy.stats import ttest_ind, chisquare
from statsmodels.stats.proportion import proportions_ztest

## US Health Insurance Dataset [(source)](https://www.kaggle.com/datasets/teertha/ushealthinsurancedataset)

This dataset contains 1338 rows of insured data, where the Insurance charges are given against the following attributes of the insured: Age, Sex, BMI, Number of Children, Smoker and Region. There are no missing or undefined values in the dataset.

For the following exercises, assume that the data was taken from a random sample of users.

In [None]:
# Read the insurance dataset from the mounted Drive and preview the first rows.
INSURANCE_CSV_PATH = '/content/drive/MyDrive/insurance.csv'
df = pd.read_csv(INSURANCE_CSV_PATH)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### The solutions for the following problems have already been laid out for you. Fill in the missing values to produce the correct answer, or if you prefer, write your own code from scratch.

## Using the US Health Insurance Dataset, I'm testing if the insurance charges are different for males vs. females.


In [None]:
"""
Null Hypothesis: The insurance charges are the same for males and females.
Alternative Hypothesis: The insurance charges are different for males and females.
"""

male_charges = df.loc[df.sex == 'male', 'charges']
female_charges = df.loc[df.sex== 'female', 'charges']

alpha = 0.05;

t_stat, p_val = ttest_ind(male_charges, female_charges, alternative='two-sided')    # test statistics and p_value
print ('Test statistic: ', t_stat)
print ('p-value: ', p_val)

if p_val < alpha:    # greater than, less than, etc.?
    print (f'Reject the Null Hypothesis (alpha = {alpha}).')
else:
    print (f'Fail to Reject the Null Hypothesis (alpha = {alpha}).')

Test statistic:  2.097546590051688
p-value:  0.03613272100592976
Reject the Null Hypothesis (alpha = 0.05).


Conclusion: The insurance charges are different for males and females.

In [None]:
# Sum the charges within each sex group (group totals, not per-person means).
total_charges_by_sex = df['charges'].groupby(df['sex']).sum()

print(total_charges_by_sex)

sex
female    8.321061e+06
male      9.434764e+06
Name: charges, dtype: float64


## Testing if the insurance charges are higher for those with children.



In [None]:
"""
Null Hypothesis: The insurance charges are higher for children
Alternative Hypothesis: The insurance charges are not higher for children
"""

no_child = df.loc[df.children == 0, 'charges']
with_child = df.loc[df.children > 0, 'charges']

alpha = 0.05;

t_stat, p_val = ttest_ind(no_child, with_child, alternative='greater')    # get t-stat and p-value
print ('Test statistic: ', t_stat)
print ('p-value: ', p_val)

if p_val < alpha:
    print (f'Reject the Null Hypothesis (alpha = {alpha}).')
else:
    print (f'Fail to Reject the Null Hypothesis (alpha = {alpha}).')

Test statistic:  -2.3720611301669337
p-value:  0.9910849126582172
Fail to Reject the Null Hypothesis (alpha = 0.05).


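Conclusion: The insurance charges are higher for those with children. The p-value of 0.991 printed above comes from testing the opposite direction (`no_child` greater than `with_child`); in the intended direction (`with_child` greater than `no_child`) the one-sided p-value is 1 − 0.991 ≈ 0.009 < 0.05, so the null hypothesis that charges are not higher for those with children is rejected.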

In [None]:
# Sum the charges for each number of children (group totals, not per-person means).
total_charges_by_children = df['charges'].groupby(df['children']).sum()

print(total_charges_by_children)

children
0    7.098070e+06
1    4.124900e+06
2    3.617655e+06
3    2.410785e+06
4    3.462664e+05
5    1.581486e+05
Name: charges, dtype: float64


## Testing if the proportion of smokers is more than 20%


In [None]:
from statsmodels.stats.proportion import proportions_ztest

In [None]:
"""
Null Hypothesis: The proportion of smokers is more than 20%
Alternative Hypothesis: The proportion of smokers is less-than or equal to 20%
"""

smokers = df[df.smoker == 'yes']
non_smokers = df[df.smoker == 'no']

alpha = 0.05;

# compute for the p-value
z_stat, p_val = proportions_ztest(count=len(smokers),
                                  nobs=len(smokers)+len(non_smokers),
                                  value=0.2,
                                  alternative='larger')
print('p-value:', p_val)

if p_val < alpha:
    print (f'Reject the Null Hypothesis (alpha = {alpha}).')
else:
    print (f'Fail to Reject the Null Hypothesis (alpha = {alpha}).')

p-value: 0.33229937909683
Fail to Reject the Null Hypothesis (alpha = 0.05).


Conclusion: There is not enough evidence to say that the proportion of smokers is more than 20%.

In [None]:
# Count smokers and non-smokers by summing boolean masks.
num_smokers = int((df['smoker'] == 'yes').sum())
num_non_smokers = int((df['smoker'] == 'no').sum())

# Express each count as a percentage of the two groups combined.
total_count = num_smokers + num_non_smokers
percent_smokers = (num_smokers / total_count) * 100
percent_non_smokers = (num_non_smokers / total_count) * 100

# Report counts first, then percentages.
for label, value in (('Number of smokers:', num_smokers),
                     ('Number of non-smokers:', num_non_smokers)):
    print(label, value)
for label, value in (('Percentage of smokers:', percent_smokers),
                     ('Percentage of non-smokers:', percent_non_smokers)):
    print(label, value, '%')

Number of smokers: 274
Number of non-smokers: 1064
Percentage of smokers: 20.47832585949178 %
Percentage of non-smokers: 79.52167414050822 %


## Testing if the proportions of smokers and non-smokers from each region match the table

|           | Non-Smoker | Smoker |
|-----------|------------|-----------|
| northeast     | 20%        | 5%        |
| northwest | 20%        | 5%       |
| southeast | 20%        | 5%       |
| southwest | 20%        | 5%       |


In [None]:
from scipy.stats import chisquare

In [None]:
"""
Null Hypothesis: Proportion of smokers from each region follows the table
Alternative Hypothesis: Proportion of smokers from each region do not follow the table
"""

obs = df.groupby(['region', 'smoker']).charges.count()    # smokers per region
n_obs = len(df)    # number of observations

f_obs = obs.values
f_exp = []    # expected frequencies
for exp in [0.2, 0.2, 0.2, 0.2, 0.05, 0.05, 0.05, 0.05]:
  f_exp.append(exp*n_obs)

alpha = 0.05;

chi_sq, p_val = chisquare(f_obs=f_obs, f_exp=f_exp)
print('p-value:', p_val)

if p_val < alpha:
    print (f'Reject the Null Hypothesis (alpha = {alpha}).')
else:
    print (f'Fail to Reject the Null Hypothesis (alpha = {alpha}).')

p-value: 0.0
Reject the Null Hypothesis (alpha = 0.05).


Conclusion: Based on the p-value above, the proportions of smokers from each region do not follow the table.