# DATA IMPORTATION WITH PANDAS

In [2]:
import pandas as pd 
import numpy as np
health = pd.read_csv(r"health_lifestyle_dataset.csv")

Data Quality and Preparation

In [None]:
# DATATYPE CHECK
health.dtypes

In [3]:
# total number of information collected
len(health.columns)
# check for duplicated id.... to ensure accurate vresults
health["id"].duplicated()
np.unique(health["id"])
# missing values check 
health.isnull()
health["id"].isnull().nunique()
health["age"].isnull().nunique()
health["gender"].isnull().nunique()
health["bmi"].isnull().nunique()
health["daily_steps"].isnull().nunique()
health["sleep_hours"].isnull().nunique()
health["water_intake_l"].isnull().nunique()
health["calories_consumed"].isnull().nunique()
health["smoker"].isnull().nunique()
health["alcohol"].isnull().nunique()
health["resting_hr"].isnull().nunique()
health["systolic_bp"].isnull().nunique()
health["diastolic_bp"].isnull().nunique()
health["cholesterol"].isnull().nunique()
health["family_history"].isnull().nunique()
health["disease_risk"].isnull().nunique()
# no missing values

1

In [4]:
# columns formatting 
# Restructring the columns from lower case to title case
health = health.rename(columns={"id":"Id","age":"Age","gender":"Gender",
                                "bmi":"BMI","daily_steps":"Avg_daily_steps","sleep_hours":"Avg_sleep_hours",
                                "water_intake_l":"Avg_water_intake (L)","calories_consumed":"Total_calories_Consumed(per day)",
                                "smoker":"Smoking_status","alcohol":"Alcohol_status","resting_hr":"Resting_hours",
                                "systolic_bp":"Systolic_BP","diastolic_bp":"Diastolic_BP","cholesterol":"Cholestherol_level",
                                "family_history":"Family_History","disease_risk":"Disease_risk_status"})

In [5]:
# writing a function weight detector, for weight classification
#  function for activity level classification based on avg_daily_steps
#  function for age categorization 
#  function for water intake indicator

#  for weight classification
def weight_detector(x):
    if x <= 18.5:
        bmi = "Under-weight"
    elif x <= 24.9:
        bmi = "Normal-weight"
    elif x <=29.9:
        bmi = "Over-weight"
    else:
        bmi = "Obese"
    return bmi



# daily steps in relation to activity level
def Activity_level(ds):
    if ds <= 5000:
        active = "Low"
    elif ds <=7499:
        active = "Light"
    elif ds <= 9999:
        active = "Moderate"
    elif ds <= 11999:
        active = "Active"
    else:
        active = "Highly-Active"
    return active


#  Age classification
def Age_category(age):
    if age <= 18:
        Age_class = "Teenager"
    elif age <=39:
        Age_class = "Youth"
    elif age <=60:
        Age_class = "Adult"
    else:
        Age_class = "Elder"
    return Age_class


#  CHOLESTEROL LEVEL INDICATOR
def cholesterol_indicator(cho):
    if cho <= 200:
        level = "Desirable"
    elif cho <=239:
        level = "Boderline High"
    else:
        level = "High"
    return level

# Water intake indicator
def water_indicator(intake):
    if intake <= 1.5:
        indicator = "Low"
    elif intake <=2:
        indicator = "Moderate"
    elif  intake <= 2.99:
        indicator = "Good"
    elif intake <= 4:
        indicator = "High"
    else:
        indicator = "Too High"
    return indicator

In [6]:
health["Weight_category"]= health["BMI"].apply(weight_detector)
health["Activity_level"] = health["Avg_daily_steps"].apply(Activity_level)
health["Age_Class"] = health["Age"].apply(Age_category)
health["Cholestherol_indicator"]= health["Cholestherol_level"].apply(cholesterol_indicator)
health["Water_intake_indicator"] = health["Avg_water_intake (L)"].apply(water_indicator)

GENERAL INFORMATION


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats as st
from statsmodels.stats.descriptivestats import describe
describe(health["Age"])
# the average age in the entire population is 48.5 years
# 65 % of the population are older than 33 years 
# only 10% of the entire population are below 24 years
describe(health["Avg_daily_steps"])
# the average daily steps is 10,480 steps for an active human in the population
# only 10% of the entire population take above 18,076 steps per day
#  10% of the entire population take below 2,800 steps per day

In [None]:
describe(health["Avg_sleep_hours"])
# the average daily hours of sleep for indivitauls in the population is 6.5 hours.
# this means that 50% of the entire polulation sleep less than 6.5 hours daily i.e 50% of the population
# sleep less than 6.5 hours daily
#  10 % of the population sleep more than 9.3 hours daily
# 10% of the population sleep less than 3.7 hours daily
# most individual have an average sleep of 8.5 hours daily


In [None]:
health["Avg_sleep_hours"].plot(kind="kde")

#  INSIGHT
# from the density distribution, the peak is almost flat.
# this indicates that average sleep hours for most people lies between the range 3-10 hours daily
# i.e very few persons sleep less or more outside the range

# *INFERENTIAL STATISTICS*
---

## **RESEARCH QUESTIONS**:
---
1. Is the daily average hours of sleep for men below 40 years equal to 6.5 hours.
1. Is the daily average hours of sleep for women below 40 years equal to 6.5 hours.
1. Do men below 40 years sleep more than 6.5 hours daily average hours of sleep.
1. Do women below 40 years sleep more than 6.5 hours daily average hours of sleep.
1. Do men below 40 years sleep less than 6.5 hours daily average hours of sleep.
1. Do women below 40 years sleep less than 6.5 hours daily average hours of sleep.  
1. Is there any significant statistical difference in the hours of sleep observed between the male and female gender (below 40 years).



### *RESEARCH QUESTION 1: Is the daily average hours of sleep for men below 40 years equal to 6.5 hours. ?*

##### Null hypothesis = the average daily hours of sleep for men below 40 years is not equal to 6.5 hours.
##### Alt hypothesis = the average daily hours of sleep for men below 40 years is equal to 6.5 hours.



In [9]:
import scipy.stats as st
below_40_males = health.loc[(health["Age"]<40) & (health["Gender"]=="Male")]
below_40_males

result_under40 = st.ttest_1samp(below_40_males["Avg_sleep_hours"],6.5)
t_stats = result_under40.statistic 
p_value = result_under40.pvalue
df_40 = result_under40.df
print(t_stats,p_value,df_40)

-1.9460945255439304 0.05165920539568092 17683


### *RESEARCH QUESTION 2: Is the daily average hours of sleep for women below 40 years equal to 6.5 hours. ?*

##### Null hypothesis = the average daily hours of sleep for women below 40 years is not equal to 6.5 hours.
##### Alt hypothesis = the average daily hours of sleep for women below 40 years is equal to 6.5 hours.

In [None]:
below_40_women = health.loc[(health["Age"]<40) & (health["Gender"]=="Female")]
below_40_women

under_40_women = st.ttest_1samp(below_40_women["Avg_sleep_hours"],6.5)
t_stats_w = under_40_women.statistic 
pvalue_w = under_40_women.pvalue
df_40_w = under_40_women.df
print(t_stats_w,pvalue_w,df_40_w) 

### *RESEARCH QUESTION 3:  Do men below 40 years sleep more than 6.5 hours on average daily. ?*

##### Null hypothesis = men below 40 years do not sleep more than 6.5 hours on average daily..
##### Alt hypothesis = men below 40 years sleep more than 6.5 hours on average daily..

In [10]:
# do they sleep more
result_under40 = st.ttest_1samp(below_40_males["Avg_sleep_hours"],6.5,alternative="greater")
t_stats = result_under40.statistic 
p_value = result_under40.pvalue
df_40 = result_under40.df
print(t_stats,p_value,df_40)

-1.9460945255439304 0.9741703973021596 17683


### *RESEARCH QUESTION 4:  Do women below 40 years sleep more than 6.5 hours on average daily. ?*

##### Null hypothesis = women below 40 years do not sleep more than 6.5 hours on average daily..
##### Alt hypothesis = women below 40 years sleep more than 6.5 hours on average daily..

In [11]:
# do they sleep more
below_40_women = health.loc[(health["Age"]<40) & (health["Gender"]=="Female")]
below_40_women

under_40_women = st.ttest_1samp(below_40_women["Avg_sleep_hours"],6.5,alternative="greater")
t_stats_w = under_40_women.statistic 
pvalue_w = under_40_women.pvalue
df_40_w = under_40_women.df
print(t_stats_w,pvalue_w,df_40_w) 

0.1782545754507264 0.4292625411132883 17776


### *RESEARCH QUESTION 5:  Do men below 40 years sleep less than 6.5 hours on average daily. ?*

##### Null hypothesis = men below 40 years do not sleep less than 6.5 hours on average daily..
##### Alt hypothesis = men below 40 years sleep less than 6.5 hours on average daily..

In [None]:
# do they sleep less
result_under40 = st.ttest_1samp(below_40_males["Avg_sleep_hours"],6.5,
                                alternative="less")
t_stats = result_under40.statistic 
p_value = result_under40.pvalue
df_40 = result_under40.df
print(t_stats,p_value,df_40)

### *RESEARCH QUESTION 6:  Do women below 40 years sleep less than 6.5 hours on average daily. ?*

##### Null hypothesis = women below 40 years do not sleep less than 6.5 hours on average daily..
##### Alt hypothesis = women below 40 years sleep less than 6.5 hours on average daily..


In [None]:
# do they sleep less
below_40_women = health.loc[(health["Age"]<40) & (health["Gender"]=="Female")]
below_40_women

under_40_women = st.ttest_1samp(below_40_women["Avg_sleep_hours"],6.5,alternative="less")
t_stats_w = under_40_women.statistic 
pvalue_w = under_40_women.pvalue
df_40_w = under_40_women.df
print(t_stats_w,pvalue_w,df_40_w) 

## *RESEARCH QUESTION 7:  Is there any significant statistical difference in the hours of sleep observed between the male and female gender (below 40 years)?*

##### Null hypothesis = there is no significant difference in the hours of sleep between the male and female gender (below 40 years).
##### Alt hypothesis = there is a significant difference in the hours of sleep between the male and female gender (below 40 years).

In [None]:
# any statistical difference between the 2
st.levene(below_40_males["Avg_sleep_hours"],
          below_40_women["Avg_sleep_hours"])
diff = st.ttest_ind(a=below_40_males["Avg_sleep_hours"],
                    b=below_40_women["Avg_sleep_hours"],equal_var=True)
diff