# Statistical Methods Project

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy.stats import normaltest
from scipy.stats import shapiro
from scipy.stats.mstats import winsorize
from statsmodels.stats.diagnostic import het_breuschpagan, het_white
from statsmodels.stats.stattools import durbin_watson

# EDA (data description, data preprocessing)

In [None]:
# Load the life-expectancy dataset straight from the project repository.
df = pd.read_csv("https://raw.githubusercontent.com/ov3ipo/SM_Project/main/life_expectancy.csv")

# Render floats with four decimals in every displayed table.
pd.options.display.float_format = '{:.4f}'.format
# Strip surrounding whitespace from the column names so later cells can use the clean names.
df = df.rename(columns=str.strip)

# overview on data statistic
display(df.head(10))
# DataFrame.info() prints its report itself and returns None, so it is called bare:
# wrapping it in display() rendered an extra "None" below the report.
df.info()

# Quantitative columns = every column except the two qualitative ones.
numeric_cols = df.drop(columns=["Status", "Country"]).columns

## Data description

### Univariate

#### Qualitative

In [None]:
# Class balance of the qualitative "Status" column: pie chart (shares) beside a bar chart (counts).
# The value_counts() Series is used directly (index = category, values = frequency), so the
# cell no longer depends on the column names reset_index() produces -- the 'count' column only
# exists under pandas >= 2.0 naming -- and no throwaway global named `x` is left behind.
status_counts = df['Status'].value_counts()

plt.figure(figsize = (10,5))
plt.subplot(1, 2, 1)
plt.pie(x=status_counts.values, labels=status_counts.index, autopct="%0.1f%%")
plt.subplot(1, 2, 2)
sns.countplot(df, x="Status")
plt.tight_layout()
plt.show()

``` {markdown}
Remark:
-- Why didn't we consider the Country col in this case?
    + Categorical variable: the "Status" column represent a categorical variable, which is ideal for qualitative analysis.
    + Comparison and Analysis: using this column, one can compare other quantitative metrics (Life expectancy, GDP, etc) between developed and developing
countries, allowing for insightful analyses on how development status affects various health and economic indicators.
    + Simple and Interpretative: making any audience to understand easily.
- State what you see in the chart
- Pie chart: This chart shows the proportion of countries classified as "Developing" and "Developed". About 82.6% of the entries in the dataset are classified as
"Developing" and 17.4% as "Developed". The large section in blue represents developing countries, while the smaller section in orange represents developed
countries.
- Bar chart: This chart displays the count of countries classified as "Developing" versus "Developed". It visually confirms the numbers seen in the pie chart,
with a significantly higher count of developing countries compared to developed countries. The height of the bars indicates the count of countries in each 
category, reinforcing the disparity in number.
```

#### Quantitative

In [None]:
# One histogram (30 bins, KDE overlay) per numeric column, laid out on a 10x2 grid.
fig = plt.figure(figsize=(10, 20))
for position, column in enumerate(numeric_cols, start=1):
    axis = fig.add_subplot(10, 2, position)
    sns.histplot(data=df, x=column, bins=30, kde=True, ax=axis)
fig.tight_layout()
plt.show()

``` {markdown}
Remark:
- Adult Mortality: This histogram shows a skewed distribution with a peak around low mortality rates and a long tail extending to higher rates.
- Income Composition of Resources: Shows a broad distribution with a peak towards higher values.
- Schooling Years: Roughly normally distributed centered around 10-12 years of schooling.
These histograms are useful for understanding the central tendencies and variability within the data, as well as for identifying potential outliers and skewness
in the distributions.
```

### Bivariate

In [None]:
# Scatter every numeric column except the target against the target, coloured by "Status".
target = "Life expectancy"
predictors = [name for name in numeric_cols if name != target]

fig = plt.figure(figsize=(10, 20))
for position, column in enumerate(predictors, start=1):
    axis = fig.add_subplot(10, 2, position)
    sns.scatterplot(data=df, x=column, y=target, hue="Status", legend="auto", ax=axis)
fig.tight_layout()
plt.show()

``` {markdown}
Remark:
- State what you see in the chart
    + Life Expectancy vs. Adult Mortality: Displays a clear negative correlation. As adult mortality increases, life expectancy decreases. This relationship
appears to be linear and strong.
    + Life Expectancy vs. GDP: Shows a positive correlation; countries with higher GDP tend to have higher life expectancy.
    + Life Expectancy vs. Income Composition of Resources and Schooling: Both show strong positive correlations. Higher income composition and more years of
schooling are associated with higher life expectancy.
- Based on the scatter plots, can you spot any variables that seem to have a linear relationship with the target variable?
Based on these plots, the variables that show a linear relationship with life expectancy and are most prominent include Adult Mortality, GDP, Schooling, Income
Composition of Resources, and to a lesser extent, health expenditure and vaccination coverages. These factors appear to have a direct and significant influence on
life expectancy, making them critical indicators for health and development studies.
```

### Overall statistic
- Life Expectancy: Average is around 69 years but varies widely influenced by health, economic, and educational factors.
- Health Metrics: Both adult and infant mortality rates show significant impact on life expectancy, with data suggesting that lower mortality rates correlate with
higher life expectancy.
- Economic Factors: GDP per capita and percentage expenditure on health significantly influence life expectancy, emphasizing the importance of economic stability
and health investment.
- Social Factors: Higher income resources and more years of schooling are strongly associated with higher life expectancy, highlighting the socio-economic
foundations of health.

In [None]:
# Summary statistics, one row per numeric column.
summary_stats = df.describe()
display(summary_stats.transpose())

## Data preprocessing (NAs, outliers, duplicates, label encoding)

### Missing

In [None]:
# Missing values: count them, fill by linear interpolation, count again.
print("\nPreprocessing\n")
print(df.isna().sum())

# Interpolate the numeric columns only. Calling DataFrame.interpolate on a frame that still
# contains object-dtype columns is deprecated in pandas 2.x, and linear interpolation is
# only meaningful for numeric data, so the filled values are unchanged.
# limit_direction='forward' leaves NaNs that precede a column's first valid value untouched.
# NOTE(review): rows are interpolated in file order, so a gap may be filled using rows that
# belong to a neighbouring country -- confirm this is intended or interpolate per country.
interpolated_cols = df.select_dtypes(include="number").columns
df = df.copy()
df[interpolated_cols] = df[interpolated_cols].interpolate(method='linear', limit_direction='forward')

print("\nPostprocessing\n")
print(df.isna().sum())

### Duplicated

In [None]:
# Number of rows that are exact copies of an earlier row.
duplicate_mask = df.duplicated()
print(f"Total duplicated values: {duplicate_mask.sum()}")

### Outliers

#### Detect outliers

In [None]:
# One horizontal boxplot per numeric column on a 10x2 grid, to eyeball the outliers.
fig = plt.figure(figsize=(10, 15))
for position, column in enumerate(numeric_cols, start=1):
    axis = fig.add_subplot(10, 2, position)
    sns.boxplot(data=df, x=column, ax=axis)
fig.tight_layout()
plt.show()

# detect outliers
def detectOutliers(data):
    """Count, per column, the values lying outside the 1.5 * IQR fences.

    data -- DataFrame; every column is expected to be numeric because quantiles
            are computed on each of them.

    Returns a DataFrame indexed by the column names of ``data`` with a single
    column, "Total outliers", holding the number of values below
    Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
    """
    def _count_outside_fences(series):
        # Quartiles and fences for one column; NaNs never satisfy either comparison.
        first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
        whisker = 1.5 * (third_quartile - first_quartile)
        beyond = (series < first_quartile - whisker) | (series > third_quartile + whisker)
        return series[beyond].count()

    counts = [_count_outside_fences(data[name]) for name in data.columns]
    return pd.DataFrame(counts, index=data.columns, columns=["Total outliers"])

# Outlier counts for every quantitative column (the two qualitative columns are left out).
qualitative_columns = ["Status", "Country"]
numeric_data = df.drop(columns=qualitative_columns)
outliers = detectOutliers(numeric_data)
outliers

``` {markdown}
Why we should not use variable with high outliers -> because regression can heavily be affected by these outliers, hence we should only choose those with low outliers
Potential variable for regression of target
- Year
- Adult Mortality
- Alcohol
- BMI
- Total expenditure
- thinness 1-19 years
- thinness 5-9 years
- Income composition of resources
- Schooling
```

#### Dealing with outliers

In [None]:
# Candidate regression variables are the columns with fewer than 150 IQR outliers;
# among them, the columns that have at least one outlier are the ones to transform.
few_outliers = outliers["Total outliers"] < 150
potential_var = outliers[few_outliers].index
outliers_var = outliers[few_outliers & (outliers["Total outliers"] > 0)].index

# Square-root transform on a copy of the data; the target column is left untouched.
df_outliers = df.copy()
sqrt_columns = [name for name in outliers_var if name != "Life expectancy"]
for name in sqrt_columns:
    df_outliers[name] = np.sqrt(df_outliers[name])

# detect outliers again
display(detectOutliers(df_outliers[potential_var]))

# The outlier counts of these three columns did not drop (or even rose) under the square root,
# so they are taken from the untransformed data again and winsorized at 5% on each tail instead.
for name in ("Total expenditure", "Income composition of resources", "Schooling"):
    untransformed = df[name]
    df_outliers[name] = winsorize(untransformed, limits=[0.05, 0.05])

# detect outliers again
display(detectOutliers(df_outliers[potential_var]))

In [None]:
# Boxplots of the candidate variables after the outlier treatment.
fig = plt.figure(figsize=(10, 15))
for position, column in enumerate(potential_var, start=1):
    axis = fig.add_subplot(10, 2, position)
    sns.boxplot(data=df_outliers, x=column, ax=axis)
fig.tight_layout()
plt.show()

# Frame used by the regression section: treated data, candidate columns only.
df_regress = df_outliers.loc[:, potential_var]

``` {markdown}
Remark
- Explain reason why we should not use variables that have many outliers
    + Skewness: Outliers can skew the probability distribution of your data
    + Increased Variability: Outliers increase the variability in your data, which decreases statistical power
    + Model Fit: Outliers can cause your regression model to be skewed towards these extreme values
- Explain reason why we choose to use transformation and why is log (or anything else)
    + Reduce Impact of Outliers: Winsorizing reduces the impact of outliers on statistical measures such as the mean and standard deviation. It sets extreme
outliers equal to a specified percentile of the data. This allows us to get a more accurate view of the mean and standard deviation of the dataset.
    + Retain Observations: Unlike trimming, which removes outliers entirely, Winsorizing retains the observations that are at the extremes. It makes sense to
Winsorize data when we want to keep the observations that are at the extremes but we don’t want to take them too literally.
    + Avoid Data Modification: If there aren’t extreme outliers, then Winsorizing the data will only modify the smallest and largest values slightly. This is
generally not a good idea since it means we’re just modifying data values for the sake of modifications.
    + Investigate Outliers: Outliers can represent interesting edge cases in the data. Thus, before modifying outliers it’s a good idea to take a closer look at
them to see what could have caused them.
```

### Compare preprocess and postprocess

In [None]:
# Summary statistics of the candidate variables: original data first, treated data second.
for frame in (df[potential_var], df_regress):
    display(frame.describe().T)

``` {markdown}
Remark:
•	Pre-Transformation Statistics
Before the transformation, several variables had a significant number of outliers. For instance, ‘Adult Mortality’ had 82 outliers, ‘Alcohol’ had 1 outlier, and
‘Income composition of resources’ had 130 outliers. These outliers could potentially skew our data and impact the results of any subsequent data analysis or
modeling.
•	Post-Transformation Statistics
After applying the square root transformation and Winsorization techniques, we observed a significant reduction in the number of outliers across multiple
variables. For example, the number of outliers in ‘Adult Mortality’ reduced from 82 to 18, and ‘Alcohol’ no longer had any outliers. This indicates that our
transformation techniques were effective in reducing the impact of extreme values.
```

# Linear Regression Analysis

## Correlation Matrix

### Before dealing with outliers

In [None]:
# Correlation matrix of the untreated numeric data, computed once and reused below.
numeric_df = df.drop(columns=["Country", "Status"])
correlation_before = numeric_df.corr()

plt.figure(figsize=(15, 10))
sns.heatmap(correlation_before, annot=True)
plt.title('Heatmap Correlation')
plt.show()

# Predictors whose absolute correlation with the target exceeds 0.5.
corrs = correlation_before['Life expectancy'].drop('Life expectancy')
high_corr = corrs[corrs.abs() > 0.5]
print("Variables have correlation larger than 0.5: ")
high_corr

### After dealing with outliers

In [None]:
# Correlation matrix of the outlier-treated numeric data, computed once and reused below.
numeric_df_outliers = df_outliers.drop(columns=["Country", "Status"])
correlation_after = numeric_df_outliers.corr()

plt.figure(figsize=(15, 10))
sns.heatmap(correlation_after, annot=True)
plt.title('Heatmap Correlation')
plt.show()

# Predictors whose absolute correlation with the target exceeds 0.5.
corrs = correlation_after['Life expectancy'].drop('Life expectancy')
high_corr = corrs[corrs.abs() > 0.5]
print("Variables have correlation larger than 0.5: ")
high_corr

Remark
- We have 8 variables having correlation larger than 0.5.
- Based on the heatmap above, two pairs of variables, "Schooling"-"Income composition of resources" and "thinness  1-19 years"-"thinness 5-9 years", are highly correlated with each other. This is multicollinearity, so using both members of a pair together would not satisfy the independence condition of linear regression.
- According to the outliers detection in the previous step, we can see that "HIV/AIDS" have more outliers than the others.
- Therefore, This leaves us with the last 2 variables that is BMI and Adult Mortality, hence we will use them for linear regression. 

In [None]:
# Keep only the target and the two predictors chosen for the regression.
regression_columns = ["Life expectancy", "BMI", "Adult Mortality"]
df_regress = df_regress.loc[:, regression_columns]

``` {markdown}
Remark
- Based on the qualitative analysis remark, the outlier detection remark, and this correlation matrix, make the final conclusion on which variables we should use for the linear regression of the target
- thinness 1-19 and 5-9 are highly correlated with each other, and so are Schooling and Income composition of resources, so they do not satisfy the condition of linear regression that the predictors should be independent of each other
- HIV/AIDS is introduced in here but it has too much outliers hence we will skip it
- This leaves us with the last 2 variables that is BMI and Adult Mortality, hence we will use them for linear regression
```

## Least square regression

### $ \hat{Y} = Intercept + Slope*X $
$Slope = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^{n}(x_i - \bar{x})^2}                   $ <br>
$Intercept = \bar{y} - Slope*\bar{x}$               <br>
<br>
$R^2 = \frac{\sum_{i=1}^n (\hat{y}_i - \bar{y})^2}{\sum_{i=1}^n (y_i - \bar{y})^2}$ <br>
<br>
The coefficient of determination, $R^2$, measures the proportion of the variance in the dependent variable that is predictable from the independent variable(s). It ranges from 0 to 1, with a higher value indicating a better fit of the model to the data.

In [None]:
# Response and predictors as notebook-level names; the formulas below refer to them by name.
y = df_regress['Life expectancy']
X1 = df_regress['Adult Mortality']
X2 = df_regress['BMI']

# Simple OLS regression of life expectancy on adult mortality.
model1 = smf.ols('y ~ X1', df_regress).fit()
residual1 = model1.resid
print(model1.summary())

Remark
- The R-squared value is 0.370. This indicates that the independent variable(s), "Adult Mortality", in the model explain 37.0% of the variation in the dependent variable. 
- The Adjusted R-squared value is also 0.370. Adjusted R-squared penalizes R-squared for the number of predictors; with a single predictor and a large sample that penalty is negligible, which is why the two values coincide.

In [None]:
# Simple OLS regression of life expectancy on BMI.
# The frame handed to the formula carries its own `y` and `X2` columns. Before, neither name
# was a column of `data`, so the formula fell back to whatever notebook globals were bound to
# `y` and `X2` at that moment; a stale or rebound global would silently change the fit.
# Formula and summary labels are unchanged.
model2_data = df_regress.rename(columns={'Life expectancy': 'y', 'BMI': 'X2'})
model2 = smf.ols(formula='y ~ X2', data=model2_data).fit()
residual2 = model2.resid
print(model2.summary())

Remark
- The Adjusted R-squared value is 0.309, which indicates that the independent variables in the model explain approximately 30.9% of the variation in the dependent variable.

- The R-squared value is 0.310, only slightly higher than the Adjusted R-squared: with one predictor and a large sample, the adjustment for the number of predictors is very small.

In [None]:
# Multiple OLS regression of life expectancy on adult mortality and BMI.
# The frame handed to the formula carries its own `y`, `X1` and `X2` columns. Before, none of
# these names was a column of `data`, so the formula fell back to whatever notebook globals
# were bound to them at that moment; a stale or rebound global would silently change the fit.
# Formula and summary labels are unchanged.
model3_data = df_regress.rename(
    columns={'Life expectancy': 'y', 'Adult Mortality': 'X1', 'BMI': 'X2'}
)
model3 = smf.ols(formula='y ~ X1 + X2', data=model3_data).fit()
residual3 = model3.resid
print(model3.summary())

Remark

- The R-squared value is 0.502, indicating that the model explains 50.2% of the variation in the dependent variable.
- The Adjusted R-squared value is 0.502, the same as the R-squared value to three decimals. The penalty for using two predictors is negligible at this sample size, so the fit is not being inflated merely by adding a predictor.


## Check residuals for 4 assumptions

### Assumption 1: Linearity

In [None]:
# Scatter each regression predictor against the target to inspect linearity.
predictor_names = df_regress.drop(columns=['Life expectancy']).columns

fig = plt.figure(figsize=(8, 10))
for position, predictor in enumerate(predictor_names, start=1):
    axis = fig.add_subplot(2, 1, position)
    sns.scatterplot(data=df_regress, x=predictor, y="Life expectancy", ax=axis)
fig.tight_layout()
plt.show()

``` {markdown}
Remark
- What does the chart conclude about the linearity?
- What to do if this assumption is violated?
```

### Assumption 2: Independent

``` {markdown}
Remark
- What does the Durbin-Watson test conclude about the independence of the residuals?
- What to do if this assumption is violated?
```

### Assumption 3: Homoscedasticity

``` {markdown}
Remark
- What does the White test conclude about the homoscedasticity of the residuals?
- What to do if this assumption is violated?
```

### Assumption 4: Normality of Residuals

# ----------------------- raw

### Assumption 3: Homoscedasticity

In [None]:
def Check_Homo(residual,model_input):
    """Run White's test for heteroscedasticity and print the statistic, p-value and verdict.

    residual    -- residuals of the fitted model
    model_input -- fitted results object; its ``model.exog`` design matrix is passed to the test

    Heteroscedasticity is reported when the p-value is at most 0.05.
    """
    # Only the first two entries of the test result (statistic, p-value) are used.
    statistic, p_value = het_white(residual, model_input.model.exog)[:2]
    print(f'White test: Test statistic = {statistic}, p-value = {p_value}')

    rejected = p_value <= 0.05
    verdict = (
        "There is significant evidence of heteroscedasticity.\n => This implies that the assumption of homoscedasticity is violated."
        if rejected
        else "There is no significant evidence to suggest heteroscedasticity.\n => This implies that the assumption of homoscedasticity is likely satisfied."
    )
    print(verdict)
    

Test for "Adult Mortality"

In [None]:
# White test on the Adult Mortality model.
Check_Homo(residual=residual1, model_input=model1)

Test for "BMI"

In [None]:
# White test on the BMI model.
Check_Homo(residual=residual2, model_input=model2)

Test for "Adult Mortality" and "BMI"

In [None]:
# White test on the two-predictor model.
Check_Homo(residual=residual3, model_input=model3)

### Assumption 2: Independent


The Durbin-Watson test is used to detect the presence of autocorrelation in the residuals of a regression model. Autocorrelation refers to the correlation between the residuals of the model, which violates the assumption of independent errors.

The Durbin-Watson test statistic, denoted as "d", can range from 0 to 4. The interpretation of the Durbin-Watson statistic is as follows:

    If d = 2, it indicates no autocorrelation.
    If d < 1.5, it suggests positive autocorrelation.
    If d > 2.5, it suggests negative autocorrelation.
    If d is between 1.5 and 2.5, it generally indicates no autocorrelation.


In [None]:
def Check_independent(residual):
    """Print the Durbin-Watson statistic of ``residual`` and its interpretation.

    residual -- residuals of a fitted model

    Interpretation bands: 1.5 to 2.5 (inclusive) means no autocorrelation, below 1.5
    positive autocorrelation, above 2.5 negative autocorrelation.
    """
    dw_statistic = durbin_watson(residual)
    print(f'Durbin-Watson statistic: {dw_statistic}')
    # The bounds are inclusive and every branch is tested explicitly: previously a value of
    # exactly 1.5 or 2.5, or a NaN statistic, fell through to "negative autocorrelation".
    if 1.5 <= dw_statistic <= 2.5:
        print("A value indicates no autocorrelation.")
    elif dw_statistic < 1.5:
        print("A value indicates positive autocorrelation")
    elif dw_statistic > 2.5:
        print("A value indicates negative autocorrelation")
    else:
        # Only reached when the statistic is not a number.
        print("The statistic is not a number, so autocorrelation cannot be assessed.")

Test for "Adult Mortality"

In [None]:
# Durbin-Watson check on the Adult Mortality model.
Check_independent(residual=residual1)

Test for "BMI"

In [None]:
# Durbin-Watson check on the BMI model.
Check_independent(residual=residual2)

Test for "Adult Mortality" and "BMI"

In [None]:
# Durbin-Watson check on the two-predictor model.
Check_independent(residual=residual3)

### Assumption 4: Normality of Residuals

In [None]:
def check_Normality(residuals):
    """Show a Q-Q plot of ``residuals`` and print the Shapiro-Wilk normality test result.

    residuals -- residuals of a fitted model

    Normality is treated as plausible when the Shapiro-Wilk p-value exceeds 0.05.
    """
    # fit=True standardises the sample before plotting. The 45-degree reference line compares
    # against standard-normal quantiles, so on raw residuals (whose scale is not 1) the points
    # would sit off the line even for perfectly normal data.
    sm.qqplot(residuals, fit=True, line ='45')
    plt.title('Q-Q plot')
    plt.show()
    # Normality (Shapiro-Wilk test)
    shapiro_test = shapiro(residuals)
    print(f'Shapiro-Wilk test: Test statistic = {shapiro_test[0]}, p-value = {shapiro_test[1]}')
    if shapiro_test[1] > 0.05:
        print("There is no significant evidence to suggest that the residuals are not normally distributed.\n => This implies that the normality assumption is likely satisfied.")
    else :
        print("There is significant evidence that the residuals are not normally distributed.\n => This implies that the normality assumption is violated.")


Test for "Adult Mortality"

In [None]:
# Normality check on the residuals of model1 (Life expectancy ~ Adult Mortality).
check_Normality(residuals=residual1)

Test for "BMI"

In [None]:
# Normality check on the residuals of model2 (Life expectancy ~ BMI).
check_Normality(residuals=residual2)

Test for "Adult Mortality" and "BMI"

In [None]:
# Normality check on the residuals of model3 (both predictors).
check_Normality(residuals=residual3)

# ----------------- raw

In [None]:
# test = shapiro(df_regress[])
# print(f'Test statistic = {test[0]}, p-value = {test[1]}')
# if test[1] > 0.05: print(f'{col} looks normal distributed (fail to reject H0)\n')
# else: print(f'{col} does not normal distributed (reject H0)\n')

``` {markdown}
Remark
- What do the Q-Q plot and the Shapiro-Wilk test conclude about the normality of the residuals?
- What to do if this assumption is violated?
```

## Normality check

In [None]:
# Q-Q plot of the target and of each predictor against the normal distribution.
# fit=True standardises every variable first: the 45-degree reference line compares against
# standard-normal quantiles, so on the raw scales the points could never follow it, whatever
# the shape of the distribution.
# A loop does not echo the returned figures, so no duplicated graphs appear.
for column in ("Life expectancy", "BMI", "Adult Mortality"):
    sm.qqplot(df_regress[column], fit=True, line='45')
    plt.title(f'Q-Q plot: {column}')
plt.show()

In [None]:
# Shapiro-Wilk test on every regression variable as a numeric cross-check of the Q-Q plots.
for col in df_regress.columns:
    statistic, p_value = shapiro(df_regress[col])
    print(f'Test statistic = {statistic}, p-value = {p_value}')
    if p_value > 0.05:
        print(f'{col} looks normal distributed (fail to reject H0)\n')
    else:
        print(f'{col} does not normal distributed (reject H0)\n')

``` {markdown}
Remark
Both the QQ plot and the Shapiro-Wilk test indicate that the variables in our dataset are not normally distributed. Our dataset is a sample from the years 2000 to 2015
and continues to be updated annually, but collecting more data will not make the data themselves normal. What the Central Limit Theorem (CLT) does guarantee is that,
with a large enough sample size, the sampling distribution of the mean will approximate a normal distribution, regardless of the initial
distribution of the data. This principle is particularly important in inferential statistics: on a sample of this size it is what supports the validity of various
statistical tests and the calculation of confidence intervals for the mean.
```


In [None]:
import matplotlib.pyplot as plt

# 100-bin histogram of the target, followed by a printout of the series itself.
data = df_regress['Life expectancy']
plt.hist(data, bins=100, color='lightgreen', ec='black')
print(data)

In [None]:
# Rank-based transform of the target, followed by a normality check of the result.
# `x` and `n` stay notebook-level names because later cells read them.
x = df_regress['Life expectancy']
n = len(x)

# Ranks are rescaled into the open interval (-1, 1) so arctanh stays finite.
# The result gets its own name: this cell used to store it in `y`, clobbering the response
# variable that the regression formulas refer to.
# NOTE(review): arctanh(2u - 1) equals 0.5 * log(u / (1 - u)), the quantile function of a
# logistic distribution rather than a normal one -- consider stats.norm.ppf(u) if normal
# scores are what is intended.
le_transformed = (stats.rankdata(x) / (n + 1)) * 2 - 1
print(np.min(x), np.max(x))
le_transformed = np.asarray(np.arctanh(le_transformed))
print(le_transformed)

fig, ax = plt.subplots(1, 2, figsize=(7, 4))
ax[0].hist(x, bins=100)
ax[0].set_title('Original Data')
ax[1].hist(le_transformed, bins=100)
ax[1].set_title('Transformed Data')
sm.qqplot(le_transformed, line='45')

# D'Agostino-Pearson normality test on the transformed values.
stat, p = normaltest(le_transformed)
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

In [None]:

# Rank-based transform of BMI, shown beside the original distribution.
x1 = df_regress['BMI']
n1 = len(x1)

# The denominator uses this cell's own n1; it previously read `n`, a leftover from
# another cell, so the cell only worked after that one had been run.
y1 = (stats.rankdata(x1) / (n1 + 1)) * 2 - 1

y1 = np.arctanh(y1)
print(np.min(y1), np.max(y1))
print(x1)


fig, ax = plt.subplots(1, 2, figsize=(7, 4))
ax[0].hist(x1, bins=100)
ax[0].set_title('Original Data')
ax[1].hist(y1, bins=100)
ax[1].set_title('Transformed Data')
sm.qqplot(y1, line='45');


In [None]:
# Rank-based transform of Adult Mortality, shown beside the original distribution.
x2 = df_regress['Adult Mortality']
# Length of this cell's own series (it was taken from `x`, a different variable).
n2 = len(x2)

# Same rescaling as for the other two variables: rank / (n + 1) mapped into (-1, 1).
# The denominator was `n + 6`, built on a count left over from another cell.
y2 = (stats.rankdata(x2) / (n2 + 1)) * 2 - 1

y2 = np.arctanh(y2)
print(np.min(y2), np.max(y2))


print(x2)


fig, ax = plt.subplots(1, 2, figsize=(7, 4))
ax[0].hist(x2, bins=100)
ax[0].set_title('Original Data')
ax[1].hist(y2, bins=100)
ax[1].set_title('Transformed Data')
sm.qqplot(y2, line='45');

## Construct confidence interval

In [None]:
confidence_level = 0.95
def CI_mean(data, name):
    """Print a two-sided t confidence interval for the mean of ``data``.

    data -- pandas Series (its ``.values`` are used)
    name -- label used in the printed sentence

    The level is read from the module-level ``confidence_level``. The interval is
    mean +/- t * s / sqrt(n) with n - 1 degrees of freedom, where s is the sample
    standard deviation (ddof=1).
    """
    values = data.values
    n = len(values)
    sample_mean = np.mean(values)
    standard_error = np.std(values, ddof=1) / np.sqrt(n)
    bounds = stats.t.interval(confidence_level, df=n - 1, loc=sample_mean, scale=standard_error)
    # Plain floats, so the tuple prints as numbers rather than NumPy scalar reprs.
    data_interval = tuple(float(bound) for bound in bounds)
    # The stated level follows confidence_level instead of a hard-coded "95%".
    print(f"{confidence_level:.0%} Confident that the mean of {name} lie between {data_interval}")
# Mean confidence interval for each regression variable, in the same order as before.
for variable in ("BMI", "Adult Mortality", "Life expectancy"):
    CI_mean(df_regress[variable], variable)

In [None]:
confidence_level = 0.95
def CI_var_std(data, name):
    """Print chi-square confidence intervals for the variance and standard deviation of ``data``.

    data -- sample values (the notebook passes a pandas Series)
    name -- label used in the printed sentences

    The level is read from the module-level ``confidence_level``. The variance interval is
    (dof * s^2 / chi2_upper, dof * s^2 / chi2_lower) with dof = n - 1; the standard-deviation
    interval is its square root.
    """
    sample_var = np.var(data, ddof=1)
    # Degrees of freedom (this was stored in a variable misleadingly called `n`).
    dof = len(data) - 1
    tail = (1 - confidence_level) / 2
    chi2_upper = stats.chi2.ppf(1 - tail, dof)
    chi2_lower = stats.chi2.ppf(tail, dof)
    # Plain floats, so the tuple prints as numbers rather than NumPy scalar reprs.
    CI_var = (float(dof * sample_var / chi2_upper), float(dof * sample_var / chi2_lower))
    CI_std = np.sqrt(CI_var)
    # The stated level follows confidence_level instead of a hard-coded "95%".
    print(f"{confidence_level:.0%} Confident that Variance of {name} lie between {CI_var} :")
    print(f"{confidence_level:.0%} Confident that Standard Deviation of {name} lie between {CI_std} :")
# Variance / standard-deviation intervals for each regression variable, in the same order as before.
for variable in ("BMI", "Adult Mortality", "Life expectancy"):
    CI_var_std(df_regress[variable], variable)

In [None]:
# Confidence interval for the proportion of rows whose life expectancy is at least 60:
# this is the subset of rows that meet the threshold.
meets_threshold = df_regress['Life expectancy'] >= 60
LElarger60 = df_regress[meets_threshold]
confidence_level=0.95
def CI_proportion(data, sample, name):
    """Print a normal-approximation (Wald) confidence interval for a proportion.

    data   -- the full set of observations; its length is the denominator
    sample -- the subset of ``data`` that has the property; its length is the numerator
    name   -- label used in the printed sentence

    The level is read from the module-level ``confidence_level``. The interval is
    p +/- z * sqrt(p * (1 - p) / n) with p = len(sample) / len(data).
    """
    n = len(data)
    # Count the subset that was passed in. This previously read the global LElarger60,
    # so the `sample` argument was ignored and any other subset gave a wrong proportion.
    s = len(sample)
    p = s / n

    se = np.sqrt(p * (1 - p) / n)
    z = stats.norm.ppf(1 - (1 - confidence_level) / 2)
    lower_bound = p - z * se
    upper_bound = p + z * se
    # The stated level follows confidence_level instead of a hard-coded "95%".
    print(f" {confidence_level:.0%} Confident that proportion of {name} is between ({lower_bound}, {upper_bound})")

# Proportion interval for rows with life expectancy of at least 60.
CI_proportion(
    data=df_regress['Life expectancy'],
    sample=LElarger60,
    name='People whose Life expectancy is greater than 60',
)

## Perform hypothesis testing