## DSC-609: Machine Learning
### Module 2 - Regularization
**Michael Albers**

**7/15/2022**

### Overview
Use Ridge and LASSO regression to predict Life Expectancy

### Data Set
**Life Expectancy (WHO)** 

This data set contains immunization, mortality, economic, social, and other health-related factors for countries around the world from 2000 to 2015.

https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [131]:
import warnings
# NOTE(review): with no category or message argument this filter hides every
# warning raised from this point on, including pandas warnings; consider
# narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

### Read and Inspect data file

In [2]:
# Load the raw life-expectancy CSV; the path is relative to the working directory.
life_exp_df = pd.read_csv('data/Life Expectancy Data.csv')

In [3]:
# Summarise dtypes and non-null counts per column to spot missing data early.
life_exp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [4]:
# Normalise column names: trim surrounding whitespace, lower-case everything,
# and swap each space for an underscore.
life_exp_df.columns = (
    life_exp_df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)
life_exp_df.columns

Index(['country', 'year', 'status', 'life_expectancy', 'adult_mortality',
       'infant_deaths', 'alcohol', 'percentage_expenditure', 'hepatitis_b',
       'measles', 'bmi', 'under-five_deaths', 'polio', 'total_expenditure',
       'diphtheria', 'hiv/aids', 'gdp', 'population', 'thinness__1-19_years',
       'thinness_5-9_years', 'income_composition_of_resources', 'schooling'],
      dtype='object')

In [6]:
# Count observations per development status.
life_exp_df['status'].value_counts()

Developing    2426
Developed      512
Name: status, dtype: int64

In [19]:
# Count observations per year; sort=False keeps the counts from being ordered by frequency.
life_exp_df['year'].value_counts(sort=False)

2015    183
2014    183
2013    193
2012    183
2011    183
2010    183
2009    183
2008    183
2007    183
2006    183
2005    183
2004    183
2003    183
2002    183
2001    183
2000    183
Name: year, dtype: int64

In [133]:
# Keep only the 2015 observations. The explicit copy gives le_2015 its own data,
# so the in-place column drops applied to it later do not act on a filtered
# selection of life_exp_df (the source of pandas' SettingWithCopyWarning).
le_2015 = life_exp_df[life_exp_df['year'] == 2015].copy()

In [134]:
# Preview 12 random rows; the fixed random_state makes the preview reproducible
# across kernel restarts.
le_2015.sample(12, random_state=42)

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,...,polio,total_expenditure,diphtheria,hiv/aids,gdp,population,thinness__1-19_years,thinness_5-9_years,income_composition_of_resources,schooling
480,Cameroon,2015,Developing,57.3,357.0,45,,0.0,84.0,1809,...,83.0,,84.0,3.5,1244.429421,22834522.0,5.6,5.5,0.514,10.4
320,Bosnia and Herzegovina,2015,Developing,77.4,88.0,0,,0.0,82.0,1677,...,74.0,,82.0,0.1,4574.9787,3535961.0,2.3,2.3,0.747,14.2
1522,Lithuania,2015,Developed,73.6,165.0,0,,0.0,94.0,50,...,93.0,,93.0,0.1,14252.42853,29491.0,2.6,2.6,0.846,16.5
1506,Libya,2015,Developing,72.7,138.0,1,,0.0,97.0,82,...,97.0,,97.0,0.1,,,5.8,5.5,0.719,13.4
1796,Namibia,2015,Developing,65.8,248.0,2,,0.0,92.0,212,...,92.0,,92.0,2.1,4737.66996,2425561.0,8.2,8.1,0.637,11.7
2810,Uruguay,2015,Developing,77.0,116.0,0,,0.0,95.0,0,...,95.0,,95.0,0.1,15524.84247,3431552.0,1.5,1.4,0.794,15.5
2521,Switzerland,2015,Developed,83.4,49.0,0,,0.0,,35,...,97.0,,97.0,0.1,8989.8424,8282396.0,0.4,0.3,0.938,16.0
1602,Maldives,2015,Developing,78.5,61.0,0,,0.0,99.0,0,...,99.0,,99.0,0.1,8395.785198,49163.0,13.6,13.6,0.701,12.7
1959,Panama,2015,Developing,77.8,118.0,1,,0.0,73.0,0,...,72.0,,73.0,0.1,13134.4367,3969249.0,1.9,1.8,0.785,13.0
288,Bhutan,2015,Developing,69.8,211.0,0,,0.0,99.0,11,...,98.0,,99.0,0.5,2613.645177,787386.0,15.4,16.0,0.604,12.5


> The predictions of life expectancy will be based on data from the year 2015.

#### Check for duplicate records

In [135]:
# Rows that repeat an earlier (country, year, status) combination.
# NOTE: despite its name, this variable holds the duplicated rows themselves;
# the count is obtained with len(duplicate_rec_count).
duplicate_rec_count = le_2015[le_2015.duplicated(subset=['country', 'year', 'status'])]

In [136]:
# Number of duplicated rows found.
len(duplicate_rec_count)

0

In [137]:
# Build a human-readable summary of the duplicate check.
msg = f'There are {len(duplicate_rec_count)} duplicate rows'

In [138]:
# Report the duplicate-check summary.
print(msg)

There are 0 duplicate rows


#### Check for missing data

In [164]:
def report_missing_data(col_name, df=None):
    '''
    Description: Check if the col_name column is missing any values and, if so,
                 print how many.

    args: col_name - name of dataframe column
          df - dataframe to inspect; when omitted, the notebook-level le_2015
               frame is used

    returns: column name if missing values. Otherwise return None
    '''

    # Fall back to the notebook-level frame so existing one-argument calls keep working.
    frame = le_2015 if df is None else df

    num_missing = frame[col_name].isnull().sum()

    if num_missing > 0:
        print(f'{col_name} has {num_missing} missing values')
        return col_name

    return None

In [141]:
# Take the column list from the frame itself rather than a hand-typed copy,
# so it cannot drift from the columns le_2015 actually has.
column_names = le_2015.columns.tolist()

In [142]:
# Accumulator for the names of columns that contain missing values.
missing_col_names = []

In [143]:
# Record every column that has at least one missing value.
# Clearing first keeps the cell idempotent: re-running it no longer appends
# the same column names a second time.
missing_col_names.clear()
for c in column_names:
    if report_missing_data(c) is not None:
        missing_col_names.append(c)

alcohol has 177 missing values
hepatitis_b has 9 missing values
bmi has 2 missing values
total_expenditure has 181 missing values
gdp has 29 missing values
population has 41 missing values
thinness__1-19_years has 2 missing values
thinness_5-9_years has 2 missing values
income_composition_of_resources has 10 missing values
schooling has 10 missing values


In [144]:
# Show the columns flagged as containing missing values.
missing_col_names

['alcohol',
 'hepatitis_b',
 'bmi',
 'total_expenditure',
 'gdp',
 'population',
 'thinness__1-19_years',
 'thinness_5-9_years',
 'income_composition_of_resources',
 'schooling']

In [145]:
# Drop the alcohol feature: 177 of the 183 rows for 2015 are missing.
# Rebinding with errors='ignore' and guarding the list removal keep this cell
# safe to re-run (the original raised KeyError / ValueError the second time).
le_2015 = le_2015.drop(columns=['alcohol'], errors='ignore')
if 'alcohol' in missing_col_names:
    missing_col_names.remove('alcohol')

In [154]:
# Drop total_expenditure: nearly every observation is null (181 of the 183 rows for 2015).
# Rebinding with errors='ignore' and guarding the list removal keep this cell
# safe to re-run (the original raised KeyError / ValueError the second time).
le_2015 = le_2015.drop(columns=['total_expenditure'], errors='ignore')
if 'total_expenditure' in missing_col_names:
    missing_col_names.remove('total_expenditure')

In [155]:
# Show the columns that remain in missing_col_names after the two drops.
missing_col_names

['hepatitis_b',
 'bmi',
 'gdp',
 'population',
 'thinness__1-19_years',
 'thinness_5-9_years',
 'income_composition_of_resources',
 'schooling']

In [157]:
# Missing-value counts for the columns listed in missing_col_names.
le_2015[missing_col_names].isnull().sum()

hepatitis_b                         9
bmi                                 2
gdp                                29
population                         41
thinness__1-19_years                2
thinness_5-9_years                  2
income_composition_of_resources    10
schooling                          10
dtype: int64

In [158]:
# Preview the first rows of the 2015 frame after the column drops.
le_2015.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,percentage_expenditure,hepatitis_b,measles,bmi,under-five_deaths,polio,diphtheria,hiv/aids,gdp,population,thinness__1-19_years,thinness_5-9_years,income_composition_of_resources,schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,71.279624,65.0,1154,19.1,83,6.0,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
16,Albania,2015,Developing,77.8,74.0,0,364.975229,99.0,0,58.0,0,99.0,99.0,0.1,3954.22783,28873.0,1.2,1.3,0.762,14.2
32,Algeria,2015,Developing,75.6,19.0,21,0.0,95.0,63,59.5,24,95.0,95.0,0.1,4132.76292,39871528.0,6.0,5.8,0.743,14.4
48,Angola,2015,Developing,52.4,335.0,66,0.0,64.0,118,23.3,98,7.0,64.0,1.9,3695.793748,2785935.0,8.3,8.2,0.531,11.4
64,Antigua and Barbuda,2015,Developing,76.4,13.0,0,0.0,99.0,0,47.7,0,86.0,99.0,0.2,13566.9541,,3.3,3.3,0.784,13.9


#### Impute missing values with the mean

In [165]:
def impute_missing_values(col_name, df=None):
    '''
    Description: Replace the missing values of a column with that column's mean

    args: col_name - name of dataframe column
          df - dataframe to read from; when omitted, the notebook-level le_2015
               frame is used

    returns: a new Series in which every missing value of the column is
             replaced by the mean of its non-missing values. The source
             dataframe is not modified; assign the result back to keep it.
             If every value is missing, the column is returned unchanged.
    '''
    # Fall back to the notebook-level frame so one-argument calls keep working.
    frame = le_2015 if df is None else df
    column = frame[col_name]

    # Series.mean() skips missing values by default, so the mean reflects
    # only the observed entries.
    return column.fillna(column.mean())

In [162]:
# Mean hepatitis_b coverage over the 2015 rows (missing values are skipped).
mean_hepatitis_b = le_2015['hepatitis_b'].mean()

In [163]:
# Display the computed hepatitis_b mean.
mean_hepatitis_b

82.42528735632185