# PROJECT 3 CLEANED VERSION

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn import metrics
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor



%matplotlib inline

In [None]:
# Load the raw life-expectancy table from the notebook's working directory.
raw_csv_path = 'Life Expectancy Data.csv'
df = pd.read_csv(raw_csv_path)

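We are going to bin the years to see whether `Year` can be used as a categorical variable, so we drop 2015 (it has fewer values than the other years) and then bin the remaining 15-year period into 3-year groups. Note: no binning step appears in the cells shown below; `Year` is dropped from the DataFrame during cleaning.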
 

In [None]:
# Keep only rows recorded before 2015; the final year is excluded as bad data.
before_2015 = df['Year'].lt(2015)
df = df[before_2015]

In [None]:
# Countries removed from the analysis because of missing values.
# The list previously contained a name split across two literals
# ("Democratic People's" / "Republic of Korea"), a misspelt "Tuvala" and a
# duplicate entry; those were dead entries and are removed here. The set of
# rows that gets filtered out is unchanged.
country_drops = [
    "Cook Islands",
    "Czechia",
    "Côte d'Ivoire",
    "Democratic People's Republic of Korea",
    "Democratic Republic of the Congo",
    "Dominica",
    "Marshall Islands",
    "Monaco",
    "Montenegro",
    "Nauru",
    "Niue",
    "Palau",
    "Republic of Korea",
    "Republic of Moldova",
    "Saint Kitts and Nevis",
    "San Marino",
    "Somalia",
    "South Sudan",
    "Sudan",
    "The former Yugoslav republic of Macedonia",
    "Timor-Leste",
    "Tuvalu",
    "United Republic of Tanzania",
]

df = df[~df["Country"].isin(country_drops)]

### Cleaning the DataFrame

In [None]:
# Build two position-aligned lists used to map a country to its continent:
# `country[i]` belongs to `continents[i]`.
#
# The country literal previously had stray/missing commas ("Åland, Islands",
# "Brunei, Darussalam", "Ethiopia Falkland Islands", "Lebanon,Lesotho",
# "United Republic of Thailand"), an embedded newline before "Peru" and
# mis-encoded accented characters. Because the pairing is purely positional,
# those slips shifted every country between Albania and Latvia onto a
# neighbour's continent. Names that contain a comma in their official form are
# written without it (e.g. "Palestine State of") so the list can be split on commas.
continents = '''Asia Europe Europe Africa Oceania Europe Africa Americas Americas Americas Asia Americas Oceania Europe Asia Americas Asia Asia Americas Europe Europe Americas Africa Americas Asia Americas Americas Europe Africa Americas Asia Europe Africa Africa Asia Africa Americas Africa Americas Africa Africa Americas Asia Americas Africa Africa Africa Oceania Americas Africa Europe Americas Americas Asia Europe Europe Africa Americas Americas Americas Africa Americas Africa Africa Europe Africa Americas Europe Oceania Europe Europe Americas Oceania Africa Africa Asia Europe Africa Europe Europe Americas Americas Americas Oceania Americas Europe Africa Africa Americas Americas Europe Americas Asia Europe Europe Asia Asia Asia Asia Europe Europe Asia Europe Americas Asia Europe Asia Asia Africa Oceania Asia Asia Asia Asia Asia Europe Asia Africa Africa Africa Europe Europe Europe Asia Europe Africa Africa Asia Asia Africa Europe Oceania Americas Africa Africa Africa Americas Oceania Europe Europe Asia Europe Americas Africa Africa Asia Africa Oceania Asia Europe Oceania Oceania Americas Africa Africa Oceania Oceania Oceania Europe Asia Asia Oceania Asia Americas Oceania Americas Americas Asia Oceania Europe Europe Americas Asia Africa Europe Europe Africa Americas Africa Americas Americas Americas Americas Americas Oceania Europe Africa Asia Africa Europe Africa Africa Asia Americas Europe Europe Oceania Africa Africa Africa Europe Asia Africa Americas Europe Africa Europe Europe Asia Asia Asia Africa Asia Asia Africa Oceania Oceania Americas Africa Asia Asia Americas Oceania Africa Europe Asia Europe Americas Americas Asia Oceania Americas Asia Americas Americas Oceania Africa Asia Africa Africa'''
continents = continents.split()

country = '''Afghanistan, Åland Islands, Albania, Algeria, American Samoa, Andorra, Angola, Anguilla,
Antigua and Barbuda, Argentina, Armenia, Aruba, Australia, Austria, Azerbaijan,
Bahamas, Bahrain, Bangladesh, Barbados, Belarus, Belgium, Belize, Benin, Bermuda, Bhutan,
Bolivia (Plurinational State of), Bonaire Sint Eustatius and Saba, Bosnia and Herzegovina,
Botswana, Brazil, Brunei Darussalam, Bulgaria, Burkina Faso, Burundi,
Cambodia, Cameroon, Canada, Cabo Verde, Cayman Islands, Central African Republic, Chad, Chile,
China, Colombia, Comoros, Congo, Congo (Democratic Republic of the), Cook Islands, Costa Rica,
Côte d'Ivoire, Croatia, Cuba, Curaçao, Cyprus, Czech Republic,
Denmark, Djibouti, Dominica, Dominican Republic,
Ecuador, Egypt, El Salvador, Equatorial Guinea, Eritrea, Estonia, Ethiopia,
Falkland Islands (Malvinas), Faroe Islands, Fiji, Finland, France, French Guiana, French Polynesia,
Gabon, Gambia, Georgia, Germany, Ghana, Gibraltar, Greece, Greenland, Grenada, Guadeloupe, Guam,
Guatemala, Guernsey, Guinea, Guinea-Bissau, Guyana,
Haiti, Holy See, Honduras, Hong Kong, Hungary,
Iceland, India, Indonesia, Iran (Islamic Republic of), Iraq, Ireland, Isle of Man, Israel, Italy,
Jamaica, Japan, Jersey, Jordan,
Kazakhstan, Kenya, Kiribati, Korea (Democratic People's Republic of), Korea (Republic of),
Kuwait, Kyrgyzstan,
Lao People's Democratic Republic, Latvia, Lebanon, Lesotho, Liberia, Libya, Liechtenstein,
Lithuania, Luxembourg,
Macao, Macedonia (the former Yugoslav Republic of), Madagascar, Malawi, Malaysia, Maldives, Mali,
Malta, Marshall Islands, Martinique, Mauritania, Mauritius, Mayotte, Mexico,
Micronesia (Federated States of), Moldova (Republic of), Monaco, Mongolia, Montenegro, Montserrat,
Morocco, Mozambique, Myanmar,
Namibia, Nauru, Nepal, Netherlands, New Caledonia, New Zealand, Nicaragua, Niger, Nigeria, Niue,
Norfolk Island, Northern Mariana Islands, Norway,
Oman,
Pakistan, Palau, Palestine State of, Panama, Papua New Guinea, Paraguay, Peru, Philippines,
Pitcairn, Poland, Portugal, Puerto Rico,
Qatar,
Réunion, Romania, Russian Federation, Rwanda,
Saint Barthélemy, Saint Helena Ascension and Tristan da Cunha, Saint Kitts and Nevis, Saint Lucia,
Saint Martin (French part), Saint Pierre and Miquelon, Saint Vincent and the Grenadines, Samoa,
San Marino, Sao Tome and Principe, Saudi Arabia, Senegal, Serbia, Seychelles, Sierra Leone,
Singapore, Sint Maarten (Dutch part), Slovakia, Slovenia, Solomon Islands, Somalia, South Africa,
South Sudan, Spain, Sri Lanka, Sudan, Suriname, Svalbard and Jan Mayen, Swaziland, Sweden,
Switzerland, Syrian Arab Republic,
Taiwan Province of China, Tajikistan, Tanzania United Republic of, Thailand, Timor-Leste, Togo,
Tokelau, Tonga, Trinidad and Tobago, Tunisia, Turkey, Turkmenistan, Turks and Caicos Islands, Tuvalu,
Uganda, Ukraine, United Arab Emirates, United Kingdom of Great Britain and Northern Ireland,
United States of America, Uruguay, Uzbekistan,
Vanuatu, Venezuela (Bolivarian Republic of), Viet Nam, Virgin Islands (British), Virgin Islands (U.S.),
Wallis and Futuna, Western Sahara,
Yemen,
Zambia, Zimbabwe'''
# Split on bare commas and strip surrounding whitespace so that line wrapping
# or a missing space after a comma can never leak into a key again.
country = [name.strip() for name in country.split(',')]

# The pairing is positional, so a length mismatch means the lists are out of step.
assert len(country) == len(continents), (
    'country/continent lists are misaligned: {} names vs {} continents'.format(
        len(country), len(continents)))



In [None]:
# Pair each country name with the continent stored at the same list position,
# then label every row with its continent (NaN when the name has no entry).
d = {name: continents[position] for position, name in enumerate(country)}

df['cont'] = df['Country'].apply(lambda name: d.get(name, np.nan))

###### Impute missing values

In [None]:
# Only the UK and the USA are missing Schooling; since both are known as
# highly educated countries, their gaps are filled with the 75th-percentile value.
school_75th_percentile = 14.2
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on a temporary object and is not
# guaranteed to update `df` (it never does under pandas copy-on-write).
df['Schooling'] = df['Schooling'].fillna(school_75th_percentile)

In [None]:
# Impute Iraq's missing 'Total expenditure' with Iraq's own mean.
# The mean used to be computed and then never applied, which left the gap in
# the frame; it is now written back for Iraq's rows only.
iraq_rows = df['Country'] == 'Iraq'
iraq_mean = df.loc[iraq_rows, 'Total expenditure'].mean()
df.loc[iraq_rows, 'Total expenditure'] = (
    df.loc[iraq_rows, 'Total expenditure'].fillna(iraq_mean)
)

###### Drop unnecessary columns

In [None]:
# Remove predictors that were found to be multicollinear with others.
# Column labels are reproduced exactly, including their stray spaces.
multicollinear_cols = [
    'Hepatitis B',
    'Adult Mortality',
    ' thinness 5-9 years',
    ' BMI ',
    'infant deaths',
    'Diphtheria ',
    'Polio',
]
df = df.drop(columns=multicollinear_cols)

In [None]:
# Remove columns whose data is incorrect or that the model does not need.
unneeded_cols = [
    'Year',
    'GDP',
    'Population',
    'Status',
    'Country',
    'percentage expenditure',
    'Income composition of resources',
]
df = df.drop(columns=unneeded_cols)

###### Create Dummies and bin variables

In [None]:
# One-hot encode the continent label, dropping the first level as the baseline.
# dtype=int keeps the indicator columns numeric 0/1 on every pandas version;
# newer pandas otherwise emits booleans, which changes formula term names and
# turns `X.values` into an object array further down.
# NOTE(review): rows whose 'cont' is NaN get all-zero indicators and are
# therefore indistinguishable from the baseline continent.
cont_df = pd.get_dummies(df['cont'], prefix='continent', drop_first=True, dtype=int)
df = pd.concat([df.drop(columns=['cont']), cont_df], axis=1)

In [None]:
# Persist the cleaned frame (index included) so modelling can restart from disk.
cleaned_csv_path = 'life.csv'
df.to_csv(cleaned_csv_path)

# Modeling

In [None]:
# Optional restart point: reload the cleaned data written during cleaning and
# discard the saved index, which comes back as the 'Unnamed: 0' column.
df = pd.read_csv('life.csv').drop(columns='Unnamed: 0')

In [None]:
# Scatter matrix of the continuous columns (the first eight columns of the frame).
continuous_part = df.iloc[:, :8]
pd.plotting.scatter_matrix(continuous_part, figsize=(16, 16));

In [None]:
# Lower-triangle correlation heatmap with colour-coded values.

plt.style.use('default')
corr = df.corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# Uses the builtin `bool`: the `np.bool` alias was removed from NumPy and
# raises AttributeError on current releases.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(6, 6))

# Diverging colormap centred on zero, so the sign and size of a correlation
# read directly from the colour. The previous "colorblind" palette is
# qualitative (unordered colours) and is not meaningful for continuous values.
cmap = "coolwarm"

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax);


In [None]:
# Histogram of every column in the frame; the trailing ';' hides the axes repr.
df.hist(figsize=(10,10));

# Modeling in StatsModels

In [None]:
# Rename the columns to identifier-style names and build the regression formula.
# The rename is positional, so the frame must hold exactly these twelve columns
# in this order. Workflow: drop the column with the highest p-value and rerun
# until every p-value is below the alpha threshold.
response = 'Life_expectancy'
cols = [
    'Alcohol', 'Measles', 'under_five_deaths', 'total_expenditure',
    'HIV_AIDS', 'thinness_1_19_years', 'Schooling',
    'continent_Americas', 'continent_Asia', 'continent_Europe', 'continent_Oceania',
]
df.columns = [response] + cols
formula = response + '~' + '+'.join(cols)

In [None]:
# Log-transform the columns below to try to bring each closer to a normal shape.
# The columns are overwritten, so running this cell twice applies the log twice.
log_cols = ['HIV_AIDS', 'thinness_1_19_years']
for name in log_cols:
    df[name] = df[name].map(np.log)

In [None]:
# Specify the OLS regression from the formula string, then fit it on the frame.
ols_spec = ols(formula=formula, data=df)
model = ols_spec.fit()

In [None]:
# Show the summary of the fitted model; its p-values drive the column drops that follow.
model.summary()

In [None]:
# Columns removed because of their high p-values in the OLS summary.
# Note: `model` is not refitted after this drop, so the diagnostic plots that
# follow still describe the model that included these columns.
high_p_value_cols = ['total_expenditure', 'Measles', 'under_five_deaths']
df.drop(columns=high_p_value_cols, inplace=True)

In [None]:
# Regression diagnostic plots for the Schooling predictor, drawn on a wide figure.
canvas = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(model, 'Schooling', fig=canvas)


In [None]:
# Q-Q plot of the OLS residuals against a fitted normal distribution,
# with a 45-degree reference line.
residuals = model.resid
fig = sm.graphics.qqplot(residuals, line='45', fit=True, dist=stats.norm)


# Modeling in Sklearn

### Linear Regression

In [None]:
# Target is life expectancy; every remaining column is a predictor.
y = df['Life_expectancy']
X = df.drop(columns=['Life_expectancy'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)

In [None]:
# Fit an ordinary least-squares model on the training split.
linreg = LinearRegression().fit(x_train, y_train)
linreg

In [None]:
# Score the linear model on the held-out split.
y_pred = linreg.predict(x_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# r2_score already returns R-squared. Taking its square root (as before)
# reported a different quantity and produced NaN whenever R-squared was negative.
r_squared = metrics.r2_score(y_test, y_pred)
print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

In [None]:
# Inspect the fitted coefficients and intercept of the plain linear model.
linreg.coef_, linreg.intercept_

In [None]:
# Check the variance inflation factor (VIF) of every predictor.
# Labels come from X itself, so they cannot drift out of step with the
# column order the way a hand-typed list can.
cols = list(X.columns)
# VIF regresses each predictor on all the others. Without an intercept column
# in the design matrix the auxiliary R-squared is uncentred and the VIFs are
# inflated, so a constant is added here and skipped (index 0) when reporting.
# The float cast keeps the matrix numeric even if indicator columns are boolean.
vif_design = sm.add_constant(X.astype(float), has_constant='add')
vif = [variance_inflation_factor(vif_design.values, i)
       for i in range(1, vif_design.shape[1])]
list(zip(cols, vif))

### Ridge Linear Regression

In [None]:
# Fit an L2-regularised linear model (alpha=0.2) on the training split.
ridge = Ridge(alpha=0.2).fit(x_train, y_train)
ridge

In [None]:
# Score the ridge model on the held-out split.
y_pred = ridge.predict(x_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# r2_score already returns R-squared. Taking its square root (as before)
# reported a different quantity and produced NaN whenever R-squared was negative.
r_squared = metrics.r2_score(y_test, y_pred)
print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

In [None]:
# Inspect the fitted coefficients and intercept of the ridge model.
ridge.coef_,ridge.intercept_

### Lasso Linear Regression

In [None]:
# Fit an L1-regularised linear model (alpha=0.2) on the training split.
lasso = Lasso(alpha=0.2).fit(x_train, y_train)
lasso

In [None]:
# Score the lasso model on the held-out split.
y_pred = lasso.predict(x_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# r2_score already returns R-squared. Taking its square root (as before)
# reported a different quantity and produced NaN whenever R-squared was negative.
r_squared = metrics.r2_score(y_test, y_pred)
print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

In [None]:
# Inspect the fitted coefficients and intercept of the lasso model.
lasso.coef_,lasso.intercept_

### Polynomial Linear Regression

In [None]:
# Learn the degree-2 polynomial expansion from the training predictors,
# then expand the training set with it.
poly_feat = PolynomialFeatures(degree=2).fit(x_train)
x_train_poly = poly_feat.transform(x_train)

In [None]:
# Fit an ordinary least-squares model on the polynomial-expanded training set.
poly_reg = LinearRegression().fit(x_train_poly, y_train)
poly_reg

In [None]:
# Expand the test predictors with the transformer fitted on the training set,
# then score the polynomial model on them.
x_test_poly = poly_feat.transform(x_test)
y_pred = poly_reg.predict(x_test_poly)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# r2_score already returns R-squared. Taking its square root (as before)
# reported a different quantity and produced NaN whenever R-squared was negative.
r_squared = metrics.r2_score(y_test, y_pred)
print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

In [None]:
# Inspect the fitted coefficients and intercept of the polynomial model.
poly_reg.coef_, poly_reg.intercept_

### Polynomial Ridge Linear Regression

In [None]:
# Fit an L2-regularised model (alpha=0.2) on the polynomial-expanded training set.
ridge_poly = Ridge(alpha=0.2).fit(x_train_poly, y_train)
ridge_poly

In [None]:
# Score the polynomial ridge model on the expanded held-out split.
y_pred = ridge_poly.predict(x_test_poly)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# r2_score already returns R-squared. Taking its square root (as before)
# reported a different quantity and produced NaN whenever R-squared was negative.
r_squared = metrics.r2_score(y_test, y_pred)
print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

In [None]:
# Inspect the fitted coefficients and intercept of the polynomial ridge model.
ridge_poly.coef_,ridge_poly.intercept_

### Polynomial Lasso Linear Regression


In [None]:
# Fit an L1-regularised model (alpha=0.2) on the polynomial-expanded training set.
lasso_poly = Lasso(alpha=0.2).fit(x_train_poly, y_train)
lasso_poly

In [None]:
# Score the polynomial lasso model on the expanded held-out split.
y_pred = lasso_poly.predict(x_test_poly)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# r2_score already returns R-squared. Taking its square root (as before)
# reported a different quantity and produced NaN whenever R-squared was negative.
r_squared = metrics.r2_score(y_test, y_pred)
print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

In [None]:
# Inspect the fitted coefficients and intercept of the polynomial lasso model.
lasso_poly.coef_,lasso_poly.intercept_