In [1]:
import numpy as np
import pandas as pd
import matplotlib as mp
import seaborn as sns
import sklearn
from sklearn.metrics import mean_squared_error,accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2
from sklearn.ensemble import RandomForestRegressor



In [2]:
df = pd.read_csv(r"./Life Expectancy Data.csv")

In [3]:
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [4]:
df.shape

(2938, 22)

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,2938.0,2007.519,4.613841,2000.0,2004.0,2008.0,2012.0,2015.0
Life expectancy,2928.0,69.22493,9.523867,36.3,63.1,72.1,75.7,89.0
Adult Mortality,2928.0,164.7964,124.2921,1.0,74.0,144.0,228.0,723.0
infant deaths,2938.0,30.30395,117.9265,0.0,0.0,3.0,22.0,1800.0
Alcohol,2744.0,4.602861,4.052413,0.01,0.8775,3.755,7.7025,17.87
percentage expenditure,2938.0,738.2513,1987.915,0.0,4.685343,64.91291,441.5341,19479.91
Hepatitis B,2385.0,80.94046,25.07002,1.0,77.0,92.0,97.0,99.0
Measles,2938.0,2419.592,11467.27,0.0,0.0,17.0,360.25,212183.0
BMI,2904.0,38.32125,20.04403,1.0,19.3,43.5,56.2,87.3
under-five deaths,2938.0,42.03574,160.4455,0.0,0.0,4.0,28.0,2500.0


## Analysing data using pairplot and profile report:

In [6]:
# sns.pairplot(df)

In [7]:
# profile = ProfileReport(df)
# profile

## performing preprocessing

### conclusion from Profile report:
- columns Country, Total expenditure, Population, Year has very low impact on expectancy so they must be dropped
- data is missing in many rows, so that data must be approximated/dropped

In [8]:
df_mod = df.drop(['Country','Total expenditure','Population', 'Year'], axis='columns')
df_mod = pd.get_dummies(df_mod, columns = ["Status"])

In [9]:
df_mod.columns.values

array(['Life expectancy ', 'Adult Mortality', 'infant deaths', 'Alcohol',
       'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
       'under-five deaths ', 'Polio', 'Diphtheria ', ' HIV/AIDS', 'GDP',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling', 'Status_Developed',
       'Status_Developing'], dtype=object)

In [10]:
imputer = SimpleImputer(strategy='mean')
df_mod_imputed = imputer.fit_transform(df_mod)
df_mod = pd.DataFrame(df_mod_imputed, columns=df_mod.columns)

In [11]:
df_mod.head()

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Diphtheria,HIV/AIDS,GDP,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Status_Developed,Status_Developing
0,65.0,263.0,62.0,0.01,71.279624,65.0,1154.0,19.1,83.0,6.0,65.0,0.1,584.25921,17.2,17.3,0.479,10.1,0.0,1.0
1,59.9,271.0,64.0,0.01,73.523582,62.0,492.0,18.6,86.0,58.0,62.0,0.1,612.696514,17.5,17.5,0.476,10.0,0.0,1.0
2,59.9,268.0,66.0,0.01,73.219243,64.0,430.0,18.1,89.0,62.0,64.0,0.1,631.744976,17.7,17.7,0.47,9.9,0.0,1.0
3,59.5,272.0,69.0,0.01,78.184215,67.0,2787.0,17.6,93.0,67.0,67.0,0.1,669.959,17.9,18.0,0.463,9.8,0.0,1.0
4,59.2,275.0,71.0,0.01,7.097109,68.0,3013.0,17.2,97.0,68.0,68.0,0.1,63.537231,18.2,18.2,0.454,9.5,0.0,1.0


In [12]:
# profile = ProfileReport(df_mod)
# profile

In [13]:
y = df['Life expectancy ']
x = df_mod.drop('Life expectancy ', axis=1)
y = y.fillna(y.mean()) ## fill missing values with mean

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.2, test_size = 0.8, random_state=0)

### Trying out linear regression:

In [15]:
lr = LinearRegression()

In [16]:
lr.fit(x_train, y_train)

In [17]:
y_pred = lr.predict(x_test)

In [18]:
r2_score_linear_reg = r2(y_test, y_pred)

In [19]:
r2_score_linear_reg

0.8138315942450927

#### getting 81.3% accuracy with linear regression

### trying out Random Forest regression


In [20]:
regressor = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)


In [21]:
regressor.fit(x_train, y_train)


  warn(


In [22]:
y_pred_rf = regressor.predict(x_test)


In [23]:
r2_rand_forest = r2(y_test, y_pred_rf)
r2_rand_forest*100

92.74659722254496

In [24]:
mse = mean_squared_error(y_test, y_pred_rf)
mse

6.5526474937261465

### Getting 92.74% accuracy with random forest