In [None]:
# Install the required package from Pypi
# pip install pca

In [None]:
# only for kaggle
import os

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [None]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
life_exep_data = pd.read_csv('../input/life-expectancy-dataset/data/impv/final.csv')

In [None]:
life_exep_data.head()

In [None]:
life_exep_data.describe()

In [None]:
# life_exep_data['Gender'] = life_exep_data['Gender'].map({'Male': 0,'Female': 1})

In [None]:
life_exep_data.skew(axis = 0, numeric_only = True)

Lets visualise the distribution of Life Expectancy at birth.

In [None]:
def plot_histogram(data, ylabel, xlabel, title):
    """Draw a histogram of ``data`` on a new 10x6 inch figure.

    Parameters
    ----------
    data : array-like
        Values to bin (passed straight to matplotlib's ``hist``).
    ylabel, xlabel, title : str
        Text for the y-axis label, x-axis label and figure title.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    # White bar edges keep neighbouring bins visually separated.
    axes.hist(data, edgecolor="white")
    axes.set_ylabel(ylabel)
    axes.set_xlabel(xlabel)
    axes.set_title(title)

In [None]:
def get_outlier(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies on or beyond the Tukey fences
    ``Q1 - 1.5 * IQR`` / ``Q3 + 1.5 * IQR`` (the comparison is inclusive).
    Quartiles use midpoint interpolation and ignore missing values, so a
    single NaN in the column no longer hides every outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the numeric column to test.

    Returns
    -------
    pandas.DataFrame
        The outlier rows sorted by ``column``. The original index is kept
        as an ``index`` column and the frame is re-indexed from 0.
    """
    values = df[column]
    # Series.quantile skips NaN and avoids the `interpolation=` keyword that
    # np.percentile deprecated in NumPy 1.22.
    q1 = values.quantile(0.25, interpolation='midpoint')
    q3 = values.quantile(0.75, interpolation='midpoint')
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    is_outlier = (values >= upper_fence) | (values <= lower_fence)
    return df[is_outlier].sort_values(by=[column]).reset_index()

In [None]:
plot_histogram(life_exep_data['Life expectancy'], 'Count', 'Life Expectancy at Birth', "Histogram of Life Expectancy at Birth")

In [None]:
life_exep_data['Life expectancy'].skew()

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Life expectancy'])

From the above histogram we can conclude that most of the data lies between 70 and 80.

Next we are going to visualise the distribution of the Unemployment column.

In [None]:
df = get_outlier(life_exep_data, 'Life expectancy')

df[['Country','Year','Gender','Life expectancy']]

In [None]:
plot_histogram(life_exep_data['Unemployment'], 'Count', 'Unemployment', "Histogram of unemployment")

From the above histogram we can conclude that most of the data lies between 5 and 10.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Unemployment'])

In [None]:
df = get_outlier(life_exep_data, 'Unemployment')

df[['Country','Year','Gender','Unemployment']]

In [None]:
plot_histogram(life_exep_data['Infant Mortality'], 'Count', 'Infant Mortality', "Histogram of Infant Mortality")

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Infant Mortality'])

In [None]:
df = get_outlier(life_exep_data, 'Infant Mortality')

df[['Country','Year','Gender','Infant Mortality']]

In [None]:
plot_histogram(life_exep_data['GDP'], 'Count', 'GDP', "Histogram of GDP")

From the above histogram we can conclude that the data is right-skewed (positively skewed): most values sit at the low end, with a long tail towards very high values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['GDP'])

In [None]:
life_exep_data['GDP'].skew()

In [None]:
df = get_outlier(life_exep_data, 'GDP')

df[['Country','Year','Gender','GDP']]

In [None]:
# Countries that stay in the dataset even when they are flagged as GDP outliers.
keep_countries = [
    "Korea, Rep.", "Spain", "Mexico", "Italy", "Russian Federation", "Canada",
    "Australia", "China", "India", "France", "Brazil", "United Kingdom",
    "Germany", "St. Martin (French part)", "Korea, Dem. People's Rep.",
    "Japan", "United States",
]
outlier_countries = df['Country'].unique()
# np.setdiff1d keeps the (sorted, unique) outlier countries that are NOT in
# the keep list; these are the countries removed in the next cells.
remove_list = np.setdiff1d(outlier_countries, keep_countries)

In [None]:
life_exep_data[life_exep_data.Country.isin(remove_list)]

In [None]:
# Drop every row belonging to a country in remove_list. The explicit .copy()
# makes the result an independent frame, so the later column assignments
# (the log transforms of GDP and GNI) do not raise SettingWithCopyWarning.
life_exep_data = life_exep_data[~life_exep_data.Country.isin(remove_list)].copy()

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['GDP'])

In [None]:
df = get_outlier(life_exep_data, 'GDP')

df[['Country','Year','Gender','GDP']]

In [None]:
plot_histogram(life_exep_data['GDP'], 'Count', 'GDP', "Histogram of GDP")

In [None]:
# Replace GDP with its natural logarithm to compress the long tail.
# NOTE: this overwrites the column in place, so the cell is not idempotent;
# running it a second time applies the log twice.
life_exep_data['GDP'] = np.log(life_exep_data['GDP'])

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['GDP'])

In [None]:
life_exep_data[(life_exep_data['Year']==2019)]['GDP'].mean()

In [None]:
sub = life_exep_data[life_exep_data['Country'] == 'United States']
sub[['Country','Year','Gender','GDP', 'GNI']]

In [None]:
plot_histogram(life_exep_data['GNI'], 'Count', 'GNI', "Histogram of GNI")

From the above histogram we can conclude that the data is right-skewed (positively skewed): most values sit at the low end, with a long tail towards very high values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['GNI'])

In [None]:
df = get_outlier(life_exep_data, 'GNI')

df[['Country','Year','Gender','GNI']]

In [None]:
# Replace GNI with its natural logarithm, mirroring the GDP transform above.
# NOTE: this overwrites the column in place, so the cell is not idempotent;
# running it a second time applies the log twice.
life_exep_data['GNI'] = np.log(life_exep_data['GNI'])

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['GNI'])

In [None]:
plot_histogram(life_exep_data['Clean fuels and cooking technologies'], 'Count', 'Clean fuels and cooking technologies', "Histogram of Clean fuels and cooking technologies")

From the above histogram we can find that most of the data lies between 90 and 100.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Clean fuels and cooking technologies'])

In [None]:
df = get_outlier(life_exep_data, 'Clean fuels and cooking technologies')
df[['Country','Year','Gender','Clean fuels and cooking technologies']]

In [None]:
plot_histogram(life_exep_data['Per Capita'], 'Count', 'Per Capita', "Histogram of Per Capita")

The distribution of Per Capita income is right-skewed: most values are low, with a long tail towards high values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Per Capita'])

In [None]:
df = get_outlier(life_exep_data, 'Per Capita')
df[['Country','Year','Gender','Per Capita']]

In [None]:
plot_histogram(life_exep_data['Mortality caused by road traffic injury'], 'Count', 'Mortality caused by road traffic injury', "Histogram of Mortality caused by road traffic injury")

The above histogram shows that the data is approximately normally distributed.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Mortality caused by road traffic injury'])

In [None]:
df = get_outlier(life_exep_data, 'Mortality caused by road traffic injury')
df[['Country','Year','Gender','Mortality caused by road traffic injury']]

In [None]:
plot_histogram(life_exep_data['Tuberculosis Incidence'], 'Count', 'Tuberculosis Incidence', "Histogram of Tuberculosis Incidence")

The distribution of Tuberculosis Incidence is right-skewed: most values are low, with a long tail towards high values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Tuberculosis Incidence'])

In [None]:
df = get_outlier(life_exep_data, 'Tuberculosis Incidence')
df[['Country','Year','Gender','Tuberculosis Incidence']]

In [None]:
plot_histogram(life_exep_data['DPT Immunization'], 'Count', 'DPT Immunization', "Histogram of DPT Immunization")

The DPT Immunization data is left-skewed: most values are concentrated at the high end, with a tail towards low values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['DPT Immunization'])

In [None]:
df = get_outlier(life_exep_data, 'DPT Immunization')
df[['Country','Year','Gender','DPT Immunization']]

In [None]:
plot_histogram(life_exep_data['HepB3 Immunization'], 'Count', 'HepB3 Immunization', "Histogram of HepB3 Immunization")

The HepB3 Immunization data is left-skewed: most values are concentrated at the high end, with a tail towards low values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['HepB3 Immunization'])

In [None]:
df = get_outlier(life_exep_data, 'HepB3 Immunization')
df[['Country','Year','Gender','HepB3 Immunization']]

In [None]:
plot_histogram(life_exep_data['Measles Immunization'], 'Count', 'Measles Immunization', "Histogram of Measles Immunization")

The Measles Immunization data is left-skewed: most values are concentrated at the high end, with a tail towards low values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Measles Immunization'])

In [None]:
df = get_outlier(life_exep_data, 'Measles Immunization')
df[['Country','Year','Gender','Measles Immunization']]

In [None]:
plot_histogram(life_exep_data['Hospital beds'], 'Count', 'Hospital beds', "Histogram of Hospital beds")

The Hospital beds data is right-skewed: most values are concentrated at the low end, with a tail towards high values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Hospital beds'])

In [None]:
df = get_outlier(life_exep_data, 'Hospital beds')
df[['Country','Year','Gender','Hospital beds']]

In [None]:
plot_histogram(life_exep_data['Basic sanitation services'], 'Count', 'Basic sanitation services', "Histogram of Basic sanitation services")

From the above histogram we can see that most of the data lies between 90 and 100.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Basic sanitation services'])

In [None]:
df = get_outlier(life_exep_data, 'Basic sanitation services')
df[['Country','Year','Gender','Basic sanitation services']]

In [None]:
plot_histogram(life_exep_data['Tuberculosis treatment'], 'Count', 'Tuberculosis treatment', "Histogram of Tuberculosis treatment")

From the above histogram we can see that most of the Tuberculosis treatment data lies between 70 and 90.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Tuberculosis treatment'])

In [None]:
df = get_outlier(life_exep_data, 'Tuberculosis treatment')
df[['Country','Year','Gender','Tuberculosis treatment']]

In [None]:
plot_histogram(life_exep_data['Urban population'], 'Count', 'Urban population', "Histogram of Urban population")

The above histogram shows that the data has comb distribution.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Urban population'])

In [None]:
df = get_outlier(life_exep_data, 'Urban population')
df[['Country','Year','Gender','Urban population']]

In [None]:
plot_histogram(life_exep_data['Rural population'], 'Count', 'Rural population', "Histogram of Rural population")

Distribution of rural population is similar to urban population and has comb distribution.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Rural population'])

In [None]:
df = get_outlier(life_exep_data, 'Rural population')
df[['Country','Year','Gender','Rural population']]

In [None]:
plot_histogram(life_exep_data['Non-communicable Mortality'], 'Count', 'Non-communicable Mortality', "Histogram of Non-communicable Mortality")

Non-communicable Mortality is roughly normally distributed, with most values towards the lower end (a slight right skew).

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Non-communicable Mortality'])

In [None]:
df = get_outlier(life_exep_data, 'Non-communicable Mortality')
df[['Country','Year','Gender','Non-communicable Mortality']]

In [None]:
plot_histogram(life_exep_data['Sucide Rate'], 'Count', 'Sucide Rate', "Histogram of Sucide Rate")

The Suicide Rate data is right-skewed: most values are low, with a long tail towards high values.

In [None]:
sns.boxplot(data=life_exep_data, x=life_exep_data['Sucide Rate'])

It seems that there are some outliers. Let's check them.

In [None]:
df = get_outlier(life_exep_data, 'Sucide Rate')
df[['Country','Year','Gender','Sucide Rate']]

From the above analysis we found a serious outlier problem in GDP and GNI. It was handled earlier in the notebook by removing the outlier countries identified from GDP and by log-transforming both columns.

In [None]:
plt.figure(figsize=(10, 6))
# Style the boxes through `boxprops`. On matplotlib >= 3.5 the boxes are no
# longer listed in `ax.artists`, so plt.setp(ax.artists, ...) silently styles
# nothing.
ax = sns.boxplot(
    x='Gender',
    y='Life expectancy',
    data=life_exep_data,
    boxprops=dict(alpha=.5, linewidth=2, edgecolor="k"),
)
plt.xticks(rotation=45)

From the above boxplot, we can easily see that females have a higher life expectancy than males.

In [None]:
sns.set(rc={'figure.figsize':(10,10)})
sns.regplot(x="Life expectancy", y="Unemployment", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Infant Mortality", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="GDP", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="GNI", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Clean fuels and cooking technologies", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Per Capita", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Mortality caused by road traffic injury", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Tuberculosis Incidence", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="DPT Immunization", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="HepB3 Immunization", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Measles Immunization", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Hospital beds", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Basic sanitation services", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Tuberculosis treatment", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Urban population", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Rural population", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Non-communicable Mortality", data=life_exep_data, scatter_kws={'s':1})

In [None]:
sns.regplot(x="Life expectancy", y="Sucide Rate", data=life_exep_data, scatter_kws={'s':1})

In [None]:
plt.figure(figsize=(15, 15))
# Country and Gender hold text, and DataFrame.corr() raises on non-numeric
# columns since pandas 2.0, so correlate the numeric columns only.
numeric_columns = life_exep_data.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True, fmt='.2f')

From the above correlation plot we can see that there are multicollinearity issues between the variables. To address this we are going to apply PCA.

In [None]:
life_exep_data.head()

## Standardize the data

We drop the Gender column (together with Country and Year) from the features because, based on our research question, we are going to predict life expectancy from the development indicators.

In [None]:
# Target variable, selected by name so it cannot drift if the column order
# changes (double brackets keep it a one-column DataFrame).
y = life_exep_data[['Life expectancy']]

# Features: every remaining indicator; identifiers, Gender and the target
# itself are excluded.
X = life_exep_data.drop(['Country', 'Year', 'Life expectancy', 'Gender'], axis=1)

In [None]:
X

In [None]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
X_train

In [None]:
scaler = StandardScaler()
transformed_X_train = scaler.fit_transform(X_train)
transformed_X_test = scaler.transform(X_test)

In [None]:
# code chunk only for kaggle to create folder
improveDir = '/kaggle/working/impv/'
withoutPCADir = improveDir + 'without_pca/'
# makedirs also creates the parent improveDir and is a no-op when the folders
# already exist, so the cell can be re-run safely.
os.makedirs(withoutPCADir, exist_ok=True)

In [None]:
# Save the standardised (pre-PCA) split. The feature names are re-attached
# before writing, each split using its own source frame's columns, and the
# target directory comes from the cell that created it.
pd.DataFrame(transformed_X_train, columns=X_train.columns).to_csv(withoutPCADir + 'X_train.csv', index=False)
pd.DataFrame(transformed_X_test, columns=X_test.columns).to_csv(withoutPCADir + 'X_test.csv', index=False)
y_train.to_csv(withoutPCADir + 'y_train.csv', index=False)
y_test.to_csv(withoutPCADir + 'y_test.csv', index=False)

In [None]:
pd.DataFrame(transformed_X_train, columns=X_train.columns).head(2)

Let's create a PCA object from sklearn and fit it on the standardised training data. Here we use 90% of explained variance as the threshold (`n_components=0.9`) to reduce the dimensionality and the noise.

In [None]:
# A float n_components between 0 and 1 asks scikit-learn to keep the smallest
# number of components whose cumulative explained variance exceeds that
# fraction (here 90%).
pca = PCA(n_components=0.9)
# Fit on the standardised training split only. fit() returns the estimator
# itself, so `pca_data` and `pca` refer to the same fitted object.
pca_data = pca.fit(transformed_X_train)
pca_data.explained_variance_ratio_

With the 90% threshold our 18 attributes have been reduced to 7 components. This will help us to improve the performance of the algorithm significantly.

Let's convert the variance ratio to a cumulative percentage so that it is easy to understand how much variance the components explain together.

In [None]:
np.cumsum(np.round(pca_data.explained_variance_ratio_, decimals = 4) * 100)

Now lets visualise the weight of attribute on each component.

In [None]:
# One label per retained principal component: PC1, PC2, ...
labels = [f'PC{k}' for k in range(1, pca_data.n_components_ + 1)]

# components_ holds one row per component; transposing gives one row per
# original feature and one column per component.
loadings = pca_data.components_
loadings_df = pd.DataFrame(
    loadings.T,
    columns=labels,
    index=pd.Index(X_train.columns.values, name='variable'),
)
loadings_df

Lets visualise the weight on heatmap.

In [None]:
plt.figure(figsize=(15, 15))
ax = sns.heatmap(loadings_df, annot=True, cmap='Spectral')

Transform the loadings into long format so that we can visualise the weights of the first two components in a bar plot.

In [None]:
# After reset_index the first three columns are 'variable', 'PC1' and 'PC2',
# so this keeps the loadings of the first two components only.
weight_df = loadings_df.reset_index().iloc[:, :3]
# Reshape to long format: one row per (variable, component) pair with its weight.
weight_df = pd.melt(weight_df, id_vars="variable", var_name="Component", value_name="Weight")

In [None]:
sns.barplot(x='Component', y='Weight', hue='variable', data=weight_df, palette=sns.color_palette("Set2")).set_title("Weight plot of PC1 and PC2")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

Lets visualise the variance and component in scree plot.

In [None]:
pca_df = pd.DataFrame({'Explained variance':pca_data.explained_variance_ratio_,'Components':labels})
plt.figure(figsize=(15, 15))
sns.barplot(x='Components',y="Explained variance",  data=pca_df, color="c").set_title("PCA: variance plot")

In [None]:
plt.plot(np.cumsum(pca_data.explained_variance_ratio_))
plt.xlabel("Number of components")
plt.ylabel("Explained variance")

In [None]:
# column_names = X_test.columns

In [None]:
# pd.DataFrame(column_names).to_csv('data/impv/columns.csv', index=False)

In [None]:
# Project both standardised splits onto the principal components fitted on
# the training split.
# NOTE: this rebinds X_train / X_test from the original feature frames to the
# PCA-projected arrays. Earlier cells that read X_train.columns or
# X_test.columns should not be re-run after this cell without re-running the
# train/test split first.
X_train = pca.transform(transformed_X_train)
X_test = pca.transform(transformed_X_test)

In [None]:
X_train.shape

With the help of Principal Component Analysis (PCA) the number of features has been reduced from 18 to 7. This will help to reduce the noise that is present in the dataset.

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
pd.DataFrame(X_train).to_csv('/kaggle/working/impv/X_train.csv', index=False)
pd.DataFrame(X_test).to_csv('/kaggle/working/impv/X_test.csv', index=False)
y_train.to_csv('/kaggle/working/impv/y_train.csv', index=False)
y_test.to_csv('/kaggle/working/impv/y_test.csv', index=False)