<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Imports
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import set_config
# Set scikit-learn's global display option to 'diagram', which affects how
# estimators (such as the pipeline built below) are rendered in cell output
set_config(display = 'diagram')

In [6]:
# Read the published CSV directly from its URL, using the 'CountryYear'
# column as the row index
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTWgxeo-6msf7S-r9plFHodzGup8gTQifOIwJ78Al9a15aWglsJCR9VcU2VwwbT7RpdozD3EV6Vwg82/pub?output=csv'
df = pd.read_csv(
    url,
    index_col='CountryYear',
)

# Preview the first few rows
df.head()

Unnamed: 0_level_0,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
CountryYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Afghanistan2015,0,65.0,263,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
Afghanistan2014,0,59.9,271,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
Afghanistan2013,0,59.9,268,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
Afghanistan2012,0,59.5,272,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
Afghanistan2011,0,59.2,275,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [7]:
# Inspect the data
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would also emit a stray "None".
df.info()

# Blank line to separate the two reports
print()

# Count of missing values per column
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 2928 entries, Afghanistan2015 to Zimbabwe2000
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Status                           2928 non-null   int64  
 1   Life expectancy                  2928 non-null   float64
 2   Adult Mortality                  2928 non-null   int64  
 3   infant deaths                    2928 non-null   int64  
 4   Alcohol                          2735 non-null   float64
 5   percentage expenditure           2928 non-null   float64
 6   Hepatitis B                      2375 non-null   float64
 7   Measles                          2928 non-null   int64  
 8   BMI                              2896 non-null   float64
 9   under-five deaths                2928 non-null   int64  
 10  Polio                            2909 non-null   float64
 11  Total expenditure                2702 non-null   float64
 12  Dip

In [8]:
# Set the features and target
# Features are every column except the target; the target is 'Life expectancy'
X = df.drop(columns=['Life expectancy'])
y = df['Life expectancy']

# Hold out a test set, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [9]:
# Create the two preprocessing steps: a standard scaler and an imputer
# configured with the 'median' strategy
scaler = StandardScaler()
median_imputer = SimpleImputer(strategy='median')

In [10]:
# Chain the steps into one pipeline: imputer first, scaler second
preprocessing_pipeline = make_pipeline(
    median_imputer,
    scaler,
)

# Display the pipeline
preprocessing_pipeline

In [11]:
# Fit the pipeline on the training data only (X_test is not passed here, so
# the test split plays no part in fitting)
preprocessing_pipeline.fit(X_train)

In [12]:
# Apply the already-fitted pipeline to each split (train first, then test)
X_train_processed, X_test_processed = (
    preprocessing_pipeline.transform(split)
    for split in (X_train, X_test)
)

In [13]:
# Make sure there are no missing values
# A bare expression in the middle of a cell is evaluated and then discarded
# (only the last expression is displayed), so the check is written as
# assertions that fail loudly if any NaN survived preprocessing.
assert not np.isnan(X_train_processed).any(), \
    'X_train_processed still contains missing values'
assert not np.isnan(X_test_processed).any(), \
    'X_test_processed still contains missing values'

# Inspect the data
X_train_processed

array([[ 0.        , -0.81229166, -0.26366021, ..., -0.87868801,
         1.19451878,  1.92222335],
       [ 0.        ,  1.43809769,  0.15576412, ...,  0.58477555,
         0.22791761,  0.08271906],
       [ 0.        ,  2.02690924, -0.18501814, ...,  0.87303352,
        -0.68443553, -0.80637468],
       ...,
       [ 0.        , -1.10266448, -0.11511409, ..., -0.10260885,
        -0.88170108, -1.17427554],
       [ 0.        , -0.73163255, -0.24618419, ..., -0.96738278,
         0.97259504,  0.87983758],
       [ 0.        ,  1.43003177, -0.20249416, ...,  1.07259673,
        -3.11080174, -2.24731971]])