## Pipeline

In [6]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
import os

# Directory that holds ChemicalProcess.csv. The default is the original
# author's local Windows path, which will not exist on other machines; set
# the CHEMICAL_DATA_DIR environment variable to point somewhere else without
# having to edit the notebook.
DATA_DIR = os.environ.get(
    "CHEMICAL_DATA_DIR",
    "D:\\meridianthe4\\PML\\Cases\\Chemical Process Data",
)
# The CSV is opened by bare file name, so the working directory has to be
# the data directory.
os.chdir(DATA_DIR)

In [3]:
# Read the chemical-process dataset from the current working directory.
chemical = pd.read_csv('ChemicalProcess.csv')

# 'Yield' is the regression target; every other column is a predictor.
y = chemical['Yield']
X = chemical.drop(columns='Yield')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)

In [5]:
# Baseline model: mean-impute missing values, then run a 5-nearest-neighbour
# regression on the unscaled features.
imputer = SimpleImputer(strategy='mean')
knn = KNeighborsRegressor(n_neighbors=5)
pipe = Pipeline([('IMP', imputer), ("KNN", knn)])

# fit() returns the fitted pipeline, so the prediction is chained onto it.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Mean absolute error on the held-out split.
print(mean_absolute_error(y_true=y_test, y_pred=y_pred))

1.2861132075471702


In [10]:
# Same idea as the baseline, but with standardisation inserted between the
# imputer and KNN so every feature is on a comparable scale when distances
# are measured, and with K set to 3.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
knn = KNeighborsRegressor(n_neighbors=3)
steps = [('IMP', imputer), ('SCL', scaler), ("KNN", knn)]
pipe = Pipeline(steps=steps)

# fit() returns the fitted pipeline, so the prediction is chained onto it.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Mean absolute error on the held-out split.
print(mean_absolute_error(y_true=y_test, y_pred=y_pred))

1.0376729559748423


In [11]:
# Manual grid search over the imputation strategy and the number of
# neighbours; one row of results is recorded per (strategy, K) combination.
# NOTE(review): every candidate is scored on the test split, so the best MAE
# found here is an optimistic estimate -- consider cross-validating on the
# training split instead.
imputers = ['mean', 'median']
scaler = StandardScaler()  # single instance, refit by each pipeline below
Ks = np.arange(1, 16)      # K = 1 .. 15
scores = []
for k in Ks:
    for imp in imputers:
        # Fresh imputer and regressor for this configuration.
        imputer = SimpleImputer(strategy=imp)
        knn = KNeighborsRegressor(n_neighbors=k)
        pipe = Pipeline([('IMP', imputer), ('SCL', scaler), ("KNN", knn)])

        # Fit on the training split, then score on the held-out split.
        y_pred = pipe.fit(X_train, y_train).predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        scores.append([imp, k, mae])

# Collect the results into a table for sorting and inspection.
df_scores = pd.DataFrame(scores, columns=['imputer', 'K', 'score'])

In [12]:
# Rank the configurations by test MAE, lowest (best) first; sort_values
# sorts in ascending order by default.
df_scores.sort_values(by='score')

Unnamed: 0,imputer,K,score
4,mean,3,1.037673
5,median,3,1.040063
3,median,2,1.043396
2,mean,2,1.044623
6,mean,4,1.055047
7,median,4,1.05783
10,mean,6,1.085535
12,mean,7,1.086873
13,median,7,1.094879
11,median,6,1.095566
