

## Discussion 
- Each model has different performance characteristics.

#### How to select good ones?
- Just as you visualize data, visualize model accuracy.
- Use different ways of looking at the estimated accuracy of machine learning algorithms, and select a couple of algorithms to finalize.
- One way could be to show the average accuracy, variance, and other properties of the distribution of model accuracies. 

## <span style="color:red"> Problem Statement - 1 </span>
- Compare multiple different classification algorithms
    - Logistic Regression
    - Linear Discriminant Analysis
    - k-Nearest Neighbors
    - Classification and Regression Trees
    - Gaussian Naive Bayes
    - Support Vector Machines

#### Load Python libraries and dataset

In [None]:
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from numpy import set_printoptions

from sklearn import datasets


In [None]:
# Read the CSV file into a DataFrame (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer="../data/pima-indians-diabetes.csv")

#### Check Your Data

In [None]:
# preview the first 5 rows of the dataset
print(data.head(n=5))

### Separate input and target variables

In [None]:
# separate the input columns (first 8) from the target column (index 8)
data_array = data.values
_X, y = data_array[:, :8], data_array[:, 8]

# standardise the inputs: fit a StandardScaler on them, then transform them
scaler = StandardScaler()
scaler.fit(_X)
X = scaler.transform(_X)

### Prepare classification models

In [None]:
# Candidate classifiers as (short label, estimator) pairs, each built with
# its constructor defaults.
models_clf = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]
print(models_clf)

### Evaluate each model

In [None]:
# Cross-validate every classifier, keeping the per-fold accuracy scores
# (`results`) and their labels (`names`) for the comparison plot.
results, names = [], []

print('Mean (s.d.) of accuracy for each algorithm\n==========================================')
for name, model in models_clf:
    # 10-fold split, scored on accuracy
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=kfold)
    names.append(name)
    results.append(cv_results)
    # one summary line per model: label, mean score, standard deviation
    result = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
    print(result)


In [None]:
# Box plot comparing the cross-validation scores stored in `results`,
# one box per entry, labelled with the matching entry of `names`.
fig = pyplot.figure(figsize=(12, 12))
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(results)
ax.set_xticklabels(names)
fig.suptitle('Algorithm Comparison')
pyplot.show()

### The box plot shows the spread of accuracy scores across the cross-validation folds for each algorithm.

## <span style="color:red"> Problem Statement - 2 </span>
- Compare multiple different regression algorithms
    - Linear Regression
    - Ridge Regression
    - LASSO Linear Regression
    - Elastic Net Regression
    - k-Nearest Neighbors Regression
    - Classification and Regression Trees
    - Support Vector Machines

In [None]:
import pandas as pd
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from numpy import set_printoptions

from sklearn import datasets


In [None]:
# Load Dataset
# Builds `data`: the 13 Boston housing feature columns plus a 'PRICE'
# target column.
import warnings

try:
    # load_boston is deprecated (and removed in scikit-learn >= 1.2);
    # where it still exists, silence its deprecation warning.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        boston = datasets.load_boston()
    boston_features = boston.data
    boston_target = boston.target
    boston_columns = list(boston.feature_names)
except (ImportError, AttributeError):
    # Newer scikit-learn raises here because load_boston was removed.
    # Rebuild the same arrays from the original source, following the
    # recipe given in scikit-learn's removal notice (needs network access).
    import numpy as np

    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    # Each record is spread over two consecutive rows of the raw file.
    boston_features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    boston_target = raw_df.values[1::2, 2]
    boston_columns = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ]

data = pd.DataFrame(boston_features, columns=boston_columns)
data['PRICE'] = boston_target

# Print the preview without wrapping wide frames across several blocks
with pd.option_context('expand_frame_repr', False):
    print(data.head(5))

In [None]:
# separate the feature columns (first 13) from the target column (index 13)
data_array = data.values
X2, y2 = data_array[:, :13], data_array[:, 13]

### Prepare regression models

In [None]:
# Candidate regressors as (short label, estimator) pairs, each built with
# its constructor defaults.
models_reg = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('EN', ElasticNet()),
    ('KNR', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVM', SVR()),
]
print(models_reg)

In [None]:
# Cross-validate every regressor on (X2, y2), keeping the per-fold scores
# (`results2`) and their labels (`names2`) for the comparison plot.
results2 = []
names2 = []

# The scores are negative mean squared errors (closer to 0 is better),
# not accuracies, so the header names the metric actually reported.
header2 = 'Mean (s.d.) of negative MSE for each algorithm'
print(header2)
print('=' * len(header2))
for name2, model2 in models_reg:
    # 10-fold split, scored with negated mean squared error
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model2, X2, y2, cv=kfold, scoring='neg_mean_squared_error')
    results2.append(cv_results)
    names2.append(name2)
    # one summary line per model: label, mean score, standard deviation
    result2 = f"{name2}: {cv_results.mean():f} ({cv_results.std():f})"
    print(result2)


In [None]:
# Box plot comparing the cross-validation scores stored in `results2`,
# one box per entry, labelled with the matching entry of `names2`.
fig = pyplot.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(results2)
ax.set_xticklabels(names2)
fig.suptitle('Algorithm Comparison')
pyplot.show()

## <span style="color:red"> Problem Statement - 3 </span>
- Compare different Pipelines

In [None]:
from sklearn.pipeline import Pipeline

# Regressors to wrap, as (step label, estimator) pairs. The Ridge step is
# labelled 'Ridge' (it was previously mislabelled 'LR', duplicating the
# LinearRegression label).
pipeline_regressors = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVR', SVR()),
]

# One ('<label>_pipe', Pipeline) pair per regressor; each pipeline gets its
# own StandardScaler step followed by the regressor.
pipelines = [
    (f'{step_label}_pipe',
     Pipeline([('Scaler', StandardScaler()), (step_label, regressor)]))
    for step_label, regressor in pipeline_regressors
]

In [None]:
# Cross-validate every scaler+regressor pipeline, keeping the per-fold
# scores (`results3`) and their labels (`names3`) for the comparison plot.
results3 = []
names3 = []

# The scores are negative mean squared errors (closer to 0 is better),
# not accuracies, so the header names the metric actually reported.
header3 = 'Mean (s.d.) of negative MSE for each algorithm'
print(header3)
print('=' * len(header3))
for name3, pipe in pipelines:
    # 10-fold split, scored with negated mean squared error
    kfold = KFold(n_splits=10)
    # The pipelines wrap regressors, so evaluate them on the regression
    # data (X2, y2). X and y hold the already-scaled classification inputs
    # and class labels, which are the wrong data for these models.
    cv_results3 = cross_val_score(pipe, X2, y2, cv=kfold, scoring='neg_mean_squared_error')
    results3.append(cv_results3)
    names3.append(name3)
    # one summary line per pipeline: label, mean score, standard deviation
    result3 = f"{name3}: {cv_results3.mean():f} ({cv_results3.std():f})"
    print(result3)

In [None]:
# Box plot comparing the cross-validation scores stored in `results3`,
# one box per entry, labelled with the matching entry of `names3`.
fig = pyplot.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(results3)
ax.set_xticklabels(names3)
fig.suptitle('Algorithm Comparison')
pyplot.show()