# Automate the model selection problem

build a notebook that takes the data, perform the preprocessing, evaluation and choose the best model from multiple models.

it would be great if you save the model to a pickle file at the end.

In [17]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import joblib
import os

In [32]:
PATH = "../datasets/"
dataset = "Social_Network_Ads.csv"
target = "Purchased"
df = pd.read_csv(PATH+dataset)
y_col = target

In [33]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


## Preparing the data:

In [34]:
features = df.drop(y_col, axis=1)
goal = df[ y_col]

encoder = LabelEncoder()
for x in features:
    if features[x].dtype == object:
        features[x] = encoder.fit_transform(features[x])

features = features.values
goal = goal.values

scaler = StandardScaler()
features = scaler.fit_transform(features)

x_train, x_test, y_train, y_test = train_test_split(features, goal, test_size = 0.2, random_state = 0)

In [35]:
pca = PCA(n_components=4)
pca.fit(x_train)
x_train = pca.transform(x_train)
x_test  = pca.transform(x_test)

## Creating the classifiers:

In [36]:
svm = SVC()
knn = KNeighborsClassifier()
logistic = LogisticRegression()

## Train the classifiers and search for the best instance in each classifier and calculate their score:

In [37]:
svmParameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf']}]

knnParameters = [{'n_neighbors': [1, 10, 100, 287]},
              {'n_neighbors': [2, 20, 200, 287]},
             {'n_neighbors': [3, 30, 150, 287]}]

logisticParameters = [{'random_state': [0]}]

classifiers = {"svm": svm, "knn": knn, "logistic": logistic}

allParameters = [svmParameters, knnParameters, logisticParameters]

clfInfo = {}

for classifier, param, name in zip(classifiers.values(), allParameters, classifiers.keys()):
    grid_search = GridSearchCV(estimator = classifier, param_grid = param, cv = 10, n_jobs = -1)
    grid_search = grid_search.fit(x_train, y_train)
    print("{} best accuracy is :{}".format(name, grid_search.best_score_))
    print("{} best params is :{}".format(name, grid_search.best_params_))
    print("----------------------------------------------------------------------")
    clfInfo[name] = {"score": grid_search.best_score_, "params": grid_search.best_params_}

svm best accuracy is :0.890625
svm best params is :{'C': 1, 'kernel': 'rbf'}
----------------------------------------------------------------------
knn best accuracy is :0.88125
knn best params is :{'n_neighbors': 3}
----------------------------------------------------------------------
logistic best accuracy is :0.825
logistic best params is :{'random_state': 0}
----------------------------------------------------------------------


## Find the best classifier:

In [38]:
score = 0
clf = None
param = None
for classifier in clfInfo:
    if clfInfo[classifier]["score"] > score:
        score = clfInfo[classifier]["score"]
        clf = classifier
        param = clfInfo[classifier]["params"]
        
print("The best Classifier is : {} with score {} and best combination of parameters : {}".format(clf, score, param))

clf = classifiers[clf]

The best Classifier is : svm with score 0.890625 and best combination of parameters : {'C': 1, 'kernel': 'rbf'}


## Setting the best parameters found for the best found classifier:

In [39]:
params = param
names = params.keys()
values = params.values()

for name, value in zip(names, values):
    setattr(clf, name, value)


## Generate Pickle Files:

In [41]:
RES_DIR = os.path.dirname(os.path.abspath(__name__))
DATA_DIR = os.path.join(RES_DIR, dataset)
PICKLE_DIR = os.path.join(RES_DIR, 'model.pkl')
SCALER_DIR = os.path.join(RES_DIR, 'scale.pkl')
LABEL_DIR = os.path.join(RES_DIR, 'label.pkl')

joblib.dump(clf, PICKLE_DIR)
joblib.dump(scaler, SCALER_DIR)
joblib.dump(encoder, LABEL_DIR)

['F:\\Work\\Python Machine Learning\\CLS-Python\\CLS-Python\\Machine_learning\\Session_5\\assigns\\label.pkl']