## Machine Learning

**Objectives**
* Perform exploratory Data Analysis and determine Training Labels

* Create a column for the class
* Standardize the data
* Split into training data and test data
* Find the best hyperparameters for SVM, Classification Trees, Logistic Regression and K-Nearest Neighbors

* Find the method that performs best using test data

In [1]:
# Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Matplotlib is a plotting library for python and pyplot gives us a MatLab like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
#Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
# Preprocessing allows us to standardize our data
from sklearn import preprocessing
# Allows us to split our data into training and testing data
from sklearn.model_selection import train_test_split
# Allows us to test parameters of classification algorithms and find the best one
from sklearn.model_selection import GridSearchCV
# Logistic Regression classification algorithm
from sklearn.linear_model import LogisticRegression
# Support Vector Machine classification algorithm
from sklearn.svm import SVC
# Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier
# K Nearest Neighbors classification algorithm
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Create the basis for a confusion matrix
def plot_confusion_matrix(y,y_predict):
    """Plot the confusion matrix of true labels against predicted labels.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_predict : array-like
        Labels predicted by a classifier for the same samples as ``y``.

    Returns
    -------
    None
        The matrix is drawn as an annotated seaborn heatmap on the current
        matplotlib axes and displayed with ``plt.show()``.
    """
    from sklearn.metrics import confusion_matrix

    # One shared list keeps the wording identical on both axes
    # (the x axis previously said 'land' while the y axis said 'landed').
    class_names = ['did not land', 'landed']

    cm = confusion_matrix(y, y_predict)
    ax = plt.subplot()
    # annot=True writes the count into each cell; fmt='d' keeps the counts
    # as plain integers instead of the default '.2g' float formatting.
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)
    plt.show()

In [None]:
# Load the data
# NOTE(review): `from js import fetch` and the top-level `await` below presumably
# target a Pyodide/JupyterLite kernel; confirm before running elsewhere.
from js import fetch
import io

# Download the first CSV, wrap its bytes in an in-memory buffer and parse it
# into the DataFrame `data` (its 'Class' column is used later as the target).
URL1 = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_2.csv"
resp1 = await fetch(URL1)
text1 = io.BytesIO((await resp1.arrayBuffer()).to_py())
data = pd.read_csv(text1)
# Display the first rows as a quick sanity check
data.head()

In [None]:
# Download the second CSV and parse it into X, the table that is standardized
# and split into training/test features further down.
# Relies on `fetch` and `io` imported in the previous cell.
URL2 = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_3.csv'
resp2 = await fetch(URL2)
text2 = io.BytesIO((await resp2.arrayBuffer()).to_py())
X = pd.read_csv(text2)
# Display up to the first 100 rows
X.head(100)

In [None]:
# Extract the 'Class' column of `data` as a NumPy array with to_numpy();
# Y is the target vector passed to train_test_split below.
Y = data['Class'].to_numpy()
# Display the array
Y

In [None]:
# Standardize the data in X then reassign it to the variable X
# (fit_transform returns an array, so X is no longer a DataFrame afterwards).
# NOTE(review): the scaler is fitted on all of X before the train/test split,
# so test-set statistics influence the scaling; consider fitting on the
# training split only.
transform = preprocessing.StandardScaler()
X = transform.fit_transform(X)
# Display the standardized values
X

In [None]:
# Split into training and testing sets: 20% of the samples are held out for
# testing, and random_state=2 fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 2)
# Display the shape of the held-out target array
Y_test.shape

### Logistic Regression Method

In [None]:
# Hyperparameter grid for logistic regression: three values of C, the l2
# penalty and the lbfgs solver. This cell only defines the grid; the same grid
# is assigned again in the following cell, where the LogisticRegression and
# GridSearchCV objects are actually created.
parameters ={'C':[0.01,0.1,1],
             'penalty':['l2'],
             'solver':['lbfgs']}


In [None]:
# Candidate hyperparameters for logistic regression
parameters = {
    "C": [0.01, 0.1, 1],
    "penalty": ["l2"],  # l1 = lasso, l2 = ridge
    "solver": ["lbfgs"],
}
lr = LogisticRegression()

# 10-fold cross-validated grid search, trained on the training split
logreg_cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=10)
logreg_cv.fit(X_train, Y_train)

# Report the winning parameter combination (best_params_) and its
# cross-validation accuracy (best_score_)
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [None]:
# Calculate the accuracy of the tuned logistic regression on the held-out
# test data using the method score:
logreg_cv.score(X_test, Y_test)

In [None]:
# Predict the test labels with the tuned logistic regression and plot the
# confusion matrix of true vs. predicted labels
yhat=logreg_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

### SVM Method

In [None]:
# Hyperparameter grid for the support vector machine.
# Each kernel is listed exactly once: the grid previously contained 'rbf'
# twice, which made the search cross-validate every rbf candidate a second
# time without being able to change the selected parameters.
parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'),
              # five log-spaced values from 1e-3 to 1e3 for each of C and gamma
              'C': np.logspace(-3, 3, 5),
              'gamma':np.logspace(-3, 3, 5)}
svm = SVC()

In [None]:

# 10-fold cross-validated grid search over the SVM grid, trained on the
# training split
svm_cv = GridSearchCV(estimator=svm, param_grid=parameters, cv=10)
svm_cv.fit(X_train, Y_train)

# Report the winning parameter combination (best_params_) and its
# cross-validation accuracy (best_score_)
print("tuned hpyerparameters :(best parameters) ",svm_cv.best_params_)
print("accuracy :",svm_cv.best_score_)

In [None]:
# Calculate the accuracy of the tuned SVM on the held-out test data using the
# method score:
svm_cv.score(X_test, Y_test)

In [None]:
# Predict the test labels with the tuned SVM and plot the confusion matrix of
# true vs. predicted labels
yhat=svm_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

### Decision Tree Classifier Method

In [None]:
# Hyperparameter grid for the decision tree search.
Parameters = {'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     # even depths from 2 to 18
     'max_depth': [2*n for n in range(1,10)],
     # 'auto' used to be listed here as well. For classifiers it was an alias
     # of 'sqrt' (so it only duplicated candidates); it was deprecated in
     # scikit-learn 1.1 and removed in 1.3, where it makes the fits fail.
     'max_features': ['sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10]}

# NOTE(review): no random_state is set, so the 'random' splitter and the
# feature sub-sampling can give different results from run to run.
tree = DecisionTreeClassifier()

In [None]:
# Build the 10-fold cross-validated grid search for the decision tree and
# train it on the training split
tree_cv = GridSearchCV(estimator=tree, param_grid=Parameters, cv=10)
tree_cv.fit(X_train, Y_train)

# Report the winning parameter combination (best_params_) and its
# cross-validation accuracy (best_score_)
print("tuned hpyerparameters :(best parameters) ",tree_cv.best_params_)
print("accuracy :",tree_cv.best_score_)

In [None]:
# Calculate the accuracy of the tuned decision tree on the held-out test data
# using the method score:
tree_cv.score(X_test, Y_test)

In [None]:
# Predict the test labels with the tuned decision tree and plot the confusion
# matrix of true vs. predicted labels
yhat = tree_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

### KNN Method

In [None]:
# Search space for k-nearest neighbours: k from 1 to 10, the four
# neighbour-search algorithms, and p in {1, 2}
parameters = dict(
    n_neighbors=list(range(1, 11)),
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

KNN = KNeighborsClassifier()

In [None]:
# Build the 10-fold cross-validated grid search for k-nearest neighbours and
# train it on the training split
knn_cv = GridSearchCV(estimator=KNN, param_grid=parameters, cv=10)
knn_cv.fit(X_train, Y_train)

# Report the winning parameter combination (best_params_) and its
# cross-validation accuracy (best_score_)
print("tuned hpyerparameters :(best parameters) ",knn_cv.best_params_)
print("accuracy :",knn_cv.best_score_)

In [None]:
# Calculate the accuracy of the tuned k-nearest-neighbours model on the
# held-out test data using the method score:
knn_cv.score(X_test, Y_test)

In [None]:
# Predict the test labels with the tuned k-nearest-neighbours model and plot
# the confusion matrix of true vs. predicted labels
yhat = knn_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

In [None]:
# Score every tuned model on the held-out test split and print the results in
# one table so the accuracies can be compared side by side.
predictors = [knn_cv, svm_cv, logreg_cv, tree_cv]
results = []
for predictor in predictors:
    score = predictor.score(X_test, Y_test)
    # Label the row with the class name of the estimator wrapped by the grid
    # search; str(predictor) would dump the whole GridSearchCV repr.
    results.append([type(predictor.estimator).__name__, score])

# Each row holds two values (name, score), so two column names are required;
# passing a single name made the DataFrame constructor raise.
results_df = pd.DataFrame(results, columns=['Predictors', 'Test accuracy'])
print(results_df)

# Pick the model with the highest test accuracy (the first one wins a tie).
# These two names were previously initialised but never filled in.
best_predictor, best_result = max(results, key=lambda row: row[1])
print(f"Best method on test data: {best_predictor} (accuracy {best_result:.4f})")