# My first `scikit-learn` notebook

In [1]:
import pandas as pd
import numpy as np
from random import choices
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

Load a dataset

In [2]:
forecast = pd.read_csv('Forecast.csv')
forecast.head()

Unnamed: 0,Temperature,Humidity,Wind_Speed,Go-Out
0,6,85,30,0
1,14,90,35,0
2,15,86,8,1
3,21,56,15,1
4,17,67,9,1


Setup the `numpy` arrays to use to train classifiers

In [3]:
y = forecast.pop('Go-Out').values  # target feature
X = forecast.values                # training data
type(X),type(y)

(numpy.ndarray, numpy.ndarray)

Train a *k*-NN classifier

In [4]:
kNN = KNeighborsClassifier(n_neighbors=3) 
kNN.fit(X,y)

KNeighborsClassifier(n_neighbors=3)

Set up sample test data and use for prediction

In [5]:
X_test = np.array([[8,70,11],
                   [8,69,15]])
kNN.predict(X_test)

array([1, 0])

All `sklearn` classifiers implement the `Estimator` API.

In [6]:
tree = DecisionTreeClassifier()
tree.fit(X,y)
tree.predict(X_test)

array([1, 1])

In [7]:
lr = LogisticRegression()
lr.fit(X,y)
lr.predict(X_test)

array([0, 0])

Swapping between classifiers (Estimators) makes model selection easy.  
Note that each predictor gives different results for the test data examples...

In [8]:
cfrs = [kNN,tree,lr]
for cfr in cfrs:
    cfr.fit(X,y)
    print(cfr.predict(X_test))

[1 0]
[1 1]
[0 0]


## Preprocessing
All preprocessing modules implement the `Transformer`  API.

In [9]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X)   # standardise to zero mean and unit variance
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-1.59094327, -0.05406252, -0.79537086],
       [-1.59094327, -0.10040182, -0.37117307]])

In [10]:
mm_scaler = preprocessing.MinMaxScaler()        # standardise to range [0,1]
mm_scaler.fit(X)
X_scaled = mm_scaler.transform(X)
X_test_scaled = mm_scaler.transform(X_test)
X_test_scaled

array([[0.125     , 0.6875    , 0.17241379],
       [0.125     , 0.675     , 0.31034483]])

# Try It Yourself

Using the `penguin_size` dataset, experiment with some of the different models available in *sci-kit learn*. Some examples of what you can try are

* [Decision Trees](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)
* [Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html)
* [KNN Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
* [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

You can try each of the algorithms with and without scalers, and explore the parameters outlined in the SKLearn documentation for each to see what impact it has on the results.


In [107]:
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

# Before Data Scalling

In [108]:
from sklearn.metrics import accuracy_score

def encode_features(df):
    """
    Some models (such as the decision tree, for example) don't work with categorical data. This function
    goes through each column in the dataframe and uses a label encoder to convert categorical data to numerical.
    For example, `Gentoo`, `Emperor`, `Chinstrap` as penguin species would get replaced with 1, 2, 3
    
    We'll talk more about label encoding and other things to watch out for as the module progresses.
    """
    le = preprocessing.LabelEncoder()
    for i in range(len(df.columns)):
        df.iloc[:,i] = le.fit_transform(df.iloc[:,i])
    return df

penguins_train = pd.read_csv('penguins_train.csv')
penguins_test = pd.read_csv('penguins_test.csv')


# Preprocessing goes here. Make sure that any preprocessing done to the training data is also done to the test data




penguins_train = encode_features(penguins_train)
penguins_test = encode_features(penguins_test)


y_train = penguins_train.pop('species')
X_train = penguins_train.values

y_test = penguins_test.pop('species')
X_test = penguins_test.values



# y_pred = [] # the predict(X_test) method on your classifier will return a list of predictions for y_test

# # create a classifier
# # make sure you `fit` the classifier on the training data before you try to predict


# # A handy way to measure the accuracy of your classifier which compares actual targets against predictions
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy is {accuracy}")

classifiers =[]
model1 = GaussianNB()
classifiers.append(model1)
model2 = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)
classifiers.append(model2)
model3 = DecisionTreeClassifier()
classifiers.append(model3)
model4 = KNeighborsClassifier(n_neighbors=3)
classifiers.append(model4)

for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy of %s is %s"%(clf, acc)),

Accuracy of GaussianNB() is 0.5277777777777778
Accuracy of LogisticRegression(class_weight='balanced', max_iter=10000) is 0.7222222222222222
Accuracy of DecisionTreeClassifier() is 0.4444444444444444
Accuracy of KNeighborsClassifier(n_neighbors=3) is 0.4444444444444444


In [109]:
mm_scaler = preprocessing.MinMaxScaler()        # standardise to range [0,1]
mm_scaler.fit(X_train)
X_train_s = mm_scaler.transform(X_train)
X_test_s = mm_scaler.transform(X_test)
X_test_s



classifiers_s =[]
model1 = GaussianNB()
classifiers_s.append(model1)
model2 = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)
classifiers_s.append(model2)
model3 = DecisionTreeClassifier()
classifiers_s.append(model3)
model4 = KNeighborsClassifier(n_neighbors=3)
classifiers_s.append(model4)

for clf_s in classifiers_s:
    clf_s.fit(X_train_s, y_train)
    y_pred_s= clf_s.predict(X_test_s)
    acc_s = accuracy_score(y_test, y_pred_s)
    print("Accuracy of %s is %s"%(clf_s, acc_s)),

Accuracy of GaussianNB() is 0.8055555555555556
Accuracy of LogisticRegression(class_weight='balanced', max_iter=10000) is 0.4444444444444444
Accuracy of DecisionTreeClassifier() is 0.4444444444444444
Accuracy of KNeighborsClassifier(n_neighbors=3) is 0.5277777777777778


In [105]:
scaler = preprocessing.StandardScaler().fit(X_train)   # standardise to zero mean and unit variance
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

classifiers_scaled =[]
model1 = GaussianNB()
classifiers_scaled.append(model1)
model2 = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)
classifiers_scaled.append(model2)
model3 = DecisionTreeClassifier()
classifiers_scaled.append(model3)
model4 = KNeighborsClassifier(n_neighbors=3)
classifiers_scaled.append(model4)

for clf_scaled in classifiers_scaled:
    clf_scaled.fit(X_train_scaled, y_train)
    y_pred_scaled= clf_scaled.predict(X_test_scaled)
    acc_scaled = accuracy_score(y_test, y_pred_scaled)
    print("Accuracy of %s is %s"%(clf_scaled, acc_scaled)),

Accuracy of GaussianNB() is 0.8333333333333334
Accuracy of LogisticRegression(class_weight='balanced', max_iter=10000) is 0.5833333333333334
Accuracy of DecisionTreeClassifier() is 0.4444444444444444
Accuracy of KNeighborsClassifier(n_neighbors=3) is 0.5833333333333334
