# Performing a Data Partition on a Sample Dataset

In [None]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load scikit-learn's wine dataset; the following cell reads its .data and
# .target attributes to build the feature and label frames.
data = load_wine()

In [None]:
# Wrap the features and the target in DataFrames, then show both shapes as a
# quick sanity check.
X, Y = pd.DataFrame(data.data), pd.DataFrame(data.target)
print(*(frame.shape for frame in (X, Y)))

(178, 13) (178, 1)


In [None]:
# Hold out 20% of the rows as the final test set.
# random_state pins the shuffle so the partition is identical on every run.
# NOTE: X and Y are rebound to the remaining 80%, so re-running this cell
# without reloading the data would split the already-reduced frames again.
X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
# Report the shape of every partition produced by the split above.
print(*(part.shape for part in (X, X_test, Y, Y_test)))

(142, 13) (36, 13) (142, 1) (36, 1)


The test set now holds 36 rows, but taking another 20% of the 142 remaining
rows would produce a smaller dev set. To obtain a dev set
that's the same size as the test set, it is necessary to
calculate the proportion of the size of the test set over the
size of the remaining train set before creating the dev set. This
value will be used as the test_size for the next step:

In [None]:
# Fraction of the remaining rows that gives a dev set exactly as large as the
# test set. It is derived from the frames themselves instead of hard-coded row
# counts, so it stays correct if the dataset or the first split ratio changes.
dev_size = X_test.shape[0] / X.shape[0]
print(dev_size)

0.2535211267605634


In [None]:
# Carve the dev set out of the remaining rows; whatever is left becomes the
# training set. random_state pins the shuffle so the split is reproducible.
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X, Y, test_size=dev_size, random_state=0
)
# Shapes of the three final partitions: train, dev and test.
print(X_train.shape, Y_train.shape, X_dev.shape,
      Y_dev.shape, X_test.shape, Y_test.shape)

(106, 13) (106, 1) (36, 13) (36, 1) (36, 13) (36, 1)


You have successfully split the dataset into three subsets to develop
efficient machine learning projects. Feel free to test different split
ratios.

# Cross-Validation

In [None]:
# K fold cross validation

from sklearn.datasets import load_wine
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [None]:
# Start again from the full wine dataset for the cross-validation example.
data = load_wine()
X, Y = pd.DataFrame(data.data), pd.DataFrame(data.target)

In [None]:
# Reserve 10% of the rows as the test set; the folds are built from the rest.
# random_state pins the shuffle so the partition is identical on every run.
# NOTE: X and Y are rebound to the remaining 90%, so re-running this cell
# without reloading the data would split the already-reduced frames again.
X, X_test, Y, Y_test = train_test_split(
    X, Y, test_size=0.10, random_state=0
)

In [None]:
# Ten-fold splitter over the rows that remain after the test split.
kf = KFold(n_splits = 10)
# NOTE(review): split() appears to hand back a one-shot iterator of
# (train indices, dev indices) pairs, so `splits` can only be looped over
# once per call — confirm against the scikit-learn documentation.
splits = kf.split(X)

In [None]:
# Request a fresh sequence of folds each time this cell runs. Looping over the
# previously created `splits` iterator works only once: a second run of the
# cell would find it exhausted and silently skip the loop body.
for train_index, test_index in kf.split(X):
    # Positional row selection for the current fold.
    X_train, X_dev = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_dev = Y.iloc[train_index], Y.iloc[test_index]
    # Model training and evaluation for this fold belong here; after the loop
    # the four names hold the partitions of the last fold only.

In [None]:
# Shapes of the train and dev partitions of the last fold, plus the test set.
print(*(part.shape for part in (X_train, Y_train, X_dev, Y_dev, X_test, Y_test)))

(144, 13) (144, 1) (16, 13) (16, 1) (18, 13) (18, 1)


The code to train and evaluate the model should be written
inside the loop body, given that the objective of the
cross-validation procedure is to train and validate the model using
the different split configurations.

## Data Partitioning on a Handwritten Digit Dataset

In [None]:
from sklearn.datasets import load_digits
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [None]:
# Load the handwritten digits dataset and wrap features and target in frames.
data = load_digits()
X, Y = pd.DataFrame(data.data), pd.DataFrame(data.target)

In [None]:
# Hold out 20% of the digits as the test set.
# random_state pins the shuffle so the partition is identical on every run.
# NOTE: X and Y are rebound to the remaining 80%, so re-running this cell
# without reloading the data would split the already-reduced frames again.
X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(X.shape, X_test.shape, Y.shape, Y_test.shape)

(1437, 64) (360, 64) (1437, 1) (360, 1)


In [None]:
# Twenty-fold cross-validation over the non-test rows. Splitter, fold iterator
# and loop live in one cell, so the cell can be re-run as a unit.
kf = KFold(n_splits=20)
splits = kf.split(X)
for train_index, test_index in splits:
    # Training rows first, then the dev rows of the same fold.
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_dev, Y_dev = X.iloc[test_index], Y.iloc[test_index]

In [None]:
# Shapes of the train and dev partitions of the last fold, plus the test set.
print(*(part.shape for part in (X_train, Y_train, X_dev, Y_dev, X_test, Y_test)))

(1366, 64) (1366, 1) (71, 64) (71, 1) (360, 64) (360, 1)


# Calculating Different Evaluation Metrics on a Classification Task

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
# Load the breast cancer dataset and wrap features and target in frames.
data = load_breast_cancer()
X, Y = pd.DataFrame(data.data), pd.DataFrame(data.target)

In [3]:
# Keep 10% of the rows for testing; the fixed seed makes the split repeatable.
(X_train, X_test, Y_train, Y_test) = train_test_split(
    X, Y, test_size=0.1, random_state=0
)

The random_state parameter is used to set a seed that will
ensure the same results every time you run the code. This
guarantees that you will get the same results as the ones
reflected in this exercise. Different numbers can be used as
the seed; however, use the same number as suggested in the
exercises and activities of this chapter to get the same
results as the ones shown here.

In [4]:
# Fit a seeded decision tree on the training rows, then predict the labels of
# the held-out test rows.
model = tree.DecisionTreeClassifier(random_state=0).fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [5]:
# Compare the held-out labels with the model's predictions; the resulting
# matrix is displayed as this cell's output.
confusion_matrix(Y_test, Y_pred)

array([[21,  1],
       [ 6, 29]])

In [6]:
# Compute the three classification metrics on the test predictions first ...
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
# ... then report them in a fixed order.
print("accuracy:", accuracy)
print("precision:", precision)
print("recall:", recall)

accuracy: 0.8771929824561403
precision: 0.9666666666666667
recall: 0.8285714285714286


# Calculating Evaluation Metrics on a Regression Task

In [7]:
from sklearn.datasets import load_boston
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import numpy as np

In [8]:
# Load the Boston housing data; the recorded output of this cell is
# scikit-learn's warning that discourages use of this dataset.
data = load_boston()
X, Y = pd.DataFrame(data.data), pd.DataFrame(data.target)
# 90/10 train/test partition with a fixed seed so the metrics are repeatable.
(X_train, X_test, Y_train, Y_test) = train_test_split(
    X, Y, test_size=0.1, random_state=0
)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [9]:
# Fit an ordinary linear regression on the training rows, then predict the
# target for the held-out test rows.
model = linear_model.LinearRegression().fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [10]:
# Score the test predictions with both error metrics; RMSE is the square root
# of the mean squared error.
MAE = mean_absolute_error(Y_test, Y_pred)
RMSE = np.sqrt(mean_squared_error(Y_test, Y_pred))
print("MAE:", MAE)
print("RMSE:", RMSE)

MAE: 3.9357920841193095
RMSE: 6.45945634367614


The main difference between these two metrics is that
the MAE assigns the same weight of importance to all errors, while
the RMSE squares the error, assigning higher weights to larger
errors.

## Evaluating the Performance of the Model Trained on a Handwritten Dataset

In [11]:
from sklearn.datasets import load_digits
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


In [12]:
# Load the handwritten digits and wrap features and target in frames.
data = load_digits()
X, Y = pd.DataFrame(data.data), pd.DataFrame(data.target)
# Seeded 80/20 split; X and Y are rebound to the training portion.
X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(*(part.shape for part in (X, X_test, Y, Y_test)))

(1437, 64) (360, 64) (1437, 1) (360, 1)


In [13]:
# Fit a seeded decision tree on the training portion (X, Y), then predict the
# digit for every held-out test row.
model = tree.DecisionTreeClassifier(random_state=0).fit(X, Y)
Y_pred = model.predict(X_test)

In [14]:
# Compare the held-out digit labels with the predictions; the ten-class matrix
# is displayed as this cell's output.
confusion_matrix(Y_test, Y_pred)

array([[24,  0,  0,  0,  0,  0,  1,  0,  0,  2],
       [ 0, 31,  0,  2,  1,  0,  1,  0,  0,  0],
       [ 1,  0, 29,  0,  0,  0,  2,  2,  1,  1],
       [ 0,  0,  2, 27,  0,  0,  0,  0,  0,  0],
       [ 1,  1,  0,  0, 26,  0,  1,  1,  0,  0],
       [ 0,  1,  1,  0,  0, 34,  0,  0,  1,  3],
       [ 1,  1,  1,  1,  1,  0, 39,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  1, 37,  0,  0],
       [ 1,  3,  3,  5,  0,  1,  0,  1, 24,  1],
       [ 0,  0,  1,  4,  0,  1,  0,  0,  1, 34]])

In [15]:
# Accuracy is computed directly on the ten-class labels.
accuracy = accuracy_score(Y_test, Y_pred)
print("accuracy:", accuracy)
# Precision and recall are intentionally left disabled at this point.
# NOTE(review): with their default averaging these scorers presumably reject
# multiclass targets — confirm against the scikit-learn documentation. They
# are computed further down, once the labels have been reduced to two classes.
# precision = precision_score(Y_test, Y_pred)
# print("precision:", precision)
# recall = recall_score(Y_test, Y_pred)
# print("recall:", recall)

accuracy: 0.8472222222222222


In [16]:
# code to convert Y_test and Y_pred into a one-
# hot vector.

Y_test_2 = Y_test[:]
Y_test_2[Y_test_2 != 6] = 1
Y_test_2[Y_test_2 == 6] = 0
Y_pred_2 = Y_pred
Y_pred_2[Y_pred_2 != 6] = 1
Y_pred_2[Y_pred_2 == 6] = 0

In [17]:
confusion_matrix(Y_test, Y_pred)

array([[ 39,   5],
       [  6, 310]])

In [18]:
accuracy = accuracy_score(Y_test, Y_pred)
print("accuracy:", accuracy)
precision = precision_score(Y_test, Y_pred)
print("precision:", precision)
recall = recall_score(Y_test, Y_pred)
print("recall:", recall)

accuracy: 0.9694444444444444
precision: 0.9841269841269841
recall: 0.9810126582278481
