# Load dataset

Use ```reader()``` method from csv package to read ```.csv``` files.
<br />
Use ```os.path.join()``` from os package to join directory names, because paths names are different in Windows and Linux.

In [1]:
import os
from csv import reader
from math import sqrt
import random

In [2]:
def load_csv(filename):
    file = open(filename, "r")
    lines = reader(file)
    dataset  = list(lines)
    return dataset

Listing available datasets

In [3]:
!ls ../datasets

data_banknote_authentication.txt
iris.csv
pima-indians-diabetes.csv
sonar.all-data
winequality-white.csv


In [4]:
data_name = 'pima-indians-diabetes.csv'
data_path = os.path.join('..', 'datasets', data_name)
dataset = load_csv(data_path)

print(f'Loaded data file {data_path} with {len(dataset)} rows and {len(dataset[0])} columns')
print('\n')
print('Sample data :')
print(f'{dataset[0]}')

Loaded data file ..\datasets\pima-indians-diabetes.csv with 768 rows and 9 columns


Sample data :
['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']


Convert required string columns to float.

In [5]:
def col_str_to_float(dataset, col):
    for row in dataset:
        row[col] = float(row[col].strip())

In [6]:
col_to_float = [0, 1, 2, 3]
for col in col_to_float:
    col_str_to_float(dataset, col)

print(dataset[0])

[6.0, 148.0, 72.0, 35.0, '0', '33.6', '0.627', '50', '1']


Convert class column to integers

In [7]:
def col_str_to_int(dataset, col):
    classes = set([row[col] for row in dataset])
    lookup = {v : i for i, v in enumerate(classes)}
    for row in dataset:
        row[col] = lookup[row[col]]
    return lookup

In [8]:
data_name = 'iris.csv'
data_path = os.path.join('..', 'datasets', data_name)
dataset = load_csv(data_path)

print(f'Loaded data file {data_path} with {len(dataset)} rows and {len(dataset[0])} columns')
print('\n')
print('Sample data :')
print(f'{dataset[0]}')

Loaded data file ..\datasets\iris.csv with 150 rows and 5 columns


Sample data :
['5.1', '3.5', '1.4', '.2', 'Setosa']


In [9]:
print('Before conversion :')
print(dataset[0])
col_to_float = [0, 1, 2, 3]
for col in col_to_float:
    col_str_to_float(dataset, col)

col_to_int = 4
lookup = col_str_to_int(dataset, col_to_int)

print('After conversion :')
print(dataset[0])
print(f'lookup dictionary of classes : {lookup}')

Before conversion :
['5.1', '3.5', '1.4', '.2', 'Setosa']
After conversion :
[5.1, 3.5, 1.4, 0.2, 0]
lookup dictionary of classes : {'Setosa': 0, 'Virginica': 1, 'Versicolor': 2}


# Scaling

### Normalize Data

Normalization can refer to different techniques depending on context. Here, we use normalization
to refer to rescaling an input variable to the range between 0 and 1. Normalization requires
that you know the minimum and maximum values for each attribute

In [10]:
data_name = 'iris.csv'
data_path = os.path.join('..', 'datasets', data_name)
dataset = load_csv(data_path)

print(f'Loaded data file {data_path} with {len(dataset)} rows and {len(dataset[0])} columns')
print('\n')
print('Sample data :')
print(f'{dataset[0]}')

Loaded data file ..\datasets\iris.csv with 150 rows and 5 columns


Sample data :
['5.1', '3.5', '1.4', '.2', 'Setosa']


In [11]:
print('Before conversion :')
print(dataset[0])
col_to_float = [0, 1, 2, 3]
for col in col_to_float:
    col_str_to_float(dataset, col)

col_to_int = 4
lookup = col_str_to_int(dataset, col_to_int)

print('After conversion :')
print(dataset[0])
print(f'lookup dictionary of classes : {lookup}')

Before conversion :
['5.1', '3.5', '1.4', '.2', 'Setosa']
After conversion :
[5.1, 3.5, 1.4, 0.2, 0]
lookup dictionary of classes : {'Setosa': 0, 'Virginica': 1, 'Versicolor': 2}


In [12]:
def normalize_col(dataset, col):
    values = [row[col] for row in dataset]
    min_val = min(values)
    max_val = max(values)
    for row in dataset:
        row[col] = (row[col] - min_val) / (max_val - min_val)

In [13]:
cols_to_norm = [0, 1, 2, 3]
for col in cols_to_norm:
    normalize_col(dataset, col)

print(dataset[0])

[0.22222222222222213, 0.6249999999999999, 0.06779661016949151, 0.04166666666666667, 0]


### Standardize Data
Standardization is a rescaling technique that refers to centering the distribution of the data on the value 0 and the standard deviation to the value 1. Together, the mean and the standard deviation can be used to summarize a normal distribution, also called the Gaussian distribution or bell curve.

In [14]:
data_name = 'iris.csv'
data_path = os.path.join('..', 'datasets', data_name)
dataset = load_csv(data_path)

print(f'Loaded data file {data_path} with {len(dataset)} rows and {len(dataset[0])} columns')
print('\n')
print('Sample data :')
print(f'{dataset[0]}')

Loaded data file ..\datasets\iris.csv with 150 rows and 5 columns


Sample data :
['5.1', '3.5', '1.4', '.2', 'Setosa']


In [15]:
print('Before conversion :')
print(dataset[0])
col_to_float = [0, 1, 2, 3]
for col in col_to_float:
    col_str_to_float(dataset, col)

col_to_int = 4
lookup = col_str_to_int(dataset, col_to_int)

print('After conversion :')
print(dataset[0])
print(f'lookup dictionary of classes : {lookup}')

Before conversion :
['5.1', '3.5', '1.4', '.2', 'Setosa']
After conversion :
[5.1, 3.5, 1.4, 0.2, 0]
lookup dictionary of classes : {'Setosa': 0, 'Virginica': 1, 'Versicolor': 2}


In [16]:
def standardize_col(dataset, col):
    values = [row[col] for row in dataset]
    mean = sum(values) / len(values)
    std = sqrt(sum([pow(_val - mean, 2) for _val in values]) / (len(values) - 1))
    for row in dataset:
        row[col] = (row[col] - mean) / std

In [17]:
cols_to_std = [0, 1, 2, 3]
for col in cols_to_std:
    standardize_col(dataset, col)

print(dataset[0])

[-0.8976738791967672, 1.0156019907136327, -1.3357516342415212, -1.3110521482051314, 0]


### When to Normalize and Standardize
Standardization is a scaling technique that assumes your data conforms to a normal distribution. If a given data attribute is normal or close to normal, this is probably the scaling method to use. It is good practice to record the summary statistics used in the standardization process so that you can apply them when standardizing data in the future that you may want to use with your model. Normalization is a scaling technique that does not assume any specific distribution. If your data is not normally distributed, consider normalizing it prior to applying your machine learning algorithm.

## Train and Test Split

Splitting data into train and test sets. Model will be trained on training set and later model will be evaluated on test set.

In [18]:
random.seed = 1234
def train_test_split(dataset, split_ratio):
    train = list()
    train_size = split_ratio * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = random.randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

In [19]:
train, test = train_test_split(dataset, split_ratio = 0.7)

print('Training sample : ')
print(train[0])
print('Test sample : ')
print(test[0])

Training sample : 
[-0.29385736852629735, -0.1315388120502617, 0.19373496985182176, 0.13206729444894824, 2]
Test sample : 
[-0.8976738791967672, 1.0156019907136327, -1.3357516342415212, -1.3110521482051314, 0]


## k-fold Crossvalidation Split

k-fold Cross validation help in improve model accuracy. Model will be trained on k-1 folds and evaluated on kth fold, this will happen on iteration. In every iteration, kth fold will get changed. We select that model which will have least accuracy.

In [20]:
random.seed = 1234
def cross_validation_split(dataset, n_folds = 3):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset_copy) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

In [21]:
folds = cross_validation_split(dataset, n_folds = 4)

print(folds[0])

[[-1.5014903898672372, 0.32731750905529644, -1.3357516342415212, -1.3110521482051314, 0], [-1.5014903898672372, 0.09788934850251736, -1.279103982238064, -1.3110521482051314, 0], [-0.8976738791967672, 1.0156019907136327, -1.3357516342415212, -1.1798594716002149, 0], [-1.6222536920013308, -1.737535935919714, -1.3923992862449786, -1.1798594716002149, 0], [0.3099591421441726, -0.1315388120502617, 0.6469161858794794, 0.7880306774735298, 1], [-0.4146206706603909, -1.049251454261377, 0.3636779258621936, 0.0008746178440318052, 2], [0.6722490485464554, 0.09788934850251736, 0.9868020979002221, 0.7880306774735298, 1], [-1.259963785599049, -0.1315388120502617, -1.3357516342415212, -1.1798594716002149, 0], [-0.1730940663922027, 3.080455435688643, -1.279103982238064, -1.0486667949952986, 0], [-0.29385736852629735, -0.1315388120502617, 0.19373496985182176, 0.13206729444894824, 2], [0.5514857464123608, -1.278679614814156, 0.6469161858794794, 0.3944526476587808, 2], [-0.29385736852629735, -0.5903951331

## Evaluation Metrics

### Accuracy

In [22]:
def accuracy_metric(actual, predicted):
    correct = 0
    if len(actual) == len(predicted):
        for i in range(len(actual)):
            if actual[i] == predicted[i]:
                correct += 1
        return correct * 100.0 / float(len(actual))
    else:
        return 'error'

In [23]:
actual = [0,0,0,0,0,1,1,1,1,1]
predicted = [0,1,0,0,0,1,0,1,1,1]
accuracy = accuracy_metric(actual, predicted)
print(accuracy)

80.0


### Confusion Matrix

In [24]:
def confusion_matrix(actual, predicted):
    classes = set(actual)
    if len(actual) == len(predicted):
        matrix = [[0 for x in range(len(classes))] for y in range(len(classes))]
        lookup_dict = {v: k for k, v in enumerate(classes)}
        for i in range(len(actual)):
            x = lookup_dict[actual[i]]
            y = lookup_dict[predicted[i]]
            matrix[x][y] += 1
        return [classes, matrix]
    else:
        return [classes, 'error']    

In [25]:
actual =  [0,0,0,0,0,1,1,1,1,1]
predicted = [0,1,1,0,0,1,0,1,1,1]
unique, matrix = confusion_matrix(actual, predicted)
print(unique)
print(matrix)

{0, 1}
[[3, 2], [1, 4]]
