In [19]:
from csv import reader
from math import sqrt

In [20]:
import os

os.getcwd()

'/Users/maheshwars/Desktop/venv'

In [21]:
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax
    
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


# calculate column means
def column_means(dataset):
    means = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        means[i] = sum(col_values) / float(len(dataset))
    return means
# calculate column standard deviations
def column_stdevs(dataset, means):
    stdevs = [0 for i in range(len(dataset[0]))]
    for i in range(len(dataset[0])):
        variance = [pow(row[i]-means[i], 2) for row in dataset]
        stdevs[i] = sum(variance)
    stdevs = [sqrt(x/(float(len(dataset)-1))) for x in stdevs]
    return stdevs  

# standardize dataset
def standardize_dataset(dataset, means, stdevs):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - means[i]) / stdevs[i]

In [22]:


# Load dataset
filepath = '/Users/maheshwars/Desktop/venv/data'
filename = filepath +'/diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset),
len(dataset[0])))

# convert string columns to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset[1:], i)
print(dataset[1])

Loaded data file /Users/maheshwars/Desktop/venv/data/diabetes.csv with 769 rows and 9 columns
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]


In [23]:
print(dataset[0:2])

[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'], [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]]


# __Normalisation and Standardisation__
Many machine learning algorithms expect the scale of the input and even the output data to be
equivalent. It can help in methods that weight inputs in order to make a prediction, such as
in linear regression and logistic regression. It is practically required in methods that combine
weighted inputs in complex ways such as in artificial neural networks and deep learning.

We use **normalization** to refer to rescaling an input variable to the range between 0 and 1. Normalization requires
that you know the minimum and maximum values for each attribute.

**Standardization** is a rescaling technique that refers to centering the distribution of the data on
the value 0 and the standard deviation to the value 1. Together, the mean and the standard
deviation can be used to summarize a normal distribution, also called the Gaussian distribution
or bell curve.

In [24]:
# Calculate min and max for each column
minmax = dataset_minmax(dataset[1:])

# Normalize columns
normalize_dataset(dataset[1:], minmax)
print(dataset[1])

[0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0]


In [25]:
# Estimate mean and standard deviation
means = column_means(dataset[1:])
stdevs = column_stdevs(dataset[1:], means)

print(dataset[1])

# standardize dataset
standardize_dataset(dataset[1:], means, stdevs)
print(dataset[1])

[0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0]
[0.6395304921176561, 0.8477713205896719, 0.14954329852954315, 0.9066790623472527, -0.6924393247241301, 0.20387990726746852, 0.46818687022979616, 1.4250667195933595, 1.3650063669598014]


## When to Normalize and Standardize

Standardization is a scaling technique that assumes your data conforms to a normal distribution.
If a given data attribute is normal or close to normal, this is probably the scaling method to use.
It is good practice to record the summary statistics used in the standardization process so that
you can apply them when standardizing data in the future that you may want to use with your
model. Normalization is a scaling technique that does not assume any specific distribution.

If your data is not normally distributed, consider normalizing it prior to applying your
machine learning algorithm. It is good practice to record the minimum and maximum values
for each column used in the normalization process, again, in case you need to normalize new
data in the future to be used with your model.

# Train test split

In [26]:
from random import seed
from random import randrange

In [27]:
# Split a dataset into a train and test set
def train_test_split(dataset, split=0.60):
    train = list()
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

In [28]:

# test train/test split
seed(1)
dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10],[11]]
train, test = train_test_split(dataset)
print(train)
print(test)

[[3], [11], [2], [7], [1], [8], [9]]
[[4], [5], [6], [10]]


# k-fold Cross-Validation Split

A limitation of using the train and test split method is that you get a noisy estimate of
algorithm performance. The k-fold cross-validation method (also called just cross-validation) is
a resampling method that provides a more accurate estimate of algorithm performance.

You should choose a value for k that splits the data into groups with enough rows that each
group is still representative of the original dataset.

A quick way to check if the fold sizes are representative is
to calculate summary statistics such as mean and standard deviation and see how much the
values diﬀer from the same statistics on the whole dataset.

In [29]:
# Split a dataset into $k$ folds
def cross_validation_split(dataset, folds=3):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds)
    for i in range(folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

In [30]:
# test cross validation split
seed(1)
dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
folds = cross_validation_split(dataset, 5)
print(folds)

[[[3], [2]], [[7], [1]], [[8], [9]], [[10], [6]], [[5], [4]]]


* The downside of
cross-validation is that it can be time-consuming to run, requiring k diﬀerent models to be
trained and evaluated. This is a problem if you have a very large dataset or if you are evaluating
a model that takes a long time to train.

* Large datasets are those in the hundreds of thousands or millions of records, large enough
that splitting it in half results in two datasets that have nearly equivalent statistical properties.
In such cases, there may be little need to use k-fold cross-validation as an evaluation of the
algorithm and a train and test split may be just as reliable

# Evaluation Matrix

In [31]:
# Example of calculating classification accuracy

# Calculate accuracy percentage between two lists
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
    # Test accuracy
    
actual = [0,0,0,0,0,1,1,1,1,1]
predicted = [0,1,0,0,0,1,0,1,1,1]
accuracy = accuracy_metric(actual, predicted)
print(accuracy)

80.0


In [32]:
# calculate a confusion matrix
def confusion_matrix(actual, predicted):
    unique = set(actual)
    matrix = [list() for x in range(len(unique))]
    for i in range(len(unique)):
        matrix[i] = [0 for x in range(len(unique))]
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for i in range(len(actual)):
        x = lookup[actual[i]]
        y = lookup[predicted[i]]
        matrix[y][x] += 1
    return unique, matrix

# Test confusion matrix with integers
actual = [0,0,0,0,0,2,1,1,1,2]
predicted = [0,1,1,0,0,1,0,1,2,2]
unique, matrix = confusion_matrix(actual, predicted)
print(unique)
print(matrix)

{0, 1, 2}
[[3, 1, 0], [2, 1, 1], [0, 1, 1]]


In [33]:
# Calculate mean absolute error (MAE)
def mae_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        sum_error += abs(predicted[i] - actual[i])
    return sum_error / float(len(actual))

    
# Calculate root mean squared error (RMSE)
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)
    

In [34]:
# Test RMSE
actual = [0.1, 0.2, 0.3, 0.4, 0.5]
predicted = [0.11, 0.19, 0.29, 0.41, 0.5]
rmse = rmse_metric(actual, predicted)
print(rmse)

0.00894427190999915


RMSE values are always slightly higher than MSE values, which becomes more pronounced as
the prediction errors increase. This is a benefit of using RMSE over MSE in that it penalizes
larger errors with worse scores.

# __Baseline models__

A baseline prediction algorithm provides a set
of predictions that you can evaluate as you would any predictions for your problem, such as
classification accuracy or RMSE.

The scores from these algorithms provide the required point of comparison when evaluating
all other machine learning algorithms on your problem. Once established, you can comment on
how much better a given algorithm is as compared to the naive baseline algorithm, providing
context on just how good a given method actually is.

### Random Prediction Algorithm

The random prediction algorithm predicts a random outcome as observed in the training data.
It is perhaps the simplest algorithm to implement. It requires that you store all of the distinct
outcome values in the training data, which could be large on regression problems with lots of
distinct values.

In [35]:
# Example of Making Random Predictions
from random import seed
from random import randrange

# Generate random predictions
def random_algorithm(train, test):
    output_values = [row[-1] for row in train]
    unique = list(set(output_values))
    predicted = list()
    for _ in test:
        index = randrange(len(unique))
        predicted.append(unique[index])
    return predicted
    
seed(1)
train = [[0], [1], [0], [1], [0], [1]]
test = [[None], [None], [None], [None]]
predictions = random_algorithm(train, test)
print(predictions)

[0, 0, 1, 0]


### Zero Rule Algorithm

The Zero Rule Algorithm is a better baseline than the random algorithm. It uses more
information about a given problem to create one rule in order to make predictions. This rule is
diﬀerent depending on the problem type.

one rule is to predict the class value that is most common in
the training dataset.

In [36]:
# Example of Zero Rule Classification Predictions
from random import seed

# zero rule algorithm for classification
def zero_rule_algorithm_classification(train, test):
    output_values = [row[-1] for row in train]
    prediction = max(set(output_values), key=output_values.count)
    predicted = [prediction for i in range(len(test))]
    return predicted
    
seed(1)
train = [['0'], ['0'], ['0'], ['0'], ['1'], ['1']]
test = [[None], [None], [None], [None]]
predictions = zero_rule_algorithm_classification(train, test)
print(predictions)

# Example of Zero Rule Regression Predictions
from random import seed

# zero rule algorithm for regression
def zero_rule_algorithm_regression(train, test):
    output_values = [row[-1] for row in train]
    prediction = sum(output_values) / float(len(output_values))
    predicted = [prediction for i in range(len(test))]
    return predicted
    
seed(1)
train = [[10], [15], [12], [15], [18], [20]]
test = [[None], [None], [None], [None]]
predictions = zero_rule_algorithm_regression(train, test)
print(predictions)

['0', '0', '0', '0']
[15.0, 15.0, 15.0, 15.0]


# Algorithm Test Harness

We need a function that can take a dataset and an algorithm and return a performance score.
Below is a function named evaluate algorithm() that achieves this. It takes 3 fixed arguments
including the dataset, the algorithm function and the split percentage for the train-test split.

### train-test split harness

In [52]:
# Evaluate an algorithm using a train/test split
def evaluate_algorithm(dataset, algorithm, split, *args):
    train, test = train_test_split(dataset, split)
    test_set = list()
    for row in test:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)
    predicted = algorithm(train, test_set, *args)
    actual = [row[-1] for row in test]
    accuracy = accuracy_metric(actual, predicted)
    return accuracy

# Used zero rule algorithm for classification
# Test the train/test harness

seed(1)

# load and prepare data
filepath = '/Users/maheshwars/Desktop/venv/data'
filename = filepath +'/diabetes.csv'
dataset = load_csv(filename)
dataset = dataset[1:]              #removing headers 
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
    
# evaluate algorithm
split = 0.6
accuracy = evaluate_algorithm(dataset, zero_rule_algorithm_classification, split)
print('Accuracy: %.3f%%' % (accuracy))

Accuracy: 67.427%


Notice
how the name of the Zero Rule algorithm zero rule algorithm classification was passed
as an argument to the evaluate algorithm() function. You can see how this test harness may
be used again and again with diﬀerent algorithms. Running the example above prints out the
accuracy of the model.

### Cross-Validation Algorithm Test Harness

In [38]:
lis = [[1,2],[3,4],[5,6,7]]
lis = sum(lis,[0])
print(lis)

[0, 1, 2, 3, 4, 5, 6, 7]


In [50]:
# Evaluate an algorithm using a cross-validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    #print('here folds: {0}'.format(folds))
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
            
        print('ts1 {0}'.format(test_set[0]))
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

In [40]:
# Test cross validation test harness
seed(1)
# load and prepare data
filepath = '/Users/maheshwars/Desktop/venv/data'
filename = filepath +'/diabetes.csv'
dataset = load_csv(filename)
dataset = dataset[1:] 

for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
    
# evaluate algorithm
n_folds = 7
scores = evaluate_algorithm(dataset, zero_rule_algorithm_classification, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/len(scores)))

ts1 [0.0, 93.0, 60.0, 25.0, 92.0, 28.7, 0.532, 22.0, None]
ts1 [3.0, 158.0, 70.0, 30.0, 328.0, 35.5, 0.344, 35.0, None]
ts1 [1.0, 89.0, 76.0, 34.0, 37.0, 31.2, 0.192, 23.0, None]
ts1 [8.0, 109.0, 76.0, 39.0, 114.0, 27.9, 0.64, 31.0, None]
ts1 [4.0, 118.0, 70.0, 0.0, 0.0, 44.5, 0.904, 26.0, None]
ts1 [8.0, 176.0, 90.0, 34.0, 300.0, 33.7, 0.467, 58.0, None]
ts1 [1.0, 140.0, 74.0, 26.0, 180.0, 24.1, 0.828, 23.0, None]
Scores: [66.97247706422019, 55.96330275229357, 68.80733944954129, 63.30275229357798, 64.22018348623854, 67.88990825688074, 67.88990825688074]
Mean Accuracy: 65.007%


# **Simple Linear Regression**

### Calculate Mean and Variance

In [41]:
# Calculate the mean value of a list of numbers
def mean(values):
    return sum(values) / float(len(values))

# Calculate the variance of a list of numbers
def variance(values, mean):
    return sum([(x-mean)**2 for x in values])    

### Calculate Covariance

The covariance of two groups of numbers describes how those numbers change together. Co-
variance is a generalization of correlation.

In [42]:
# Calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
    covar = 0.0
    for i in range(len(x)):
        covar += (x[i] - mean_x) * (y[i] - mean_y)
    return covar

## Estimate Coeﬃcients
y = B0 + B1*x

B1 = covarience(x,y)/variance(x) <======> **W = summation(x(i).y(i)) / summation(x(i)^2)**

B2 = mean(y) - B1*mean(x)

In [43]:
# Calculate coefficients
def coefficients(dataset):
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]
    x_mean, y_mean = mean(x), mean(y)
    b1 = covariance(x, x_mean, y, y_mean) / variance(x, x_mean)
    b0 = y_mean - b1 * x_mean
    return [b0, b1]
    
# calculate coefficients
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
b0, b1 = coefficients(dataset)
print('Coefficients: B0=%.3f, B1=%.3f' % (b0, b1))

Coefficients: B0=0.400, B1=0.800


In [62]:
# Evaluate regression algorithm on training dataset
def evaluate_algorithm(dataset, algorithm):
    test_set = list()
    for row in dataset:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)
    predicted = algorithm(dataset, test_set)
    print(predicted)
    actual = [row[-1] for row in dataset]
    rmse = rmse_metric(actual, predicted)
    return rmse

# Simple linear regression algorithm
def simple_linear_regression(train, test):
    predictions = list()
    b0, b1 = coefficients(train)
    print('coefficeints {0}, {1}'.format(b0,b1))
    for row in test:
        yhat = b0 + b1 * row[0]
        predictions.append(yhat)
    return predictions
# Test simple linear regression
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
rmse = evaluate_algorithm(dataset, simple_linear_regression)
print('RMSE: %.3f' % (rmse))

coefficeints 0.39999999999999947, 0.8
[1.1999999999999995, 1.9999999999999996, 3.5999999999999996, 2.8, 4.3999999999999995]
RMSE: 0.693


### Insurance dataset

In [45]:
#pip install xlrd

In [46]:
import pandas as pd

# Load the Excel file into a DataFrame
filepath = '/Users/maheshwars/Desktop/venv/data'
filename = filepath +'/insurance.xls'
df = pd.read_excel(filename, engine='xlrd')

# Print the DataFrame
df.head(8)

*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'


Unnamed: 0,X,Y
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4
5,57,170.9
6,23,56.9
7,14,77.5


In [70]:
dataset = df.values
dataset[0]

array([108. , 392.5])

In [74]:
# Evaluate an algorithm using a train/test split
def evaluate_algorithm(dataset, algorithm, split, *args):
    train, test = train_test_split(dataset, split)
    test_set = list()
    for row in test:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)
    predicted = algorithm(train, test_set, *args)
    actual = [row[-1] for row in test]
    rmse = rmse_metric(actual, predicted)
    return rmse
    
# evaluate algorithm
seed(1)
split = 0.6
rmse = evaluate_algorithm(dataset, simple_linear_regression, split)
print('RMSE: %.3f' % (rmse))

coefficeints 28.228654849397856, 3.3175561991590437
RMSE: 33.630


# **Multivariate Linear Regression**

In [77]:
# Make a prediction with coefficients
def predict(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row)-1):
        yhat += coefficients[i + 1] * row[i]
    return yhat

The first coeﬃcient in is always the intercept, also called the bias or b0 as it is standalone and
not responsible for a specific input value.

### Stochastic Gradient Descent

In [78]:
# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            yhat = predict(row, coef)
            error = yhat - row[-1]
            sum_error += error**2
            coef[0] = coef[0] - l_rate * error #updated without an input as it is not associated with a specific input value
            for i in range(len(row)-1):
                coef[i + 1] = coef[i + 1] - l_rate * error * row[i] # #updated without an input (row[i])
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
    return coef

In [87]:
# Calculate coefficients
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
l_rate = 0.001
n_epoch = 50
coef = coefficients_sgd(dataset, l_rate, n_epoch)
print(coef)

>epoch=0, lrate=0.001, error=46.236
>epoch=1, lrate=0.001, error=41.305
>epoch=2, lrate=0.001, error=36.930
>epoch=3, lrate=0.001, error=33.047
>epoch=4, lrate=0.001, error=29.601
>epoch=5, lrate=0.001, error=26.543
>epoch=6, lrate=0.001, error=23.830
>epoch=7, lrate=0.001, error=21.422
>epoch=8, lrate=0.001, error=19.285
>epoch=9, lrate=0.001, error=17.389
>epoch=10, lrate=0.001, error=15.706
>epoch=11, lrate=0.001, error=14.213
>epoch=12, lrate=0.001, error=12.888
>epoch=13, lrate=0.001, error=11.712
>epoch=14, lrate=0.001, error=10.668
>epoch=15, lrate=0.001, error=9.742
>epoch=16, lrate=0.001, error=8.921
>epoch=17, lrate=0.001, error=8.191
>epoch=18, lrate=0.001, error=7.544
>epoch=19, lrate=0.001, error=6.970
>epoch=20, lrate=0.001, error=6.461
>epoch=21, lrate=0.001, error=6.009
>epoch=22, lrate=0.001, error=5.607
>epoch=23, lrate=0.001, error=5.251
>epoch=24, lrate=0.001, error=4.935
>epoch=25, lrate=0.001, error=4.655
>epoch=26, lrate=0.001, error=4.406
>epoch=27, lrate=0.001,