# Linear Methods for Regression & Classification 

Table of contents

Chapter 1.   Linear regression

Chapter 2.   Logistic regression

✔ Chapter 3.   Gaussian discriminant analysis (Gaussian Naive Bayes)

To install additional Python libraries, run

`!pip install --target=$my_path [LIBRARY_NAME]`

To download a dataset from the web, run

`!wget [URL]`

In [1]:
# Split the dataset by class values (i.e., labels), returns a dictionary
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by their class label.

    The label of a row is its last element (``row[-1]``).

    Args:
        dataset: Any iterable of rows (list, tuple, generator, ...). Each row
            must support ``row[-1]`` and that label must be hashable.

    Returns:
        A dict mapping each label to the list of rows carrying it. Labels
        appear in first-seen order, rows keep their dataset order, and the
        row objects themselves are stored (not copies).
    """
    grouped = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(row[-1], []).append(row)
    return grouped

In [2]:
# Demo: group a small two-feature dataset by its label (last element of each
# row), then print every label followed by the rows that belong to it.
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]
separated = separate_by_class(dataset)
for label, members in separated.items():
    print(label)
    for row in members:
        print(row)

0
[3.393533211, 2.331273381, 0]
[3.110073483, 1.781539638, 0]
[1.343808831, 3.368360954, 0]
[3.582294042, 4.67917911, 0]
[2.280362439, 2.866990263, 0]
1
[7.423436942, 4.696522875, 1]
[5.745051997, 3.533989803, 1]
[9.172168622, 2.511101045, 1]
[7.792783481, 3.424088941, 1]
[7.939820817, 0.791637231, 1]


In [3]:
from math import sqrt
 
# Sample standard deviation of a list of numbers (n - 1 in the denominator)
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before taking the square root.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The standard deviation as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    degrees_of_freedom = len(numbers) - 1
    return sqrt(sum(squared_deviations) / float(degrees_of_freedom))


# Arithmetic mean of a list of numbers
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        ``sum(numbers)`` divided by the element count, as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    return total / float(len(numbers))

In [4]:
# Calculate the mean, stdev and count for each feature column in a dataset
def mean_std_for_all(dataset):
    """Summarize every feature column of ``dataset``.

    Each row is expected to end with its class label; that last column is
    excluded from the summary.

    Args:
        dataset: Iterable of equally long rows whose last element is the
            class label and whose other elements are numeric features.

    Returns:
        A list with one ``(mean, stdev, count)`` tuple per feature column,
        in column order. An empty dataset yields an empty list.

    Raises:
        ZeroDivisionError: If the dataset has exactly one row, because the
            sample standard deviation divides by ``count - 1``.
    """
    # zip(*dataset) transposes rows into columns. The label column is dropped
    # *before* computing statistics so that non-numeric labels (e.g. strings)
    # do not break mean()/stdev(), and no work is wasted on a discarded column.
    feature_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column), len(column)) for column in feature_columns]

In [5]:
# Demo: summarize the feature columns of the whole dataset (both classes
# pooled together) as (mean, stdev, count) tuples and print the result.
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]
summary = mean_std_for_all(dataset)
print(summary)

[(5.178333386499999, 2.7665845055177263, 10), (2.9984683241, 1.218556343617447, 10)]


In [6]:
# zip(*dataset) transposes the row-major dataset: each tuple printed below is
# one full column (feature 1, feature 2, then the labels).
for column in zip(*dataset):
    print(column)

(3.393533211, 3.110073483, 1.343808831, 3.582294042, 2.280362439, 7.423436942, 5.745051997, 9.172168622, 7.792783481, 7.939820817)
(2.331273381, 1.781539638, 3.368360954, 4.67917911, 2.866990263, 4.696522875, 3.533989803, 2.511101045, 3.424088941, 0.791637231)
(0, 0, 0, 0, 0, 1, 1, 1, 1, 1)


In [7]:
def mean_std_per_class(dataset):
    """Compute per-class feature statistics for ``dataset``.

    Rows are grouped by label with ``separate_by_class`` and each group is
    summarized with ``mean_std_for_all``. As a diagnostic, every label and
    its rows are printed before that group is summarized.

    Args:
        dataset: Rows whose last element is the class label.

    Returns:
        A dict mapping each label to that class's list of
        ``(mean, stdev, count)`` tuples, one per feature column.
    """
    per_class = {}
    for label, members in separate_by_class(dataset).items():
        print(label, members)
        per_class[label] = mean_std_for_all(members)
    return per_class

In [8]:
# Demo: compute the per-feature statistics separately for class 0 and class 1,
# then print each label followed by one (mean, std, length) tuple per feature.
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]
summary = mean_std_per_class(dataset)
for label, feature_stats in summary.items():
    print(label)
    for row in feature_stats:
        print(row)  # (mean, std, length)

0 [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0], [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0], [2.280362439, 2.866990263, 0]]
1 [[7.423436942, 4.696522875, 1], [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1], [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]
0
(2.7420144012, 0.9265683289298018, 5)
(3.0054686692, 1.1073295894898725, 5)
1
(7.6146523718, 1.2344321550313704, 5)
(2.9914679790000003, 1.4541931384601618, 5)


To compute the likelihood (i.e., the probability density) of a feature value, assume that the data within each class follows a Gaussian distribution:

$$f(x) = \frac{1}{\sqrt{2\pi}\,\sigma} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$$

where $\mu$ is the mean and $\sigma$ is the standard deviation (`std`).

In [9]:
# use Gaussian PDF to derive the probability
from math import sqrt
from math import pi
from math import exp

# Gaussian probability density function (pdf) evaluated at x
def calculate_probability(x, mean, stdev):
    """Return the Gaussian density at ``x`` for the given mean and stdev.

    Evaluates ``exp(-(x - mean)**2 / (2 * stdev**2)) / (sqrt(2 * pi) * stdev)``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value as a float (a density, so it may exceed 1).

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_distance = (x - mean) ** 2
    scaled = squared_distance / (2 * stdev ** 2)
    decay = exp(-scaled)
    normalizer = 1 / (sqrt(2 * pi) * stdev)
    return normalizer * decay

In [10]:
# Score every class for a given row: class prior times per-feature densities
def calculate_class_probabilities(summaries, row):
    """Compute an unnormalized score for each class given ``row``.

    For every class the score starts at the class prior (the class's row
    count divided by the total row count) and is multiplied by the Gaussian
    density of each feature value under that class's mean and stdev.

    Args:
        summaries: Dict mapping each class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Sequence whose first ``len(class_summaries)`` entries are the
            feature values; any further entries (e.g. a label) are ignored.

    Returns:
        A dict mapping each class label to its score. The scores are not
        normalized, so they do not sum to 1.
    """
    # Every feature tuple of a class carries the same row count, so the
    # first tuple's count (index 2) is used as the class size.
    total_rows = sum(per_feature[0][2] for per_feature in summaries.values())
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_summaries[0][2] / float(total_rows)  # prior
        for index, (mu, sigma, _count) in enumerate(class_summaries):
            score *= calculate_probability(row[index], mu, sigma)
        probabilities[class_value] = score
    return probabilities

def predict_label(probs):
    """Return the label with the highest score in ``probs``.

    Args:
        probs: Dict mapping class labels to scores.

    Returns:
        The key whose value is largest; on ties, the first such key in the
        dict's iteration order.

    Raises:
        ValueError: If ``probs`` is empty.
    """
    best_label, _best_score = max(probs.items(), key=lambda item: item[1])
    return best_label

In [11]:
# Demo: fit the per-class statistics, then score and classify two training rows
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]
summaries = mean_std_per_class(dataset)

# Row 0 carries label 0 and row 5 carries label 1; for each, print the class
# scores followed by the predicted label.
for sample in (dataset[0], dataset[5]):
    probabilities = calculate_class_probabilities(summaries, sample)
    print(probabilities)
    print(predict_label(probabilities))

0 [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0], [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0], [2.280362439, 2.866990263, 0]]
1 [[7.423436942, 4.696522875, 1], [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1], [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]
{0: 0.05032427673372076, 1: 0.00011557718379945765}
0
{0: 6.919582110595225e-08, 1: 0.0220272991338951}
1
