# V.1 Exploring the green reds

## a) Write a function that will plot a scatterplot matrix of your red wine data

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
import math
def plot_scatter_matrix(winedata, good_threshold, bad_threshold, save_plot=False):
    """Draw a scatterplot matrix of the feature columns, coloured by quality.

    Every column except the last one gets a row and a column in the grid
    (NOTE(review): the last column is presumably ``quality`` -- confirm the
    caller's column order).  Off-diagonal panels plot one feature against
    another; diagonal panels only show the feature name.

    Args:
        winedata: DataFrame that contains a ``quality`` column.
        good_threshold: Rows with ``quality`` strictly greater than this are
            drawn in cyan.
        bad_threshold: Rows with ``quality`` strictly less than this are
            drawn in magenta.
        save_plot: When true the figure is written to ``./test.png``;
            otherwise it is shown with ``plt.show()``.

    Returns:
        The matplotlib figure that holds the grid.
    """
    columns = winedata.columns
    size = len(columns) - 1
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid, so the
    # `axes[i, j]` lookup below always works.
    fig, axes = plt.subplots(size, size, figsize=(15, 15), squeeze=False)
    fig.subplots_adjust(top=0.99, bottom=0.01, left=0.01, right=0.99, wspace=0, hspace=0)

    # Select the two quality groups once instead of four times per panel.
    bad_wines = winedata[winedata['quality'] < bad_threshold]
    good_wines = winedata[winedata['quality'] > good_threshold]

    for i in range(size):
        for j in range(size):
            ax = axes[i, j]
            # Black background tile with a white border behind the data.
            background = patches.Rectangle(
                (0, 0), 1, 1, fill=True, transform=ax.transAxes,
                clip_on=False, facecolor='#000000', zorder=0.1, edgecolor='#ffffff')
            ax.add_patch(background)
            if i != j:
                # Row feature on the y axis, column feature on the x axis.
                ax.scatter(bad_wines[columns[j]], bad_wines[columns[i]],
                           marker='.', s=20, linewidths=0, c='m', alpha=0.8)
                ax.scatter(good_wines[columns[j]], good_wines[columns[i]],
                           marker='.', s=20, linewidths=0, c='c', alpha=0.8)
            else:
                # Diagonal: print the feature name, one word per line.
                ax.text(0.5, 0.5, columns[i].replace(' ', '\n'),
                        horizontalalignment='center', verticalalignment='center',
                        transform=ax.transAxes, fontsize='x-large', color="#ffffff")
            ax.set_axis_off()

    if not save_plot:
        plt.show()
    else:
        fig.savefig("./test.png")
    return fig

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the semicolon-separated red-wine data and draw its scatterplot matrix.
wine_data = pd.read_csv("./winequality-red.csv", sep=';')
# save_plot=True: the function saves the figure instead of showing it.
fig = plot_scatter_matrix(wine_data, 6, 5, True)
# pyplot.show() takes no figure argument; it displays every open figure.
plt.show()

## b) Which factors do you think will be most useful to your perceptron for distinguishing high quality vs. low quality wines? Why?

Factors such as alcohol and pH should work well for distinguishing high-quality wines from low-quality ones. In the scatterplot matrix these factors form clearly separated groups, which makes it easy for a perceptron to find a boundary between the two classes.

# V.2 Learning to perceptron

## a) Implement a perceptron, b) implement a function to train your perceptron.

In [None]:
# Label every wine: "good" means a quality score above 5.
wine_data = wine_data.assign(goodness=wine_data['quality'] > 5)

# Keep only the extremes -- quality 8 or higher, or quality 3 or lower --
# and only the columns needed below.
features = ['pH', 'alcohol', 'quality', 'goodness']
is_extreme = (wine_data['quality'] > 7) | (wine_data['quality'] < 4)
selected_wine_data = wine_data.loc[is_extreme, features].reset_index(drop=True)

# Inputs (pH, alcohol) and boolean targets for the models trained below.
X = selected_wine_data[['pH', 'alcohol']]
Y = selected_wine_data['goodness'].values
print(selected_wine_data)

In [None]:
def ft_dot(arr1, arr2):
    """Return the dot product of two equally long sequences.

    Args:
        arr1: First sized iterable of numbers.
        arr2: Second sized iterable of numbers.

    Returns:
        The sum of the element-wise products, accumulated onto ``0.``.

    Raises:
        ValueError: If the two arguments have different lengths.
    """
    if len(arr1) != len(arr2):
        raise ValueError('arguments have different length')
    result = 0.
    # Pair the elements directly rather than indexing by position; this also
    # works for containers whose [] operator is not positional.
    for left, right in zip(arr1, arr2):
        result += left * right
    return result


class Perceptron:
    """Single-layer perceptron with a step activation (outputs 0 or 1)."""

    def __init__(self, lr):
        # Weights as one flat list: W[0] is the bias, W[1:] holds one weight
        # per input feature.  Stays None until the first call to train().
        self.W = None
        # Learning rate that scales every weight update.
        self.lr = lr
        # One (epoch, errors, weights, bias) tuple per completed epoch.
        self.performance = list()

    def predict(self, inp):
        """Return 1 if the weighted sum of ``inp`` plus the bias is positive, else 0."""
        summ = ft_dot(inp, self.W[1:]) + self.W[0]
        return 1 if summ > 0.0 else 0

    def train(self, x, y, verbose, epochs):
        """Fit the weights with the perceptron learning rule.

        Args:
            x: Sequence of samples, each a sequence of feature values.
            y: Sequence of targets (0/1 or booleans), one per sample.
            verbose: When true, print the error count every 10th epoch.
            epochs: Number of passes over the data.  ``0`` means "keep going
                until an epoch finishes without a single update"; that never
                happens if the data cannot be separated, so the call does
                not return in that case.

        Returns:
            ``self.performance``: one ``(epoch, errors, weights, bias)``
            tuple per epoch, including entries from earlier calls.
        """
        if self.W is None:
            # Size the weights from the data that was passed in (the bias
            # takes the extra slot), not from a module-level variable.
            self.W = [0.0001 * random.uniform(-1, 1) for _ in range(len(x[0]) + 1)]
        epoch = 0
        # Strict `<` so that exactly `epochs` passes are made.
        while epochs == 0 or epoch < epochs:
            epoch_errors = 0
            for xi, yi in zip(x, y):
                update = self.lr * (yi - self.predict(xi))
                # Explicit element-wise update; does not depend on how
                # `list += ndarray` happens to be resolved.
                self.W[1:] = [w + update * v for w, v in zip(self.W[1:], xi)]
                self.W[0] += update
                epoch_errors += int(update != 0.0)
            # The slice copies the weights, so each entry is a snapshot.
            self.performance.append((epoch, epoch_errors, self.W[1:], self.W[0]))
            if verbose and epoch % 10 == 0:
                print('Epoch {}: {} errors'.format(epoch, epoch_errors))
            epoch += 1
            if epoch_errors == 0 and epochs == 0:
                break

        return self.performance

In [None]:
import pprint

import matplotlib.pyplot as plt

# epochs=0 asks train() to continue until an epoch ends with zero errors;
# afterwards dump the per-epoch log.
perceptron = Perceptron(lr=0.005)
stats = perceptron.train(X.values, Y, verbose=True, epochs=0)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(stats)

## c) Write a function that takes the output of your perceptron training function and your red wine data and generates two plots in one figure:

In [None]:
def plot_performance(stats, wine_data, good_thresh, bad_thresh, epoch=-1, save_plot=False):
    """Plot the decision boundary and the error curve of a training run.

    The top panel scatters pH against alcohol for the good and bad wines and
    draws the line on which the weighted sum of the chosen epoch is zero.
    The bottom panel plots the error count of every epoch up to and
    including the chosen one.

    Args:
        stats: Sequence of ``(epoch, errors, weights, bias)`` entries as
            returned by the ``train`` methods.  ``weights`` must hold exactly
            two values, ordered (pH weight, alcohol weight).
        wine_data: DataFrame with ``quality``, ``pH`` and ``alcohol`` columns.
        good_thresh: Rows with ``quality >= good_thresh`` are drawn in green.
        bad_thresh: Rows with ``quality <= bad_thresh`` are drawn in red.
        epoch: Index into ``stats`` of the epoch to draw; negative values
            count from the end, so ``-1`` is the last epoch.
        save_plot: When true the figure is written to ``./err_boundary.png``;
            otherwise it is shown with ``plt.show()``.

    Returns:
        The matplotlib figure holding both panels.
    """
    # Draw on the axes of the figure created here; selecting a figure by
    # number could pick up a different, already open figure.
    fig, (boundary_ax, error_ax) = plt.subplots(2, 1, figsize=(10, 10))

    good = wine_data[wine_data['quality'] >= good_thresh]
    bad = wine_data[wine_data['quality'] <= bad_thresh]
    boundary_ax.scatter(bad['alcohol'].values, bad['pH'].values,
                        marker='o', s=20, linewidths=0, c='r', alpha=0.8)
    boundary_ax.scatter(good['alcohol'].values, good['pH'].values,
                        marker='o', s=20, linewidths=0, c='g', alpha=0.8)

    # Turn a negative epoch into a plain index so that the slice used for the
    # error curve can include the selected epoch itself.
    last = epoch if epoch >= 0 else len(stats) + epoch
    res = stats[last]
    w_ph, w_alcohol = res[2]
    bias = res[3]

    # Boundary: w_alcohol * x + w_ph * y + bias = 0, drawn over the full
    # alcohol range of the data.
    x_min = wine_data['alcohol'].min()
    x_max = wine_data['alcohol'].max()
    if w_ph != 0:
        xs = [x_min, x_max]
        ys = [(w_alcohol * v + bias) / -w_ph for v in xs]
        boundary_ax.plot(xs, ys)
    elif w_alcohol != 0:
        # No pH weight: the boundary is a vertical line.
        boundary_ax.axvline(-bias / w_alcohol)
    boundary_ax.set_xlabel('alcohol')
    boundary_ax.set_ylabel('pH')

    history = stats[:last + 1]
    error_ax.plot([entry[0] for entry in history], [entry[1] for entry in history])
    error_ax.set_xlabel('epoch')
    error_ax.set_ylabel('errors')

    if not save_plot:
        plt.show()
    else:
        fig.savefig("./err_boundary.png")
    return fig



## d) Use the function you just created in part c) to plot and verify that your perceptron is learning more efficiently!

In [None]:
# plot_performance() shows the figure itself when save_plot is False, and
# pyplot.show() takes no figure argument, so the call is not wrapped in it.
performance_fig = plot_performance(stats, selected_wine_data, 8, 3)

# V.3 My fair ADALINE

## Implement an ADALINE

In [None]:
def ft_dot(arr1, arr2):
    """Compute the dot product of ``arr1`` and ``arr2``.

    Corresponding items are multiplied and the products are added, in
    order, onto a float zero.

    Raises:
        ValueError: When the two arguments differ in length.
    """
    same_length = len(arr1) == len(arr2)
    if not same_length:
        raise ValueError('arguments have different length')

    total = 0.0
    for left, right in zip(arr1, arr2):
        total += left * right
    return total


class Adaline:
    """Single neuron with a sigmoid output, trained by gradient steps.

    Outputs above 0.5 are classified as 1, everything else as 0.
    """

    def __init__(self):
        # Weights as one flat list: W[0] is the bias, W[1:] holds one weight
        # per input feature.  Stays None until the first call to train().
        self.W = None
        # One [epoch, misclassified, weights, bias] list per completed epoch.
        self.performance = list()

    def Transpose(self, m):
        """Return the rows of ``m`` regrouped as a list of column lists."""
        res = list(map(list, zip(*m)))
        return (res)

    def _net_input_(self, xi):
        """Return the weighted sum of ``xi`` plus the bias."""
        return ft_dot(self.W[1:], xi) + self.W[0]

    @staticmethod
    def _sigmoid(z):
        """Return ``1 / (1 + e**-z)`` without overflowing for large ``|z|``."""
        if z >= 0:
            return 1 / (1 + math.exp(-z))
        # For negative z, exp(-z) can exceed the float range; exp(z) cannot.
        exp_z = math.exp(z)
        return exp_z / (1 + exp_z)

    def activation(self, x):
        """Return the sigmoid of the net input of sample ``x``."""
        return self._sigmoid(self._net_input_(x))

    def predict(self, X, Y):
        """Return how many samples of ``X`` are classified differently from ``Y``."""
        num_missclass = 0
        for xi, yi in zip(X, Y):
            prediction = 1 if self.activation(xi) > 0.5 else 0
            num_missclass += int(prediction != int(yi))
        return (num_missclass)

    def train(self, X, Y, epochs, lr, verbose=False, mode="batch"):
        """Fit the weights to ``X`` / ``Y``.

        Args:
            X: Sequence of samples, each a sequence of feature values.
            Y: Sequence of targets (0/1 or booleans), one per sample.
            epochs: Number of passes over the data.  ``0`` means "continue
                until no training sample is misclassified"; that may never
                happen, in which case the call does not return.
            lr: Learning rate.
            verbose: When true, print the misclassification count every
                10th epoch.
            mode: ``"batch"`` applies one accumulated update per epoch; any
                other value updates the weights after every sample.

        Returns:
            ``self.performance``: one ``[epoch, misclassified, weights,
            bias]`` list per epoch, including entries from earlier calls.
        """
        batch = mode == "batch"
        if self.W is None:
            self.W = [random.uniform(-1, 1) for _ in range(len(X[0]) + 1)]
        # The per-feature columns do not change between epochs.
        columns = self.Transpose(X) if batch else None
        cur_epoch = 0
        while epochs == 0 or cur_epoch < epochs:
            errors = []
            for xi, yi in zip(X, Y):
                err = yi - self.activation(xi)
                errors.append(err)
                if not batch:
                    self.W[0] += lr * err
                    self.W[1:] = [w + lr * err * v for w, v in zip(self.W[1:], xi)]
            if batch and errors:
                self.W[0] += lr * sum(errors)
                # Per feature: the dot product of its column with the errors.
                self.W[1:] = [w + lr * ft_dot(column, errors)
                              for w, column in zip(self.W[1:], columns)]
            misclassified = self.predict(X, Y)
            # The slice copies the weights, so each entry is a snapshot.
            self.performance.append([cur_epoch, misclassified, self.W[1:], self.W[0]])
            if verbose and cur_epoch % 10 == 0:
                print('Epoch {}: {} errors'.format(cur_epoch, misclassified))
            # Compare the misclassification count, not the list of errors,
            # otherwise the open-ended mode can never stop.
            if epochs == 0 and misclassified == 0:
                break
            cur_epoch += 1
        return self.performance

In [None]:
# Train a new model for 6000 epochs, updating after every sample ("online").
ada = Adaline()
performance = ada.train(
    X.values,
    Y,
    epochs=6000,
    lr=0.005,
    verbose=True,
    mode="online",
)


In [None]:
# Dump the per-epoch training log.
print(performance)

In [None]:
# save_plot=True makes plot_performance() save the figure instead of showing
# it; pyplot.show() takes no figure argument, so it is called on its own.
performance_fig = plot_performance(performance, selected_wine_data, 7, 4, save_plot=True)
plt.show()

In [None]:
# Preview a random 70 % sample of the wine data.
random_subset = wine_data.sample(frac=0.7)
print(random_subset)

# V.4 Advanced wine sampling and resampling

In [None]:
def holdout_partition(data, part=0.7, random_state=None):
    """Split ``data`` randomly into a training and a validation part.

    Args:
        data: DataFrame to split.
        part: Fraction of the rows that goes into the training part.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced; ``None`` keeps the split random.

    Returns:
        A ``(training, validation)`` tuple.  ``validation`` holds the rows
        whose index labels were not sampled, in their original order.
    """
    training = data.sample(frac=part, random_state=random_state)
    # Drop by label directly; unlike a query string this is unaffected by a
    # column that happens to be called "index".
    validation = data.drop(training.index)
    return (training, validation)

In [None]:
# Split the selected wines with the default training fraction and show both parts.
train, validation = holdout_partition(selected_wine_data)
print(train, validation, sep="\n")

In [None]:
def k_fold_split(data, k, shuffle=True):
    """Partition ``data`` into ``k`` train/test folds.

    The test parts are consecutive, non-overlapping row ranges that together
    cover every row; the first ``len(data) % k`` folds hold one extra row.
    Each training part consists of all rows outside the fold's test range.

    Args:
        data: DataFrame to split.
        k: Number of folds; must be at least 1.
        shuffle: When true the rows are shuffled (and re-indexed from 0)
            before the folds are cut.

    Returns:
        A DataFrame with one row per fold and the columns ``trainData`` and
        ``testData``, each cell holding a DataFrame.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    if shuffle:
        data = data.sample(frac=1).reset_index(drop=True)

    n_rows = data.shape[0]
    base_size, remainder = divmod(n_rows, k)
    folds = []
    start = 0
    for i in range(k):
        end = start + base_size + (1 if i < remainder else 0)
        test_data = data.iloc[start:end, :]
        # Pick the training rows by position: excluding by index label would
        # also remove rows outside the fold that share a label with it.
        train_positions = list(range(start)) + list(range(end, n_rows))
        train_data = data.iloc[train_positions, :]
        folds.append([train_data, test_data])
        start = end

    return pd.DataFrame(folds, columns=["trainData", "testData"])

In [None]:
# Cut five folds and show the held-out rows of the one labelled 3.
folds = k_fold_split(selected_wine_data, 5)
fourth_test_fold = folds["testData"][3]
print(fourth_test_fold)

In [None]:
def k_fold_train(data, lr, epochs, features, k=5):
    """Cross-validate an ``Adaline`` on ``data`` and report the test errors.

    Args:
        data: DataFrame containing the ``features`` columns and a
            ``goodness`` column used as the target.
        lr: Learning rate passed to ``Adaline.train``.
        epochs: Number of training epochs per fold.
        features: List of column names used as inputs.
        k: Number of folds.

    Returns:
        The number of misclassified test rows, summed over all folds.  The
        same figure is printed together with the settings.
    """
    folds = k_fold_split(data, k)
    errors = 0
    for train_data, test_data in folds.values:
        # A new model per fold: a shared one would start each fold with
        # weights fitted on rows that are now in the test part.
        ada = Adaline()
        ada.train(train_data.loc[:, features].values, train_data['goodness'].values,
                  epochs, lr, verbose=False, mode="batch")
        errors += ada.predict(test_data.loc[:, features].values, test_data['goodness'].values)
    print("lr = {}, errors={}, epochs per fold = {}, features= {}".format(lr, errors, epochs, features))
    return errors

In [None]:
# Learning rates tried in the sweeps below.
lr_array = [
    0.0005,
    0.005,
    0.05,
    0.0035,
    0.009,
    0.0032,
    0.00053,
    0.0001,
    0.01,
]

In [None]:
# For every learning rate, cross-validate pH/alcohol at 1000, 2000, 3000 and
# 4000 epochs.
for rate in lr_array:
    for run_no in range(1, 5):
        n_epochs = run_no * 1000
        print("Run No:{}, epochs={}, lr ={}".format(run_no, n_epochs, rate))
        k_fold_train(selected_wine_data, rate, n_epochs, ['pH', 'alcohol'])

In [None]:
# Learning-rate / epoch sweep over `data` using the columns in `features`.
# NOTE(review): when the cells run top to bottom, `data` has not been assigned
# yet at this point, and `features` as assigned earlier also lists 'quality'
# and 'goodness' -- confirm which values this cell is meant to run with.
for rate in lr_array:
    for i in range(1, 5):
        print ("Run No:{}, epochs={}, lr ={}".format(i, i * 1000, rate))
        k_fold_train(data, rate, i * 1000, features)

In [None]:

# Cross-validate several feature combinations on the extreme wines (quality
# above 7 or below 4), always with the same learning rate and epoch count.
extreme_wines = wine_data[(wine_data['quality'] > 7) | (wine_data['quality'] < 4)]
feature_sets = [
    ['pH', 'alcohol', 'volatile acidity'],
    ['pH', 'alcohol', 'sulphates'],
    ['pH', 'alcohol', 'fixed acidity'],
    ['pH', 'alcohol', 'sulphates', 'volatile acidity'],
    ['pH', 'alcohol', 'sulphates', 'fixed acidity'],
    ['pH', 'alcohol', 'volatile acidity', 'fixed acidity'],
    ['pH', 'alcohol', 'sulphates', 'volatile acidity', 'fixed acidity'],
]
for columns in feature_sets:
    k_fold_train(extreme_wines, 0.0035, 2000, columns)

# V.6 Marvin’s rebuttal

In [None]:

def normalize(data):
    """Return ``data`` centred on its mean and scaled by its range.

    Every value becomes ``(value - mean) / (max - min)``.  The result is a
    new object: ``data`` is not modified, so integer input is not truncated
    and read-only input is accepted.

    Args:
        data: Numeric pandas Series (anything offering ``max()``, ``min()``
            and element-wise arithmetic).

    Returns:
        The normalised values.  If all values are equal (range 0) the
        centred values, i.e. zeros, are returned instead of dividing by zero.
    """
    value_range = data.max() - data.min()
    centred = data - mean(data)
    if value_range == 0:
        return centred
    return centred / value_range

def mean(data):
    """Return the arithmetic mean: the sum of ``data`` divided by its length."""
    total = sum(data)
    count = len(data)
    return total / count

# Load the semicolon-separated data set and normalise the two input columns.
df = pd.read_csv("./Pan Galactic Gargle Blaster.csv", sep=';')
x1 = normalize(df.loc[:, 'wonderflonium'])
x2 = normalize(df.loc[:, 'fallian marsh gas'])

# Re-express the normalised pair in polar form: radius r and angle phi.
df = df.assign(r=pd.Series(x1 * x1 + x2 * x2).pow(1./2))
# NOTE(review): atan2 receives (x1, x2), so the angle is measured from the
# x2 axis -- confirm that this orientation is intended.
# index=df.index keeps the angles aligned with their rows.
df = df.assign(phi=pd.Series([math.atan2(x, y) for (x, y) in zip(x1, x2)], index=df.index))
data = df[['r', 'phi', 'quality']]
# Label each row from this data set's own quality column, not from the
# red-wine frame.
data = data.assign(goodness=data['quality'] > 5)
matrixplot = plot_scatter_matrix(data[['r', 'phi', 'quality']], 7, 4, True)


In [None]:
# Cross-validate on the polar features, using only the rows with quality
# above 8 or below 3.
extreme_data = data[(data['quality'] > 8) | (data['quality'] < 3)]
k_fold_train(extreme_data, 0.0035, 2000, ['r', 'phi'])