In [9]:
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [74]:
class Regularization:
    def __init__(self, X_train,y_train,X_test = None,y_test = None,w_vect = None):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        
        # in the non-linear case, will define Z_mat
        self.Z_mat = None
        
        if w_vect is None:
            w_vect = np.array([0.,0.,0.]).T
        else: 
            w_vect = w_vect
    
    def transform(self,data_set):
        arr = []
        for point in data_set:
            x1,x2 = point[1],point[2]
            arr.append([1,x1,x2,x1**2,x2**2,x1*x2,np.abs(x1-x2),np.abs(x1+x2)])
        return arr
    
    def lr_train(self,k = None):
        y_vect = np.array([self.y_train]).T

        self.Z_train = self.transform(self.X_train)
        self.w_vect = np.dot(np.linalg.pinv(self.Z_train),y_vect)
        
        # add regularization (weight decay)
        if k is not None:
            lmbda = 10**(k)
            w_sqrd = (np.linalg.norm(self.w_vect))**2
            N = len(y_vect)
            decay = (lmbda / float(N)) * w_sqrd 
            
            regularization = np.linalg.pinv(self.Z_train + decay)
            self.w_vect = np.dot(regularization, y_vect)

             
    def test(self):
        error = 0
        self.Z_test = self.transform(self.X_test)
        y_pred = np.sign(np.dot(self.Z_test, self.w_vect))

        # count all different members
        for i in range(len(y_pred)):
            if y_pred[i] != self.y_test[i]:
                error += 1
        return error
        
def run(e_in=False,k=None):
    
    # load training and testing data
    df1 = pd.read_table("in.dta.txt",delim_whitespace=True,header=None)
    df2 = pd.read_table("out.dta.txt",delim_whitespace=True, header = None)
    X_train = [[1,df1[1][i],df1[0][i]] for i in range(len(df1)-1)]
    y_train = [df1[2][i] for i in range(len(df1)-1)]
    
    if not e_in:
        # load testing data
        X_test = [[1,df2[1][i],df2[0][i]] for i in range(len(df2)-1)]
        y_test = [df2[2][i] for i in range(len(df2)-1)]
        
    else:
        # training data is used for testing
        X_test = X_train
        y_test = y_train
        
    reg = Regularization(X_train,y_train,X_test,y_test)
    
    reg.lr_train(k=k)
    error = reg.test()
    
    return error / float(len(X_test))
        
if __name__ == "__main__":
    print(run(True)) # returns 0.0294 
    print(run(False)) # returns 0.0803 closest is 0.08
    
    print(run(True,k = -3)) # returns 0.0294
    print(run(False,k = -3)) # returns 0.0803 so have the same results
    
    print(run(True,k = 3)) # returns 0.5588
    print(run(False, k = 3)) # returns 0.5301
    
    print(run(True, k =-1), run(False, k = -1)) # this gives the smallest error E_out = 0.05622
    

0.029411764705882353
0.08032128514056225
[[  1.00098760e+00   8.39208985e-01  -7.78482605e-01   7.03602687e-01
    6.08561413e-01  -6.52380990e-01   1.61867919e+00   5.97387747e-02]
 [  1.00098760e+00   8.96365035e-01   1.56622515e-01   8.02688347e-01
    2.52098300e-02   1.40339590e-01   7.40730125e-01   1.05199994e+00]
 [  1.00098760e+00  -7.16792345e-01  -5.89200983e-02   5.16195661e-01
    4.57653762e-03   4.39881528e-02   6.58859852e-01   7.78675258e-01]
 [  1.00098760e+00   7.59920985e-01   2.08583965e-01   5.76967480e-01
    4.40838534e-02   1.58539412e-01   5.52324625e-01   9.67517345e-01]
 [  1.00098760e+00  -3.74499555e-01  -1.94995515e-01   1.41978212e-01
    3.93969881e-02   7.45767499e-02   1.80491645e-01   5.72457885e-01]
 [  1.00098760e+00  -8.41566205e-01   5.89477075e-01   7.10884527e-01
    3.47307461e-01  -4.94846440e-01   1.43203088e+00   2.55051945e-01]
 [  1.00098760e+00  -5.47328895e-01   8.18619214e-03   3.01638589e-01
    1.03942440e-03  -2.95949951e-03   5.565