In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
from sklearn.decomposition import PCA
import os
from subprocess import call
import pickle

class TargetFunction:
    """Bundle a function with the text used to display it.

    The constructor stores its two arguments unchanged as the attributes
    ``function`` and ``string_representation``; ``str()`` on the instance
    yields the latter.
    """

    def __init__(self, function, string_representation):
        """Keep the function and its textual description (no validation)."""
        self.string_representation = string_representation
        self.function = function

    def __str__(self):
        """Return the stored textual description."""
        description = self.string_representation
        return description
    

class DataSet:
    """Named toy and real datasets used for the linear-model demos.

    After construction the data is available as ``self.X`` (for most
    datasets the first column is a constant 1 encoding the bias) and the
    targets as ``self.y``. ``self.xlim`` / ``self.ylim`` hold the axis
    limits used by the plotting helpers.
    """

    # Potential features:
    # - Add parameter that allows to decide if bias should be encoded in data.
    # - Add parameter that allows to decide if labels should be {0, 1} or {-1, +1}
    def __init__(self, name, n=100, d=2, var=0.8, means=np.array([[8,2], [2,8]]), xlim=None, ylim=None):
        """Build the dataset selected by ``name``.

        Parameters
        ----------
        name : str
            One of "perceptron", "cats_vs_dogs", "pocket",
            "linear_regression", "linear_classification" (creates no data),
            "breast_cancer", "breast_cancer_2d", "logistic_regression" or
            "not_linearly_seperable". Any other name creates no data either.
        n : int
            Number of points for the generated datasets.
        d : int
            Dimension; only used by "linear_regression".
        var : float
            Spread of each class; only used by "logistic_regression".
        means : array-like
            Class means; only used by "logistic_regression".
        xlim, ylim : sequence of two numbers, optional
            Plot limits. ``None`` (the default) means ``[0, 10]``; a fresh
            list is created per instance so instances never share limits.
        """
        # A list literal as default argument would be shared by every
        # instance, so the default is materialised here instead.
        self.xlim = [0, 10] if xlim is None else xlim
        self.ylim = [0, 10] if ylim is None else ylim

        self.name = name
        if name == "perceptron":
            self.X, self.y = self.make_classification(n, 2, means=np.array([[2,4], [8,5]]))
        elif name == "cats_vs_dogs":
            self.X, self.y = self.cifar_cats_dogs()
        elif name == "pocket":
            self.X, self.y = self.make_classification(n, 2, means=np.array([[3,5], [8,5]]), variance=1.7)
        elif name == "linear_regression":
            self.linreg = True
            self.X, self.y, self.target_function = self.make_regression(n, d)
        elif name == "linear_classification":
            pass
        elif name == "breast_cancer":
            self.X, self.y = sklearn.datasets.load_breast_cancer(return_X_y=True)
            rows, _ = self.X.shape
            # Encode the bias as a leading column of ones.
            self.X = np.concatenate((np.ones((rows, 1)), self.X), axis=1)
            self.y = 2*self.y-1  # change from {0, 1} to {-1, +1}
            self.normalize_mean()
            # Centering turned the all-ones bias column into zeros (a zero
            # column makes X^T X singular). Adding 1 restores the bias
            # column and shifts every feature to mean 1.
            self.X += 1

        elif name == "breast_cancer_2d":
            X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
            self.X, self.y = self.pca_2d(X, y)
            self.normalize_mean()
            # Same as for "breast_cancer": restores the zeroed bias column
            # and shifts both projected features to mean 1.
            self.X += 1
            # update plot limits after normalization
            self.xlim = [np.min(self.X[:,1]), np.max(self.X[:,1])]
            self.ylim = [np.min(self.X[:,2]), np.max(self.X[:,2])]

        elif name == "logistic_regression":
            self.X, self.y = self.make_classification(n, 2, means=means, variance=var)
        elif name == "not_linearly_seperable":
            self.X, self.y = self.non_linear_circle(n, 0.35)

    @staticmethod
    def _safe_std(X):
        """Column-wise standard deviation with zeros replaced by 1 (avoids dividing by zero)."""
        std = np.std(X, axis=0)
        return np.where(std == 0, 1.0, std)

    def normalize(self):
        """Center every column of ``self.X`` and scale it to unit standard deviation.

        Constant columns (e.g. the encoded bias) are centered but not scaled,
        so they become 0 instead of NaN.
        """
        self.X = (self.X - np.mean(self.X, axis=0)) / self._safe_std(self.X)

    def normalize_mean(self):
        """Subtract the column means from ``self.X`` (a constant column becomes 0)."""
        self.X = self.X - np.mean(self.X, axis=0)

    def normalize_std(self):
        """Divide every column of ``self.X`` by its standard deviation.

        Constant columns (e.g. the encoded bias) are left unchanged instead
        of being turned into inf/NaN.
        """
        self.X = self.X / self._safe_std(self.X)

    def pca_2d(self, X, y):
        """Project ``X`` onto its first two principal directions and encode the bias.

        Returns the (n, 3) matrix ``[1, pc1, pc2]`` and the labels mapped
        from {0, 1} to {-1, +1}. As a side effect ``self.xlim`` and
        ``self.ylim`` are set to the range of the two projected columns.
        Note that the projection multiplies the raw (uncentered) ``X`` with
        the fitted components.
        """
        n, _ = X.shape

        pca = PCA(n_components=2)
        pca.fit(X)

        X = X @ pca.components_[:2].T
        X = np.concatenate((np.ones((n, 1)), X), axis=1)

        self.xlim = [np.min(X[:,1]), np.max(X[:,1])]
        self.ylim = [np.min(X[:,2]), np.max(X[:,2])]

        return X, y*2-1 # change from {0, 1} to {-1, +1}

    def non_linear_circle(self, n, variance=0.2):
        ''' Generate a binary classification problem that is not linearly seperable (circle).

        Class 0 is scattered around radius 0.3 and class 1 around radius 2.
        ``variance`` is passed to ``np.random.normal`` as ``scale`` (i.e. it
        acts as the standard deviation of the radius). Class 0 gets ``n//2``
        points and class 1 the remaining ``n - n//2``, so odd ``n`` works.

        Returns the shuffled (n, 3) matrix ``[1, x, y]`` and labels in {0, 1}.
        '''
        r_class0 = 0.3
        r_class1 = 2

        n0 = n // 2
        n1 = n - n0  # equals n0 for even n, one more for odd n

        # for each point, normally distribute around radius and uniformly pick an angle.
        angles = np.random.rand(n)*2*np.pi
        noisy_r_0 = np.random.normal(loc=r_class0, scale=variance, size=n0)
        noisy_r_1 = np.random.normal(loc=r_class1, scale=variance, size=n1)

        # compute (x,y) points from polar coordinates (r, angle).
        x_0 = noisy_r_0 * np.cos(angles[:n0])
        y_0 = noisy_r_0 * np.sin(angles[:n0])
        x_1 = noisy_r_1 * np.cos(angles[n0:])
        y_1 = noisy_r_1 * np.sin(angles[n0:])

        # Prepend the bias column of ones to each class.
        X_0 = np.concatenate((np.ones((n0, 1)), x_0.reshape(n0, 1), y_0.reshape(n0, 1)), axis=1)
        X_1 = np.concatenate((np.ones((n1, 1)), x_1.reshape(n1, 1), y_1.reshape(n1, 1)), axis=1)
        X = np.concatenate((X_0, X_1), axis=0)
        y = np.concatenate((np.zeros(n0), np.ones(n1)))

        # Shuffle so the classes are interleaved.
        perm = np.random.permutation(n)

        X = X[perm]
        y = y[perm]
        return X, y

    def make_regression(self, n, d):
        """Generate a noisy linear regression problem.

        For ``d == 2`` the data is ``[1, x]`` with ``x`` uniform in [0, 10)
        and targets scattered around a random line; the line is returned as
        a ``TargetFunction`` whose ``w`` attribute is ``[b, a]``.
        For any other ``d`` the data is uniform in [0, 1)^d WITHOUT a bias
        column, the targets are ``X @ w`` plus noise for a random ``w``, and
        ``None`` is returned in place of the target function.

        Returns ``(X, y, target_function)``.
        """

        if d == 2:
            # Generate normally distributed noise that displaces points from line.
            # (the value is used as the standard deviation of the noise)
            noise_variance = 0.5
            normal_distributed_noise = np.random.normal(loc=0, scale=noise_variance, size=n)

            # Generate random line f(x)=ax+b such that points normally distributed around line will
            # have high probability of being inside plot.
            b = np.random.rand(1)*5+2 # 'b' is uniform random in [2, 7)
            sign = np.random.choice([-1, +1])
            a = sign* np.random.rand(1)/10*4 # 'a' is in (-4/10, 4/10) so the line stays inside the 10x10 box

            target_function = TargetFunction(lambda x: a*x+b, str(round(a[0], 2)) + "*x+" + str(round(b[0], 2)))
            target_function.w = [b, a]

            xs = np.ones((n, 2))
            xs[:, 1] = np.random.rand(n)*10

            ys = target_function.function(xs[:, 1]) + normal_distributed_noise

            return xs, ys, target_function
        else:
            # Generate weight vector
            w = np.random.rand(d)

            noise_variance = 0.05
            normal_distributed_noise = np.random.normal(loc=0, scale=noise_variance, size=n)

            X = np.random.rand(n, d)
            y = X @ w + normal_distributed_noise

            return X, y, None

    def make_classification(self, n, d, means=None, num_classes=2, linear_seperable=False, variance=0.8):
        """ Creates data for a 'num_classes' classification problem.

        Each point picks a class uniformly at random and is drawn from a
        normal distribution centered at that class' mean, with ``variance``
        passed to ``np.random.normal`` as ``scale``. If ``means`` is None the
        means are drawn uniformly from [0, 10)^d. ``linear_seperable`` is
        currently unused.

        Returns the (n, d+1) matrix with a leading column of ones and the
        labels ``2*class - 1`` (i.e. {-1, +1} for two classes).
        """
        # Generate num_classes means
        if means is None:
            means = np.random.rand(num_classes, d)*10

        # Initialize data matrix and labels array
        # Encode 1's in first dimension
        X = np.ones((n, d+1))
        y = np.zeros(n, dtype=np.int32)

        # Kept as a per-sample loop so the sequence of random draws (and thus
        # seeded results) stays the same.
        for i in range(n):
            y[i] = np.random.choice(num_classes)
            X[i, 1:d+1] = np.random.normal(loc=means[y[i]], scale=variance)

        # Have labels be {-1, +1}
        y = y*2-1

        return X, y

    def plot(self):
        """ Assumes the data is 2d and plots it. Throws exception if data isn't 2d (with bias encoded).

        Labels that are encoded as {0, 1} are converted in place to {-1, +1}
        before plotting; labels already in {-1, +1} are left untouched, also
        when only one class is present.
        """
        n, d = self.X.shape
        assert d == 3, "Data needs to be 2d (with bias encoded) to be plotted."

        # Only re-map when every label is 0 or 1; comparing against the set
        # {-1, +1} would wrongly re-map a dataset containing only class -1.
        if np.isin(np.asarray(self.y), (0, 1)).all():
            self.y = self.y*2-1

        X_class_0 = self.X[self.y == -1]
        X_class_1 = self.X[self.y == 1]
        fig, ax_data = plt.subplots(1, 1, figsize=(4, 4))

        ax_data.set_title("Dataset: " + self.name)
        ax_data.set_xlabel("X dimension of data")
        ax_data.set_ylabel("Y dimension of data")
        ax_data.set_xlim(self.xlim[0], self.xlim[1])
        ax_data.set_ylim(self.ylim[0], self.ylim[1])
        ax_data.plot(X_class_0[:,1], X_class_0[:,2], 'go')
        ax_data.plot(X_class_1[:,1], X_class_1[:,2], 'bx')
        fig.canvas.draw()

    def plot_regression(self):
        """Plot the regression data (column 1 of X against y) and the target line.

        Uses ``self.xlim`` / ``self.ylim`` for the axes, consistent with
        ``plot``; with the default limits this is the 0..10 box.
        Requires ``self.target_function`` to be set.
        """
        x_low, x_high = self.xlim[0], self.xlim[1]

        plt.title("Dataset: " + self.name + ", Target Function: " + str(self.target_function))
        plt.xlabel("X dimension of data")
        plt.ylabel("Y dimension of data")
        plt.xlim(x_low, x_high)
        plt.ylim(self.ylim[0], self.ylim[1])
        plt.plot(self.X[:,1], self.y, 'go')
        # Draw the target function as a dashed line across the visible x range.
        plt.plot(
            [x_low, x_high],
            [self.target_function.function(x_low), self.target_function.function(x_high)],
            '--c'
        )
        plt.show()

    # CIFAR and CATSvsDOGS

    @staticmethod
    def _run_or_raise(command, action):
        """Run ``command`` (an argument list, no shell) and raise RuntimeError if it fails."""
        try:
            status = call(command)
        except OSError as err:
            raise RuntimeError(
                "Could not {}: '{}' could not be started.".format(action, command[0])
            ) from err
        if status != 0:
            raise RuntimeError(
                "Could not {}: '{}' exited with status {}.".format(action, command[0], status)
            )

    def download_cifar10(self):
        ''' Downloads CIFAR10 if there is no local copy.

        Uses the external ``wget`` and ``tar`` programs. Raises RuntimeError
        if either of them fails, instead of reporting success regardless.
        '''
        archive = "cifar-10-python.tar.gz"

        # If there is no local copy of CIFAR10 then download it.
        if not os.path.exists(archive):
            print("You don't have the 'cifar-10' dataset! ")
            print("Don't worry, I'll start downloading it right away. ")
            print("It's 163 Mb so it might take a few minutes. ")
            print("Downloading... ", end='')
            try:
                self._run_or_raise(
                    ["wget", "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"],
                    "download CIFAR10"
                )
            except RuntimeError:
                # Remove a partial download so the next attempt starts clean
                # instead of trying to extract a truncated archive.
                if os.path.exists(archive):
                    os.remove(archive)
                raise
            print("DONE!")

        # Extract the .tar.gz file.
        cifar_python_directory = os.path.abspath("cifar-10-batches-py")
        if not os.path.exists(cifar_python_directory):
            print("Extracting the archive... ", end='')
            self._run_or_raise(["tar", "-zxvf", archive], "extract the CIFAR10 archive")
            print("DONE!")

    def load_cifar(self):
        ''' Loads a single batch of CIFAR10.

        Returns the raw data array and the labels as a numpy array.
        '''

        # Download CIFAR if there is no local copy.
        self.download_cifar10()

        # Open a single batch of the CIFAR10 dataset.
        # NOTE(review): the batch comes from an archive fetched over plain
        # HTTP and unpickling can execute arbitrary code; only use with a
        # trusted copy of the dataset.
        with open("cifar-10-batches-py/data_batch_1", 'rb') as fo:
            data_dict = pickle.load(fo, encoding='bytes')

        # Retrieve data and labels (dtype is uint8)
        X = data_dict[b'data']
        y = np.array(data_dict[b'labels'])

        return X, y

    def cifar_cats_dogs(self):
        """Return only the cat and dog images of the loaded CIFAR10 batch.

        The labels are returned as a boolean array: True for cats (label 3)
        and False for dogs (label 5). No bias column is added to X.
        """
        # Load the cifar dataset
        X, y = self.load_cifar()

        # Get only the dogs/cats. Dogs are represented as '5'
        # and cats as 3
        is_cat_or_dog = np.logical_or(y==5, y==3)
        X = X[is_cat_or_dog]
        y = y[is_cat_or_dog]
        y = (y == 3) # 3 -> True,  5 -> False

        return X, y

    def show_cats_vs_dogs(self):
        """Show the first 25 images of ``self.X`` in a 5x5 grid.

        Each row of ``self.X`` is reshaped to (3, 32, 32) and transposed to
        (32, 32, 3) for display.
        """
        n, d = self.X.shape

        X = self.X.reshape(n, 3, 32, 32).transpose(0,2,3,1).astype("uint8")

        fig, axes = plt.subplots(5, 5, figsize=(8,8))
        for i in range(5):
            for j in range(5):
                axes[i,j].imshow(X[i+j*5])

        fig.canvas.draw()


