# NAMES: Timothy Barao, Marlan McInnes-Taylor
# FSUIDS: tjb13b, mm05f

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import linalg as LA

In [28]:
class FSA:
    """Linear classifier trained with Feature Selection with Annealing (FSA).

    The model keeps one coefficient per column of the training matrix
    (including a leading intercept column of ones) and is fitted by gradient
    descent on the per-sample loss

        l(u) = 0                    if u > 1
               ln(1 + (u - 1)^2)    otherwise

    where ``u = y_i * (x_i . B)`` is the margin of sample ``i``.  While
    training, the number of active coefficients is annealed from ``M`` down
    to ``k`` by repeatedly dropping the coefficients of smallest magnitude.

    NOTE(review): the margin formulation assumes the labels are in {-1, +1};
    confirm against the label files.

    Attributes:
        x_t, x_v: standardized training / validation matrices with an
            intercept column inserted at index 0.
        y_t, y_v: training / validation labels as column vectors.
        B: coefficient vector, one entry per column of ``x_t``.
        s: weight of the L2-norm penalty on ``B``.
        u: annealing parameter of the ``M_i`` schedule.
        N_iter: number of training iterations.
        lr: gradient-descent learning rate.
        k: number of coefficients to keep at the end of training.
        N, M: number of rows / columns of ``x_t``.
        active: boolean mask of the coefficients still selected; set by
            ``train``.
    """

    def __init__(self, xt, yt, xv, yv):
        """Load the data files and set the default hyper-parameters.

        Args:
            xt: path of the training feature file.
            yt: path of the training label file.
            xv: path of the validation feature file.
            yv: path of the validation label file.
        """
        self.load_data(xt, yt, xv, yv)

        self.s = 0.001
        self.u = 100
        self.N_iter = 500
        self.lr = 0.01         #Try 20

        # Recomputed here because load_data() inserts the intercept column
        # after it has set N and M.
        self.M = self.x_t.shape[1]
        self.N = self.x_t.shape[0]
        self.B = np.zeros(self.M)
        self.k = 10  # 30, 100, 300

    @staticmethod
    def _read_features(path):
        """Read a feature matrix: comma-separated for dexter, else whitespace."""
        # sep=r"\s+" replaces the deprecated delim_whitespace=True.
        sep = "," if "dexter" in path else r"\s+"
        return pd.read_csv(path, sep=sep, header=None).to_numpy(dtype=np.float64)

    @staticmethod
    def _read_labels(path):
        """Read a label file (one value per line) as an (N, 1) float array."""
        return pd.read_csv(path, header=None).to_numpy(dtype=np.float64)

    def load_data(self, x_train_file, y_train_file, x_valid_file, y_valid_file):
        """Read, standardize and intercept-augment the train/validation data.

        Both matrices are standardized column-wise with the mean and standard
        deviation of the *training* set, so the validation data is expressed
        on the scale the model is fitted on.  Columns that are constant in
        the training set (zero standard deviation) are set to 0 in both
        matrices instead of being divided by zero.

        Args:
            x_train_file: path of the training feature file.
            y_train_file: path of the training label file.
            x_valid_file: path of the validation feature file.
            y_valid_file: path of the validation label file.
        """
        self.x_t = self._read_features(x_train_file)
        self.y_t = self._read_labels(y_train_file)
        self.x_v = self._read_features(x_valid_file)
        self.y_v = self._read_labels(y_valid_file)

        self.N = self.x_t.shape[0]   # Rows
        self.M = self.x_t.shape[1]   # Columns

        # Standardize data
        mean = self.x_t.mean(axis=0)
        std = self.x_t.std(axis=0)
        varying = std > 0
        scale = np.where(varying, std, 1.0)   # avoid 0/0 on constant columns
        self.x_t = np.where(varying, (self.x_t - mean) / scale, 0.0)
        self.x_v = np.where(varying, (self.x_v - mean) / scale, 0.0)

        # Leading column of ones acts as the intercept term.
        self.x_v = np.insert(self.x_v, 0, 1, axis=1)
        self.x_t = np.insert(self.x_t, 0, 1, axis=1)

    def _margins(self):
        """Return the per-sample margins ``y_i * (x_i . B)`` on the training set."""
        return self.y_t.ravel() * np.dot(self.x_t, self.B)

    def _gradient(self):
        """Return the gradient of ``loss()`` with respect to ``B``."""
        margins = self._margins()
        z = margins - 1
        # d/du ln(1 + (u - 1)^2) = 2(u - 1) / (1 + (u - 1)^2); 0 where u > 1.
        dloss = np.where(margins > 1, 0.0, 2 * z / (1 + z ** 2))
        grad = np.dot(self.x_t.T, dloss * self.y_t.ravel())

        # The L2 norm is not differentiable at B = 0; use the subgradient 0.
        norm = LA.norm(self.B, 2)
        if norm > 0:
            grad = grad + self.s * self.B / norm
        return grad / self.N

    def gradient_update(self):
        """Take one gradient-descent step on ``B`` and return the new ``B``."""
        self.B = self.B - self.lr * self._gradient()
        return self.B

    def loss(self):
        """Return the penalized training loss for the current ``B``.

        The piecewise loss is applied to every sample's margin separately;
        the per-sample losses are summed, the penalty ``s * ||B||_2`` is
        added, and the total is divided by ``N``.
        """
        margins = self._margins()
        # 0            if x > 1
        # ln(1+(x-1)^2) else
        per_sample = np.where(margins > 1, 0.0, np.log1p((margins - 1) ** 2))
        return (per_sample.sum() + self.s * LA.norm(self.B, 2)) / self.N

    def predict(self):
        """Placeholder: prediction is not implemented; always returns 0."""
        return 0

    def train(self):
        """Run ``N_iter`` gradient steps while annealing the feature count.

        After each step the schedule
        ``M_i = k + (M - k) * max(0, (N_iter - 2i) / (2*i*u + N_iter))``
        gives the number of coefficients to keep; the active coefficients
        with the smallest magnitude are dropped (set to 0 and held at 0 for
        the rest of training).  The final selection mask is stored in
        ``self.active``.

        Returns:
            0, as in the original implementation.
        """
        active = np.ones(self.B.shape[0], dtype=bool)
        for i in range(0, self.N_iter):
            self.gradient_update()
            # Dropped coefficients stay out of the model.
            self.B[~active] = 0.0

            temp = (self.N_iter - 2 * i)/(2 * i * self.u + self.N_iter)
            temp = max(0., temp)
            mi = self.k + (self.M - self.k) * temp

            keep = int(mi)
            if keep < active.sum():
                candidates = np.flatnonzero(active)
                by_magnitude = np.argsort(-np.abs(self.B[candidates]), kind="stable")
                active[candidates[by_magnitude[keep:]]] = False
                self.B[~active] = 0.0

            print("Iteration: ", i, "M_i: ", int(mi))
        self.active = active
        return 0

    def print_features(self):
        """Print the row/column counts of the training and validation data."""
        print("X_train Objs: ", self.x_t.shape[0], "X_train Feats: ", self.x_t.shape[1])
        print("Y_train Objs: ", self.y_t.shape[0])

        print("\nX_valid Objs: ", self.x_v.shape[0], "X_valid Feats: ", self.x_v.shape[1])
        print("Y_valid Objs: ", self.y_v.shape[0])


In [29]:
# Select k = [10, 30, 100, 300] features
# Plot the training loss vs. iteration number for k = 10
# Report in a table the misclassification errors on the training and test sets for the models obtained for every k
# Plot the misclassification error on the training and test sets vs. k

# Madelon

In [30]:
# Madelon: feature (.data) and label (.labels) files for the training and
# validation splits.
x_train_file, y_train_file = (
    "../data/MADELON/madelon_train.data",
    "../data/MADELON/madelon_train.labels",
)
x_valid_file, y_valid_file = (
    "../data/MADELON/madelon_valid.data",
    "../data/MADELON/madelon_valid.labels",
)

In [31]:
# Build the model from the Madelon files, report the data dimensions, then train.
model = FSA(
    xt=x_train_file,
    yt=y_train_file,
    xv=x_valid_file,
    yv=y_valid_file,
)
model.print_features()
model.train()

X_train Objs:  2000 X_train Feats:  501
Y_train Objs:  2000

X_valid Objs:  600 X_valid Feats:  501
Y_valid Objs:  600
Iteration:  0 M_i:  501
Iteration:  1 M_i:  359
Iteration:  2 M_i:  280
Iteration:  3 M_i:  230
Iteration:  4 M_i:  195
Iteration:  5 M_i:  170
Iteration:  6 M_i:  150
Iteration:  7 M_i:  135
Iteration:  8 M_i:  123
Iteration:  9 M_i:  112
Iteration:  10 M_i:  104
Iteration:  11 M_i:  96
Iteration:  12 M_i:  90
Iteration:  13 M_i:  85
Iteration:  14 M_i:  80
Iteration:  15 M_i:  75
Iteration:  16 M_i:  72
Iteration:  17 M_i:  68
Iteration:  18 M_i:  65
Iteration:  19 M_i:  62
Iteration:  20 M_i:  60
Iteration:  21 M_i:  57
Iteration:  22 M_i:  55
Iteration:  23 M_i:  53
Iteration:  24 M_i:  51
Iteration:  25 M_i:  50
Iteration:  26 M_i:  48
Iteration:  27 M_i:  47
Iteration:  28 M_i:  45
Iteration:  29 M_i:  44
Iteration:  30 M_i:  43
Iteration:  31 M_i:  42
Iteration:  32 M_i:  41
Iteration:  33 M_i:  40
Iteration:  34 M_i:  39
Iteration:  35 M_i:  38
Iteration:  36 M

Iteration:  346 M_i:  10
Iteration:  347 M_i:  10
Iteration:  348 M_i:  10
Iteration:  349 M_i:  10
Iteration:  350 M_i:  10
Iteration:  351 M_i:  10
Iteration:  352 M_i:  10
Iteration:  353 M_i:  10
Iteration:  354 M_i:  10
Iteration:  355 M_i:  10
Iteration:  356 M_i:  10
Iteration:  357 M_i:  10
Iteration:  358 M_i:  10
Iteration:  359 M_i:  10
Iteration:  360 M_i:  10
Iteration:  361 M_i:  10
Iteration:  362 M_i:  10
Iteration:  363 M_i:  10
Iteration:  364 M_i:  10
Iteration:  365 M_i:  10
Iteration:  366 M_i:  10
Iteration:  367 M_i:  10
Iteration:  368 M_i:  10
Iteration:  369 M_i:  10
Iteration:  370 M_i:  10
Iteration:  371 M_i:  10
Iteration:  372 M_i:  10
Iteration:  373 M_i:  10
Iteration:  374 M_i:  10
Iteration:  375 M_i:  10
Iteration:  376 M_i:  10
Iteration:  377 M_i:  10
Iteration:  378 M_i:  10
Iteration:  379 M_i:  10
Iteration:  380 M_i:  10
Iteration:  381 M_i:  10
Iteration:  382 M_i:  10
Iteration:  383 M_i:  10
Iteration:  384 M_i:  10
Iteration:  385 M_i:  10


0

# Dexter

In [None]:
# Dexter: feature (.csv) and label (.labels) files for the training and
# validation splits.
x_train_file, y_train_file = (
    "../data/dexter/dexter_train.csv",
    "../data/dexter/dexter_train.labels",
)
x_valid_file, y_valid_file = (
    "../data/dexter/dexter_valid.csv",
    "../data/dexter/dexter_valid.labels",
)

In [None]:
# Build the model from the Dexter files and report the data dimensions.
model = FSA(
    xt=x_train_file,
    yt=y_train_file,
    xv=x_valid_file,
    yv=y_valid_file,
)
model.print_features()

# Gisette 

In [None]:
# Gisette: feature (.data) and label (.labels) files for the training and
# validation splits.
x_train_file, y_train_file = (
    "../data/Gisette/gisette_train.data",
    "../data/Gisette/gisette_train.labels",
)
x_valid_file, y_valid_file = (
    "../data/Gisette/gisette_valid.data",
    "../data/Gisette/gisette_valid.labels",
)

In [None]:
# Build the model from the Gisette files and report the data dimensions.
model = FSA(
    xt=x_train_file,
    yt=y_train_file,
    xv=x_valid_file,
    yv=y_valid_file,
)
model.print_features()

# Commented Code


In [None]:
# In Load Data:    
    # x_valid = np.insert(x_valid, 0, 1, axis=1)
    # x_train = np.insert(x_train, 0, 1, axis=1)

    #y_valid = np.where(y_valid == -1, 0, y_valid)  
    #y_train = np.where(y_train == -1, 0, y_train)
    
    