# NAMES: Timothy Barao, Marlan McInnes-Taylor
# FSUIDS: tjb13b, mm05f

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def load_data(x_train_file, y_train_file, x_valid_file, y_valid_file):

    with open(x_train_file) as file:
        if "Dexter" in x_train_file:
            x_train = pd.read_csv(file, delim_whitespace=False, header=None)
        else:    
            x_train = pd.read_csv(file, delim_whitespace=True, header=None)
        x_train = x_train.to_numpy(dtype=np.float64)

    with open(y_train_file) as file:
        y_train = pd.read_csv(file, header=None)
        y_train = y_train.to_numpy(dtype=np.float64)

    with open(x_valid_file) as file:
        if "Dexter" in x_valid_file:
            x_valid = pd.read_csv(file, delim_whitespace=False, header=None)
        else:    
            x_valid = pd.read_csv(file, delim_whitespace=True, header=None)
        x_valid = x_valid.to_numpy(dtype=np.float64)

    with open(y_valid_file) as file:
        y_valid = pd.read_csv(file, header=None)
        y_valid = y_valid.to_numpy(dtype=np.float64)

    N = x_train.shape[0]   # Rows
    M = x_train.shape[1]   # Columns 

    # Standardize data
    for i in range(0, M):
        if (not np.any(x_valid[:, i])):
            x_valid[:, i].fill(0)
        else:  
            x_valid[:, i] = (x_valid[:, i] - np.mean(x_valid[:, i]))/(np.std(x_valid[:, i]))
            
        if(not np.any(x_train[:, i])):
            x_train[:, i].fill(0)
        else:
            x_train[:, i] = (x_train[:, i] - np.mean(x_train[:, i]))/(np.std(x_train[:, i]))
    
    # x_valid = np.insert(x_valid, 0, 1, axis=1)
    # x_train = np.insert(x_train, 0, 1, axis=1)

    y_valid = np.where(y_valid == -1, 0, y_valid)  
    y_train = np.where(y_train == -1, 0, y_train)  
    
    print("X_train Objs: ", x_train.shape[0], "X_train Feats: ", x_train.shape[1])
    print("Y_train Objs: ", y_train.shape[0])
    
    print("\nX_valid Objs: ", x_valid.shape[0], "X_valid Feats: ", x_valid.shape[1])
    print("Y_valid Objs: ", y_valid.shape[0])
    
    return x_train, y_train, x_valid, y_valid


In [3]:
def penalty(x, lr):
    indices = np.where(np.absolute(x) < lr) 
    mask = np.ones(x.shape, np.bool)
    mask[indices] = 0

    x[indices] = pow(lr,2) - np.power(x[indices] - lr, 2)
    x[mask] = pow(lr,2)
    
    return x
    # Penalty 
    # P_lr(X) = {lr^2 - (|x| - lr)^2     if |x| < lr}
    #          {lr^2                    else       }
    
def log_like(X, Y, W,lr):
    N = ((1)/(X.shape[0]))
    x = N * (np.log(1 + np.exp(np.dot(-Y.flatten(), np.dot(X, W)))) + lr * penalty(W, lr).sum())
    return x
    
    # Loss/log-likliehood
    # L(W) = 1/N * np.log(1 + np.exp(-Y * W * X)) + lr * P_lr(W).sum()     #take sum of all penalties of W    
    
def updateWeights(data, labels, weights, lr):
    N = x_train.shape[0]
    weights = weights + (1/N * np.dot(data.T, (labels.flatten() - (1/(1 + np.exp(np.dot(-data, weights)  ))))))
    weights[np.absolute(weights) <= lr] = 0
    return weights

    # Hybrid Penalties
    # Hybrid(X, lr) =  {0    if |x| <= lr}   return array of each value   
    #                  {x/1  if |x| > lr} 

    # Updating Weights
    # w = w + 1/N * X * (Y - (1 / (1 + np.exp(-X * W))))
    # w = Hybrid(w, lr)

    
def predict(samples, W):
    prod = np.exp(W[0] + np.dot(samples, W))
    pred_1 = prod / (1 + prod)
    pred_0 = 1  / (1 + prod)
   
    return np.round(pred_1)    
    # Predict 


In [4]:
def train(x_train, y_train, W, lr):
    L_i = []
    for i in range(100):
        W = updateWeights(x_train, y_train, W, lr)
        L_i.append(log_like(x_train, y_train, W, lr))
        print("\rIteration: " + str(i) + " Non_zero weights: " +  str(np.sum(W != 0.)), sep='', end='', flush=True)
    
    

# Gisette Dataset

In [5]:
x_train_file = "Data/Gisette/gisette_train.data"
y_train_file = "Data/Gisette/gisette_train.labels"

x_valid_file = "Data/Gisette/gisette_valid.data"
y_valid_file = "Data/Gisette/gisette_valid.labels"

In [6]:
x_train, y_train, x_valid, y_valid = load_data(x_train_file, y_train_file, x_valid_file, y_valid_file)

AttributeError: 'DataFrame' object has no attribute 'to_numpy'

In [None]:
learning_rates = [1.0, 0.1, 0.001, 0.0001]
errorGis = pd.DataFrame({"Features":[0, 0, 0, 0, 0], "Training Error (%)":[0, 0, 0, 0, 0], "Test Error (%)":[0, 0, 0, 0, 0]}, index=learning_rates)
errorGis.index.name = "Lambda"
for lr in learning_rates:                 
    W = np.zeros(x_train.shape[1])
    train(x_train, y_train, W, lr)
    
    preds = predict(x_train, W)    
    missed = len(np.where(preds != y_train))
    train_accuracy = (y_train.shape[0] - missed) / (y_train.shape[0])
    
    preds = predict(x_valid, W)    
    missed = len(np.where(preds != y_valid))
    valid_accuracy = (y_valid.shape[0] - missed) / (y_valid.shape[0])
    
    print("\nlr: " + str(lr) + " Train Accuracy: " +  str(train_accuracy *100.00),)
    print("lr: " + str(lr) + " Valid Accuracy: " +  str(valid_accuracy *100.00), "\n")
    
    errorGis.loc[lr, 'Features'] = np.sum(W != 0.)
    errorGis.loc[lr, 'Training Error (%)'] = round(100 - train_accuracy * 100, 2)
    errorGis.loc[lr, 'Test Error (%)'] = round(100 - valid_accuracy * 100, 2)
    

# Dexter Dataset

In [None]:
x_train_file = "Data/Dexter/dexter_train.csv"
y_train_file = "Data/Dexter/dexter_train.labels"

x_valid_file = "Data/Dexter/dexter_valid.csv"
y_valid_file = "Data/Dexter/dexter_valid.labels"

In [None]:
x_train, y_train, x_valid, y_valid = load_data(x_train_file, y_train_file, x_valid_file, y_valid_file)

In [None]:
learning_rates = [1.0, 0.1, 0.001, 0.0001]
errorDex = pd.DataFrame({"Features":[0, 0, 0, 0, 0], "Training Error (%)":[0, 0, 0, 0, 0], "Test Error (%)":[0, 0, 0, 0, 0]}, index=learning_rates)
errorDex.index.name = "Lambda"
for lr in learning_rates:                 
    W = np.zeros(x_train.shape[1])
    train(x_train, y_train, W, lr)
    
    preds = predict(x_train, W)    
    missed = len(np.where(preds != y_train))
    train_accuracy = (y_train.shape[0] - missed) / (y_train.shape[0])
    
    preds = predict(x_valid, W)    
    missed = len(np.where(preds != y_valid))
    valid_accuracy = (y_valid.shape[0] - missed) / (y_valid.shape[0])
    
    print("\nlr: " + str(lr) + " Train Accuracy: " +  str(train_accuracy *100.00),)
    print("lr: " + str(lr) + " Valid Accuracy: " +  str(valid_accuracy *100.00), "\n")
    
    errorDex.loc[lr, 'Features'] = np.sum(W != 0.)
    errorDex.loc[lr, 'Training Error (%)'] = round(100 - train_accuracy * 100, 2)
    errorDex.loc[lr, 'Test Error (%)'] = round(100 - valid_accuracy * 100, 2)
    

# Madelon Dataset

In [None]:
x_train_file = "Data/Madelon/madelon_train.data"
y_train_file = "Data/Madelon/madelon_train.labels"

x_valid_file = "Data/Madelon/madelon_valid.data"
y_valid_file = "Data/Madelon/madelon_valid.labels"

In [None]:
x_train, y_train, x_valid, y_valid = load_data(x_train_file, y_train_file, x_valid_file, y_valid_file)

In [None]:
learning_rates = [1.0, 0.1, 0.001, 0.0001]
errorMad = pd.DataFrame({"Features":[0, 0, 0, 0, 0], "Training Error (%)":[0, 0, 0, 0, 0], "Test Error (%)":[0, 0, 0, 0, 0]}, index=learning_rates)
errorMad.index.name = "Lambda"
for lr in learning_rates:                 
    W = np.zeros(x_train.shape[1])
    train(x_train, y_train, W, lr)
    
    preds = predict(x_train, W)    
    missed = len(np.where(preds != y_train))
    train_accuracy = (y_train.shape[0] - missed) / (y_train.shape[0])
    
    preds = predict(x_valid, W)    
    missed = len(np.where(preds != y_valid))
    valid_accuracy = (y_valid.shape[0] - missed) / (y_valid.shape[0])
    
    print("\nlr: " + str(lr) + " Train Accuracy: " +  str(train_accuracy *100.00),)
    print("lr: " + str(lr) + " Valid Accuracy: " +  str(valid_accuracy *100.00), "\n")
    
    errorMad.loc[lr, 'Features'] = np.sum(W != 0.)
    errorMad.loc[lr, 'Training Error (%)'] = round(100 - train_accuracy * 100, 2)
    errorMad.loc[lr, 'Test Error (%)'] = round(100 - valid_accuracy * 100, 2)
    

In [None]:
errorTableFull = pd.concat([errorGis, errorDex, errorMad])
errorTableFull

# Commented Code

In [None]:
    #x_train = (x_train - np.mean(x_train, axis=0)) / np.std(x_train, axis=0)
    #x_valid = (x_valid - np.mean(x_valid, axis=0)) / np.std(x_valid, axis=0)
    
    # if columns are 0, return 0
                
            
        
        #x_valid = np.insert(x_valid, 0, 1, axis=1)
        #x_train = np.insert(x_train, 0, 1, axis=1)

    #y_valid = np.where(y_valid == -1, 0, y_valid)  
    #y_train = np.where(y_train == -1, 0, y_train)   
    
                       
    #print("\n\n=-=-=-=-=-=-=-= x_valid =-=-=-=-=-=-=-= ")
    #print("Col: ", i, " mean: ", x_valid[:, i].mean())
    #print("Col: ", i, " std: ", x_valid[:, i].std())


    #print("=-=-=-=-=-=-=-= x_train =-=-=-=-=-=-=-= ")
    #print("Col: ", i, " mean: ", x_train[:, i].mean())
    #print("Col: ", i, " std: ", x_train[:, i].std())   