# Data input and reshape

In [64]:
import numpy as np

np.random.seed(0)
X_train_fpath = './data/X_train'
Y_train_fpath = './data/Y_train'
X_test_fpath = './data/X_test'
output_fpath = './output_{}.csv'

# Totally 54256 (number of data) X 511 (number of feat.)
# Parse csv files to numpy array
with open(X_train_fpath) as f:
    # next() 的原因是不需要 feature 那行 
    next(f)
    #一行代表一個 data , strip 掉換行符號後,再把每個 data的 feature 弄成 list
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype = float)
with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)
    

X_mean = np.mean(X_train[:,:],axis=0).reshape(1,-1) #reshpae(1,-1) 代表 row 搞成一列 , column -1 的意思是 column 數自動補齊計算.
X_std = np.std(X_train[:,:],axis=0).reshape(1,-1)



# Split validate data

In [65]:
Split_ratio = 0.1
Validate_len = int(np.round(Split_ratio*X_train.shape[0]))

X_validate = X_train[:Validate_len,:]
Y_validate = Y_train[:Validate_len]
X_training = X_train[Validate_len:,:]
Y_training = Y_train[Validate_len:]

print ("Validate_shape:",X_validate.shape,Y_validate.shape,"Training_shape:",X_training.shape,Y_training.shape)

Validate_shape: (5426, 510) (5426,) Training_shape: (48830, 510) (48830,)


# Normalize Training data 

In [66]:
X_training = (X_training[:,:] - X_mean) / (X_std+1e-8)
X_validate = (X_validate[:,:] - X_mean) / (X_std+1e-8)
X_test = (X_test[:,:] - X_mean) / X_std # test data must normalized with the mean & std of training data
print(X_training.shape,X_validate.shape,X_test.shape)

(48830, 510) (5426, 510) (27622, 510)


  This is separate from the ipykernel package so we can avoid doing imports until


In [67]:
train_size = X_training.shape[0]
Validate_size = X_validate.shape[0]
test_size = X_test.shape[0]
data_dim = X_train.shape[1]

print('Size of training data:{}'.format(train_size))
print('Size of validate data:{}'.format(Validate_size))
print('Size of testing data:{}'.format(test_size))
print('Data dimension:{}'.format(data_dim))

Size of training data:48830
Size of validate data:5426
Size of testing data:27622
Data dimension:510


# Define the function of training

In [78]:
import random

#Randomize the priority of the data
def shuffle(X_data,Y_data):
    
    data_len = X_data.shape[0]
    random_pri = []
    li = list(range(data_len))
    
    while len(li)>0 :
        
        select = random.randint(0,len(li)-1)
        random_pri.append(li.pop(select))
    
    X_data_new = np.zeros((X_data.shape[0],X_data.shape[1]))
    Y_data_new = np.zeros(Y_data.shape[0])
    
    for i in range(len(random_pri)):
        
            X_data_new[i] = X_data[random_pri[i]]
            Y_data_new[i] = Y_data[random_pri[i]]
            
    return X_data_new , Y_data_new

#Calculate Cross Entropy of all data
def CrossEntorpy(y_train,y_predict):
    
    return -y_train@np.log(y_predict)-(1-y_train)@np.log(1-y_predict)

def sigmoid(z) :
    
    return np.clip(1/(1+np.exp(-z)),1e-8,1-1e-8)

def sigmoid_linearinput(X,w,b):
    
    linear = X@w+b
    return sigmoid(linear)

def gradient(y_hat,X,w,b):
    
    y_pred = sigmoid_linearinput(X,w,b)
    return -1*X.T@(y_hat - y_pred) , -1*np.sum(y_hat - y_pred)

#Calculate the accuracy of model 
def accuracy(y_train,y_pred):
    
    return (1-sum(np.round(np.abs(y_pred-y_train)))/y_train.shape)*100

#The main training process
def trianing(epoch, Y_training, Y_validate, X_training, X_validate, w=None, b=None, lr = 0.002, mini_batch_size = 1000):
    
    # accuracy & loss collecting
    Training_accuracy = []
    Training_loss = []
    Validate_accuracy = []
    Validate_loss = []
    
    #parameter w,b initialize
    w = np.zeros(X_training.shape[1])
    b = np.zeros(1)
    G_w = np.zeros(X_training.shape[1])
    G_b = np.zeros(1)
    eps = 1e-8
    
    for i in range(epoch):
        
        shuffle(X_training,Y_training) # Shuffle the training data to the increase the variation of mini-batch
        
        for batch_count in range(int(np.floor(X_training.shape[0]/mini_batch_size))+1):
            
            X = X_training[batch_count*mini_batch_size:(batch_count+1)*mini_batch_size]
            Y = Y_training[batch_count*mini_batch_size:(batch_count+1)*mini_batch_size]
            
            #print()
            #print(batch_count,np.floor(X_training.shape[0]/mini_batch_size),X.shape,Y.shape)
            #Using Adagrad optimizer
            w_grad , b_grad = gradient(Y,X,w,b)
            G_w += w_grad**2
            G_b += b_grad**2
            
            w = w - lr/(np.sqrt(G_w+eps))*w_grad
            b = b - lr/(np.sqrt(G_b+eps))*b_grad
        
        # Collect the accuracy & loss of "training" data
        #print(Y_training ,sigmoid_linearinput(X_training,w,b),sum(np.round(np.abs(sigmoid_linearinput(X_training,w,b)-Y_training ))))
        Train_loss = CrossEntorpy(Y_training , sigmoid_linearinput(X_training,w,b))/X_training.shape[0]
        Train_accu = accuracy(Y_training , sigmoid_linearinput(X_training,w,b))
        Training_loss.append(Train_loss)
        Training_accuracy.append(Train_accu) 
        
        # Collect the accuracy & loss of "Validate" data
        Valid_loss = CrossEntorpy(Y_validate , sigmoid_linearinput(X_validate,w,b))/X_validate.shape[0]
        Valid_accu = accuracy(Y_validate , sigmoid_linearinput(X_validate,w,b))
        Validate_loss.append(Valid_loss)
        Validate_accuracy.append(Valid_accu)
        
        print("Epoch:",i,"Training_loss:",Train_loss,"Training_accuracy:",Train_accu,"Validate_loss:",Valid_loss,"Validate_accuracy:",Valid_accu)
        
    return w , b
        
            


In [80]:
trianing(100, Y_training, Y_validate, X_training, X_validate)

Epoch: 0 Training_loss: 0.5984313682173168 Training_accuracy: [70.84783944] Validate_loss: 0.5950621473034323 Validate_accuracy: [71.43383708]
Epoch: 1 Training_loss: 0.5782915035077285 Training_accuracy: [72.74626254] Validate_loss: 0.5745570403006697 Validate_accuracy: [73.29524512]
Epoch: 2 Training_loss: 0.5664343575261304 Training_accuracy: [73.85214008] Validate_loss: 0.5625368459951716 Validate_accuracy: [74.58532989]
Epoch: 3 Training_loss: 0.5580443322848533 Training_accuracy: [74.67335654] Validate_loss: 0.5540414875424599 Validate_accuracy: [75.41467011]
Epoch: 4 Training_loss: 0.5515565349449641 Training_accuracy: [75.22015155] Validate_loss: 0.5474697327296255 Validate_accuracy: [75.98599337]
Epoch: 5 Training_loss: 0.5462653001823712 Training_accuracy: [75.67888593] Validate_loss: 0.5421041696229181 Validate_accuracy: [76.40987836]
Epoch: 6 Training_loss: 0.5417934469168163 Training_accuracy: [76.0208888] Validate_loss: 0.5375632093469993 Validate_accuracy: [76.96277184]


(array([ 1.48428895e-01, -1.48252702e-03,  1.31926080e-01, -4.95816872e-02,
         9.93718844e-03, -4.04987931e-02, -4.20090152e-03,  4.21593933e-02,
         6.82640142e-03, -3.27556999e-02, -3.03038580e-03,  3.36662479e-02,
        -1.39522106e-02,  7.35141134e-02,  2.16723419e-02,  3.11962595e-02,
         1.93146764e-02,  3.97153284e-03,  7.88060866e-03, -3.54925393e-02,
         2.55670097e-02,  3.34545935e-02, -3.29765630e-04,  2.11986473e-02,
         4.51960439e-02,  7.14113820e-03, -1.20058883e-02,  2.15634703e-02,
        -1.16831322e-02,  1.19459754e-02, -1.18353383e-02, -2.17833804e-02,
         9.18925846e-03,  7.76686116e-02, -9.36820047e-03,  4.05385767e-02,
         5.84375641e-02, -1.47403514e-02,  4.24084609e-02,  1.32256896e-02,
        -1.69746473e-03,  3.92235573e-02,  2.65957488e-02,  2.52174501e-02,
        -2.17169669e-02, -2.20080424e-02, -6.81842257e-03,  7.34280416e-03,
        -5.16983460e-03, -4.41858560e-03,  2.84910410e-02, -3.99861439e-02,
        -8.9