In [10]:
import numpy as np
from tqdm import tqdm
from scipy.special import logsumexp
import pandas as pd
class MLP():
    
    def __init__(self, din, dout):
        self.W = (2 * np.random.rand(dout, din) - 1) * (np.sqrt(6) / np.sqrt(din + dout))
        self.b = (2 * np.random.rand(dout) - 1) * (np.sqrt(6) / np.sqrt(din + dout))
        
    def forward(self, x): # x.shape = (batch_size, din)
        self.x = x # Storing x for latter (backward pass)
        return x @ self.W.T + self.b

    def backward(self, gradout):
        self.deltaW = gradout.T @ self.x
        self.deltab = gradout.sum(0)
        return gradout @ self.W
    
class SequentialNN():
    
    def __init__(self, blocks: list):
        self.blocks = blocks
        
    def forward(self, x):
        
        for block in self.blocks:
            x = block.forward(x)
  
        return x

    def backward(self, gradout):
        
        for block in self.blocks[::-1]:
            gradout = block.backward(gradout)
            
        return gradout

class ReLU():
    
    def forward(self, x):
        self.x = x
        return np.maximum(0, x)
    
    def backward(self, gradout):
        new_grad = gradout.copy()
        new_grad[self.x < 0] = 0.
        return new_grad
    
class LogSoftmax():
    
    def forward(self, x):
        self.x = x
        return x - logsumexp(x, axis=1)[..., None]
    
    def backward(self, gradout):
        gradients = np.eye(self.x.shape[1])[None, ...]
        gradients = gradients - (np.exp(self.x) / np.sum(np.exp(self.x), axis=1)[..., None])[..., None]
        return (np.matmul(gradients, gradout[..., None]))[:, :, 0]
    
class NLLLoss():
    
    def forward(self, pred, true):
        self.pred = pred
        self.true = true
        
        loss = 0
        for b in range(pred.shape[0]):
            loss -= pred[b, true[b]]
        return loss

    def backward(self):
        din = self.pred.shape[1]
        jacobian = np.zeros((self.pred.shape[0], din))
        for b in range(self.pred.shape[0]):
            jacobian[b, self.true[b]] = -1

        return jacobian # batch_size x din
    
    def __call__(self, pred, true):
        return self.forward(pred, true)
    
class Optimizer():
    
    def __init__(self, lr, compound_nn: SequentialNN):
        self.lr = lr
        self.compound_nn = compound_nn
        
    def step(self):
        
        for block in self.compound_nn.blocks:
            if block.__class__ == MLP:
                block.W = block.W - self.lr * block.deltaW
                block.b = block.b - self.lr * block.deltab
                
def train(model, optimizer, trainX, trainy, loss_fct = NLLLoss(), nb_epochs=14000, batch_size=100):
    training_loss = []
    for epoch in tqdm(range(nb_epochs)):

        # Sample batch size
        batch_idx = [np.random.randint(0, trainX.shape[0]) for _ in range(batch_size)]
        x = trainX[batch_idx]
        target = trainy[batch_idx]

        prediction = model.forward(x) # Forward pass
        loss_value = loss_fct(prediction, target) # Compute the loss
        training_loss.append(loss_value) # Log loss
        gradout = loss_fct.backward()
        model.backward(gradout) # Backward pass

        # Update the weights
        optimizer.step()
    return training_loss

def onehot_encoder(ary, columns=[], remove_trap=False):
    df_results = pd.DataFrame()

    # Iterate each column in DataFrame ary
    for i in range(ary.shape[1]):
        # if this column (i) is dummy column
        if i in columns:
            base_name = ary.columns[i]
            this_column = pd.get_dummies(ary.iloc[:, i])
            this_column = this_column.rename(columns={n:"{}_{}".format(base_name, n) for n in this_column.columns})
            # Remove Dummy Variable Trap if needed
            if remove_trap:
                this_column = this_column.drop(this_column.columns[0], axis=1)
        # else this column is normal column
        else:
            this_column = ary.iloc[:, i]
        # Append this column to the Result DataFrame
        df_results = pd.concat([df_results, this_column], axis=1)

    return df_results



In [6]:
if __name__ == "__main__": 
    import numpy as np
    from sklearn.model_selection import KFold
    
    # Load and process data
    train_df = pd.read_csv('./train_preproceed.csv')
    test_df = pd.read_csv('./test.csv')
    train_df = train_df.sample(n=1000, random_state=42)  # 使用 random_state 以确保可重复性
    test_df['volume'] = np.log(test_df.length.astype('int64') * test_df.width * test_df.height * 1e-6)
    test_df = test_df[['volume', 'area_cluster','model','age_of_car','age_of_policyholder','policy_tenure']]
    
    train_df = onehot_encoder(train_df, columns=[1, 2], remove_trap=True)
    test_df = onehot_encoder(test_df, columns=[1, 2], remove_trap=True)    
    X = train_df.iloc[:, :-1].values
    Y = train_df.iloc[:, -1].values.reshape(-1,1)
    
    print(X.shape)
    print(X_test.shape)
    
    mlp = SequentialNN([MLP(34, 128), ReLU(), 
                        MLP(128, 64), ReLU(), 
                        MLP(64, 2), LogSoftmax()])
    
    optimizer = Optimizer(1e-3, mlp)
    
    kf = KFold(n_splits=3, shuffle=True, random_state=42) 
    
    accuracies = []  # 存储每个fold的准确度
    
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]
        
        training_loss = train(mlp, optimizer, X_train, Y_train)
        
        # Compute validation accuracy
        accuracy = 0
        for i in range(X_val.shape[0]):
            prediction = mlp.forward(X_val[i].reshape(1, 34)).argmax()
            if prediction == Y_val[i]:
                accuracy += 1
        val_accuracy = accuracy / X_val.shape[0] * 100
        accuracies.append(val_accuracy)
    
    # Calculate average accuracy over all folds
    average_accuracy = np.mean(accuracies)
    print('Average validation accuracy:', average_accuracy, '%')


(1000, 34)
(39063, 34)


100%|██████████| 14000/14000 [01:47<00:00, 130.63it/s]
100%|██████████| 14000/14000 [01:18<00:00, 179.38it/s]
100%|██████████| 14000/14000 [00:55<00:00, 253.76it/s]

Average validation accuracy: 61.605917294540056 %





In [7]:
if __name__ == "__main__": 
    import numpy as np
    from sklearn.model_selection import KFold
    
    # Load and process data
    train_df = pd.read_csv('./train_preproceed.csv')
    test_df = pd.read_csv('./test.csv')
    train_df = train_df.sample(n=1000, random_state=42)  # 使用 random_state 以确保可重复性
    test_df['volume'] = np.log(test_df.length.astype('int64') * test_df.width * test_df.height * 1e-6)
    test_df = test_df[['volume', 'area_cluster','model','age_of_car','age_of_policyholder','policy_tenure']]
    
    train_df = onehot_encoder(train_df, columns=[1, 2], remove_trap=True)
    test_df = onehot_encoder(test_df, columns=[1, 2], remove_trap=True)
    X = train_df.iloc[:, :-1].values
    Y = train_df.iloc[:, -1].values.reshape(-1,1)
    
    print(X.shape)
    print(X_test.shape)
    
    mlp = SequentialNN([MLP(34, 128), ReLU(), 
                        MLP(128, 64), ReLU(), 
                        MLP(64, 2), LogSoftmax()])
    
    optimizer = Optimizer(1e-3, mlp)
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42) 
    
    accuracies = []  # 存储每个fold的准确度
    
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]
        
        training_loss = train(mlp, optimizer, X_train, Y_train)
        
        # Compute validation accuracy
        accuracy = 0
        for i in range(X_val.shape[0]):
            prediction = mlp.forward(X_val[i].reshape(1, 34)).argmax()
            if prediction == Y_val[i]:
                accuracy += 1
        val_accuracy = accuracy / X_val.shape[0] * 100
        accuracies.append(val_accuracy)
    
    # Calculate average accuracy over all folds
    average_accuracy = np.mean(accuracies)
    print('Average validation accuracy:', average_accuracy, '%')


(1000, 34)
(39063, 34)


100%|██████████| 14000/14000 [01:00<00:00, 229.53it/s]
100%|██████████| 14000/14000 [01:02<00:00, 225.52it/s]
100%|██████████| 14000/14000 [01:17<00:00, 181.37it/s]
100%|██████████| 14000/14000 [01:49<00:00, 127.96it/s]
100%|██████████| 14000/14000 [01:06<00:00, 211.13it/s]


Average validation accuracy: 63.4 %


In [8]:
if __name__ == "__main__": 
    import numpy as np
    from sklearn.model_selection import KFold
    
    # Load and process data
    train_df = pd.read_csv('./train_preproceed.csv')
    test_df = pd.read_csv('./test.csv')
    train_df = train_df.sample(n=1000, random_state=42)  # 使用 random_state 以确保可重复性
    test_df['volume'] = np.log(test_df.length.astype('int64') * test_df.width * test_df.height * 1e-6)
    test_df = test_df[['volume', 'area_cluster','model','age_of_car','age_of_policyholder','policy_tenure']]
    
    train_df = onehot_encoder(train_df, columns=[1, 2], remove_trap=True)
    test_df = onehot_encoder(test_df, columns=[1, 2], remove_trap=True)
    X = train_df.iloc[:, :-1].values
    Y = train_df.iloc[:, -1].values.reshape(-1,1)
    
    print(X.shape)
    print(X_test.shape)
    
    mlp = SequentialNN([MLP(34, 128), ReLU(), 
                        MLP(128, 64), ReLU(), 
                        MLP(64, 2), LogSoftmax()])
    
    optimizer = Optimizer(1e-3, mlp)
    
    kf = KFold(n_splits=10, shuffle=True, random_state=42) 
    
    accuracies = []  # 存储每个fold的准确度
    
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]
        
        training_loss = train(mlp, optimizer, X_train, Y_train)
        
        # Compute validation accuracy
        accuracy = 0
        for i in range(X_val.shape[0]):
            prediction = mlp.forward(X_val[i].reshape(1, 34)).argmax()
            if prediction == Y_val[i]:
                accuracy += 1
        val_accuracy = accuracy / X_val.shape[0] * 100
        accuracies.append(val_accuracy)
    
    # Calculate average accuracy over all folds
    average_accuracy = np.mean(accuracies)
    print('Average validation accuracy:', average_accuracy, '%')


(1000, 34)
(39063, 34)


100%|██████████| 14000/14000 [01:04<00:00, 216.45it/s]
100%|██████████| 14000/14000 [00:54<00:00, 255.17it/s]
100%|██████████| 14000/14000 [00:44<00:00, 312.21it/s]
100%|██████████| 14000/14000 [00:44<00:00, 317.35it/s]
100%|██████████| 14000/14000 [00:39<00:00, 358.56it/s]
100%|██████████| 14000/14000 [01:11<00:00, 195.74it/s]
100%|██████████| 14000/14000 [01:30<00:00, 154.24it/s]
100%|██████████| 14000/14000 [01:02<00:00, 222.97it/s]
100%|██████████| 14000/14000 [01:00<00:00, 232.45it/s]
100%|██████████| 14000/14000 [00:53<00:00, 263.22it/s]

Average validation accuracy: 72.4 %



