In [322]:
import torch
import torchvision.transforms as transforms
import pandas as pd
import torch.nn as nn

In [336]:
class ModelCheckpoint:

    def __init__(self, filepath, model):
        self.min_loss = None
        self.filepath = filepath
        self.model = model

    def update(self, loss):
        if (self.min_loss is None) or (loss < self.min_loss):
            print("Saving a better model")
            torch.save(self.model.state_dict(), self.filepath)
            self.min_loss = loss
            
            
model_path = "best_model.pt"

In [337]:
train_valid_dataset = pd.read_csv('BankChurners.csv')  

In [338]:
geo_col = train_valid_dataset['Geography']
c_col = train_valid_dataset['CreditLevel']

d_count = {}
sum_count = {}
for i in range(0, len(geo_col)):
    if geo_col[i] not in d_count:
        d_count[geo_col[i]] = 1
    else:
        d_count[geo_col[i]] = d_count[geo_col[i]] + 1
        
    if geo_col[i] not in sum_count:
        sum_count[geo_col[i]] = c_col[i]
    else:
        sum_count[geo_col[i]] = sum_count[geo_col[i]] + c_col[i]

for item in sum_count:
    a = sum_count[item] / d_count[item]
    
    print(item, " ", a)


Spain   6.385707944962273
France   6.364745011086474
Germany   6.381761287438533


In [339]:
country = []
for item in train_valid_dataset['Geography']:
    if item == 'Spain':
        country.append(1)
    elif item == 'France':
        country.append(2)
    else:
        country.append(3)

In [340]:

sum = 0
count = 0
res = []
for item in train_valid_dataset['Balance']:
    if item != 0:
        count += 1
    sum += item

for item in train_valid_dataset['Balance']:
    if item != 0:
        res.append(item)
    else:
        res.append(sum / count)

In [341]:
train_valid_dataset = train_valid_dataset.drop(columns='CustomerId')
# train_valid_dataset = train_valid_dataset.drop(columns='Geography')
res_col = train_valid_dataset['CreditLevel']
train_valid_dataset = train_valid_dataset.drop(columns='CreditLevel')
#train_valid_dataset['Balance'] = res
train_valid_dataset['Geography'] = country


print(train_valid_dataset['Balance'])

#归一化
train_valid_dataset = (train_valid_dataset - train_valid_dataset.min()) / (train_valid_dataset.max() - train_valid_dataset.min())
train_valid_dataset['CreditLevel'] = res_col - 1


0       121681.82
1            0.00
2       182888.08
3       102278.79
4       109346.13
          ...    
8995         0.00
8996         0.00
8997     98775.23
8998    119654.44
8999    173340.83
Name: Balance, Length: 9000, dtype: float64


In [342]:
valid_ratio = 0.2 

#weight_tensor = torch.tensor([len(train_valid_dataset)/(len(train_valid_dataset)-train_valid_dataset[:,-1].sum()), len(train_valid_dataset)/train_valid_dataset[:,-1].sum()]).float() 

nb_train = int((1.0 - valid_ratio) * len(train_valid_dataset))
nb_valid =  int(valid_ratio * len(train_valid_dataset))
train_dataset, valid_dataset = torch.utils.data.dataset.random_split(train_valid_dataset.to_numpy(), [nb_train, nb_valid])

In [343]:
class DatasetTransformer(torch.utils.data.Dataset):

    def __init__(self, base_dataset, transform=transforms.Lambda(lambda x: x)):
        self.base_dataset = base_dataset
        self.transform = transform

    def __getitem__(self, index):
        inpt, target = torch.from_numpy(self.base_dataset[index][:-1]), self.base_dataset[index][-1]
        return self.transform(inpt).float(), int(target)

    def __len__(self):
        return len(self.base_dataset)


train_dataset = DatasetTransformer(train_dataset)
valid_dataset = DatasetTransformer(valid_dataset)

In [344]:
#Dataloader

batch_size  = 100   # Using minibatches of X samples

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=batch_size,
                                          shuffle=True)

valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                          batch_size=batch_size, 
                                          shuffle=False)


print("The train set contains {} samples, in {} batches".format(len(train_loader.dataset), len(train_loader)))
print("The validation set contains {} samples, in {} batches".format(len(valid_loader.dataset), len(valid_loader)))

The train set contains 7200 samples, in 72 batches
The validation set contains 1800 samples, in 18 batches


In [345]:
#Define device
use_gpu = torch.cuda.is_available()
if use_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [346]:
def linear_relu(dim_in, dim_out):
    return [nn.Linear(dim_in, dim_out),
            nn.ReLU(inplace=True)]

class FullyConnected(nn.Module):

    def __init__(self, input_size, num_classes):
        super(FullyConnected, self).__init__()
        self.classifier =  nn.Sequential(
            #nn.Dropout(0.2),
            *linear_relu(input_size, 14),
#             nn.Dropout(0.5), #Generally 0.2 for the input layer and 0.5 for the hidden layer
            *linear_relu(14, 14),
#             nn.Dropout(0.5),
            *linear_relu(14, 14),
#             nn.Dropout(0.5),
            nn.Linear(14, num_classes)
        )

    def forward(self, x):
        x = x.view(x.size()[0], -1)
        y = self.classifier(x)
        return y


model = FullyConnected(8, 1)
model.to(device)

FullyConnected(
  (classifier): Sequential(
    (0): Linear(in_features=8, out_features=14, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=14, out_features=14, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=14, out_features=14, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=14, out_features=1, bias=True)
  )
)

In [347]:
def train(model, loader, f_loss, optimizer, device):
    """
    Train a model for one epoch, iterating over the loader
    using the f_loss to compute the loss and the optimizer
    to update the parameters of the model.

    Arguments :

        model     -- A torch.nn.Module object
        loader    -- A torch.utils.data.DataLoader
        f_loss    -- The loss function, i.e. a loss Module
        optimizer -- A torch.optim.Optimzer object
        device    -- a torch.device class specifying the device
                     used for computation

    Returns :
    """

    # We enter train mode. This is useless for the linear model
    # but is important for layers such as dropout, batchnorm, ...
    model.train()

    for i, (inputs, targets) in enumerate(loader):
        
        
        inputs, targets = inputs.to(device), targets.to(device)
        # Compute the forward pass through the network up to the loss
        outputs = model(inputs)
        
        loss = f_loss(outputs, targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    

def test(model, loader, f_loss, device):
    """
    Test a model by iterating over the loader

    Arguments :

        model     -- A torch.nn.Module object
        loader    -- A torch.utils.data.DataLoader
        f_loss    -- The loss function, i.e. a loss Module
        device    -- The device to use for computation 

    Returns :

        A tuple with the mean loss, mean accuracy and mean unbiaised accuracy

    """
    # We disable gradient computation which speeds up the computation
    # and reduces the memory usage
    with torch.no_grad():
        # We enter evaluation mode. This is useless for the linear model
        # but is important with layers such as dropout, batchnorm, ..
        model.eval()
        N = 0
        tot_loss, correct, unbiaised_acc = 0.0, 0.0, 0.0
        for i, (inputs, targets) in enumerate(loader):

            # We got a minibatch from the loader within inputs and targets

            # We need to copy the data on the GPU if we use one
            inputs, targets = inputs.to(device), targets.to(device)

            # Compute the forward pass, i.e. the scores for each input
            outputs = model(inputs)

            # We accumulate the exact number of processed samples
            N += inputs.shape[0]

            # We accumulate the loss considering
            # The multipliation by inputs.shape[0] is due to the fact
            # that our loss criterion is averaging over its samples
            tot_loss += inputs.shape[0] * f_loss(outputs, targets).item()

            # For the accuracy, we compute the labels for each input
            # Be carefull, the model is outputing scores and not the probabilities
            # But given the softmax is not altering the rank of its input scores
            # we can compute the label by argmaxing directly the scores
            
#             predicted_targets = outputs.argmax(dim=1)
            predicted_targets = [torch.ceil(item[0]) for item in outputs]
            
            for i in range(0, len(targets)):
                if targets[i] == predicted_targets[i]:
                    correct += 1
                
            #print(correct)
            #correct += (predicted_targets == targets).sum().item()
                
        return tot_loss/N, correct/N, unbiaised_acc/N

In [348]:
epochs = 50
optimizer = torch.optim.Adam(model.parameters())
# f_loss = torch.nn.CrossEntropyLoss()
f_loss = torch.nn.L1Loss()
# f_loss = torch.nn.SmoothL1Loss()
# f_loss = torch.nn.MSELoss(reduction='mean')
model_checkpoint = ModelCheckpoint(model_path, model)

for t in range(epochs):
    print("\nEpoch {}".format(t))
    train(model, train_loader, f_loss, optimizer, device)
    train_loss, train_acc, train_unb_acc = test(model, train_loader, f_loss, device)
    print(" Train : Loss : {:.4f}, Acc : {:.4f}, Unb.Acc. : {:.4f}".format(train_loss, train_acc, train_unb_acc))

    val_loss, val_acc, val_unb_acc = test(model, valid_loader, f_loss, device)
    print(" Validation : Loss : {:.4f}, Acc : {:.4f}, Unb.Acc. : {:.4f}".format(val_loss, val_acc, val_unb_acc))

    model_checkpoint.update(val_loss)


model.load_state_dict(torch.load(model_path))

# Switch to eval mode 
model.eval()

test_loss, test_acc, test_unb_acc = test(model, valid_loader, f_loss, device)
print("\n\n Test : Loss : {:.4f}, Acc. : {:.4f}, Unb.Acc. : {:.4f}".format(test_loss, test_acc, test_unb_acc))


Epoch 0
 Train : Loss : 4.3289, Acc : 0.0331, Unb.Acc. : 0.0000
 Validation : Loss : 4.3114, Acc : 0.0361, Unb.Acc. : 0.0000
Saving a better model

Epoch 1
 Train : Loss : 1.5618, Acc : 0.1906, Unb.Acc. : 0.0000
 Validation : Loss : 1.5662, Acc : 0.1983, Unb.Acc. : 0.0000
Saving a better model

Epoch 2
 Train : Loss : 1.5291, Acc : 0.1900, Unb.Acc. : 0.0000
 Validation : Loss : 1.5335, Acc : 0.1978, Unb.Acc. : 0.0000
Saving a better model

Epoch 3
 Train : Loss : 1.5077, Acc : 0.1981, Unb.Acc. : 0.0000
 Validation : Loss : 1.5122, Acc : 0.1883, Unb.Acc. : 0.0000
Saving a better model

Epoch 4
 Train : Loss : 1.4946, Acc : 0.1994, Unb.Acc. : 0.0000
 Validation : Loss : 1.5001, Acc : 0.2017, Unb.Acc. : 0.0000
Saving a better model

Epoch 5
 Train : Loss : 1.4741, Acc : 0.2081, Unb.Acc. : 0.0000
 Validation : Loss : 1.4812, Acc : 0.2022, Unb.Acc. : 0.0000
Saving a better model

Epoch 6
 Train : Loss : 1.4731, Acc : 0.2047, Unb.Acc. : 0.0000
 Validation : Loss : 1.4775, Acc : 0.2056, Unb.