# AutoEncoder with Pytorch

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import torch
from torch import nn
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from tqdm import tqdm_notebook as tqdm

In [2]:
data= pd.read_csv('C:/Users/Aitichou/Desktop/contextual_bandit/Forest_cov.txt', sep=" ", header = None)
data = pd.DataFrame(data)

In [3]:
# calcul du temps d'éxécution 
import time

_start_time = time.time()

def tic():
    global _start_time 
    _start_time = time.time()

def tac():
    t_sec = round(time.time() - _start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))

In [4]:
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [5]:
data.shape

(581012, 101)


Un auto-encodeur est un réseau de neurones artificiels utilisé pour l'apprentissage non supervisé. L'objectif d'un auto-encodeur est d'apprendre une représentation (encodage) d'un ensemble de données, généralement dans le but de réduire la dimension de cet ensemble

<img src="https://raw.githubusercontent.com/udacity/deep-learning/c6b46a0bfcb8d4afcb806174b3923d3ea89ca455/autoencoder/assets/autoencoder_1.png" width="700" height="700" align="center"/>


In [6]:

labels = data[[i for i in tqdm(range(94,101))]]
labels = [labels.iloc[i].idxmax()-94 for i in tqdm(range(len(labels)))]

data = data[[i for i in tqdm(range(94))]]
data[94] = 1


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




HBox(children=(IntProgress(value=0, max=581012), HTML(value='')))




HBox(children=(IntProgress(value=0, max=94), HTML(value='')))




In [7]:
labels[:10]
len(labels)

581012

In [8]:
data.shape

(581012, 95)

In [9]:
data,  _, labels, _ = train_test_split(data, labels, test_size = 0)

In [10]:
labels = pd.DataFrame(labels)
labels.columns=["labels"]

In [11]:
print(data.shape); print(labels.shape)

(581012, 95)
(581012, 1)


In [61]:
data = torch.from_numpy(np.asarray(data))
step = 128
batches = [(data[i:i + step], labels[i:i + step]) for i in range(0, data.shape[0], step)]

In [82]:
class AutoBlock(nn.Module):
    
    ACTIVATIONS = {
        'linear': lambda x: x,
        'relu': F.relu,
        'tanh': torch.tanh,
        'elu': F.elu,
        'leakyrelu': torch.nn.LeakyReLU,
        'sigmoid': torch.sigmoid,
        'softmax': torch.nn.Softmax(dim=None)
    }
    
    def __init__(self, insize, outsize, pdrop=0.2, activation='relu'):
        
        super(AutoBlock, self).__init__()
        
        self.linear = nn.Linear(insize, outsize)
        self.drop = nn.Dropout(pdrop)
        self.activation = AutoBlock.ACTIVATIONS[activation]
        
    def forward(self, x):
        
        return self.activation(self.drop(self.linear(x)))

class Encoder(nn.Module):
    def __init__(self, layers=[95, 60, 30]):
        
        super(Encoder, self).__init__()
        
        n_layers = 0
        for insize, outsize in zip(layers[:-1], layers[1:]):
            setattr(self, 'e%d' % n_layers, AutoBlock(insize[0], outsize[0], **outsize[1]))
            n_layers += 1
        self.n_layers = n_layers
        
    def forward(self, inp):
        
        x = inp
        for i in range(self.n_layers):
            x = getattr(self, 'e%d' % i)(x)
        return x
    
    def encode_until(self, inp, idx):
        x = inp
        for i in range(idx):
            x = getattr(self, 'e%d' % i)(x)
        return x
    
class AutoEncoder(nn.Module):
    
    def __init__(self, encoding_layers=[(95,),(60,), (30,)], decoding_layers=[(30,),(60,),(95,)]):
        
        super(AutoEncoder, self).__init__()
        
        if encoding_layers[0][0] != decoding_layers[-1][0]:
            raise ValueError('Input and output size must be the same')
            
        if decoding_layers[0][0] != encoding_layers[-1][0]:
            raise ValueError('Encoding output shape does not match decoding input shape')
        
        self.encoder = Encoder(layers = encoding_layers)
        self.decoder = Encoder(layers = decoding_layers)
        
    def forward(self, inp):
        return self.decode(self.encode(inp))
    
    def encode(self, inp):
        return self.encoder(inp)
    
    def decode(self, inp):
        return self.decoder(inp)
    

In [97]:
def training_loop(dataframe, nepochs, model, optimizer, scheduler, batch_size=128, train_ratio=0.8, criterion=nn.MSELoss()):
    
    opt = optimizer(model.parameters())
    sch = scheduler(opt)
    
    npdata = np.asarray(dataframe)
    train_size = int(train_ratio * npdata.shape[0])
    
    train_data = torch.from_numpy(npdata[:train_size]).float()
    val_data = torch.from_numpy(npdata[train_size:]).float()
    
    indexes = np.arange(train_data.shape[0])
    
    for epoch in range(nepochs):
        print("Epoch %d / %d" % (epoch + 1, nepochs))
        np.random.shuffle(indexes)
        batches = [train_data[indexes[i:i + batch_size]] for i in range(0, train_size, batch_size)]
        pbar = tqdm(enumerate(batches), total=len(batches), unit='batch')
        running_loss = 0
        for i_batch, batch in pbar:
            
            opt.zero_grad()
            xtrue = batch
            if next(model.parameters()).is_cuda:
              xtrue = xtrue.cuda()
            xpred = model(xtrue)
            loss = criterion(xpred, xtrue)
            loss.backward()
            opt.step()
            
            running_loss = (running_loss * i_batch + float(loss)) / (i_batch + 1)
            pbar.set_description('Loss: %.10f -- AvgLoss: %.10f' % (float(loss), running_loss))

        
        with torch.no_grad():
            model.eval()
            val_loss = 0
            val_acc = 0
            for i_batch, batch in tqdm(enumerate([val_data[i:i + batch_size] for i in range(0, val_data.shape[0], batch_size)])):
                xtrue = batch
                xpred = model(xtrue)
                val_loss = (val_loss * i_batch + float(criterion(xpred, xtrue))) / (i_batch + 1)
                
                xpred[xpred >= 0.7] = 1
                xpred[xpred < 0.7] = 0
                acc = (xpred[xpred == xtrue].nelement() / xpred.nelement()) * 100
                val_acc = (val_acc * i_batch + float(acc)) / (i_batch + 1)
               
            model.train()
        print('Validation loss: %.10f' % val_loss)
        print('Validation acc: %.2f' % val_acc)
        sch.step(val_acc)

            
    
    

In [98]:
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = AutoEncoder(
    encoding_layers = [
        (95,),
        (60, {'activation': 'relu', 'pdrop': 0.2}),
        (30, {'activation': 'sigmoid', 'pdrop': 0})
        
    ],
    decoding_layers = [
        (30,),
        (60, {'activation': 'relu', 'pdrop': 0}),
        (95, {'activation': 'relu', 'pdrop': 0.2})
    ]
)
optimizer = lambda params: optim.Adam(params, lr=0.0001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
scheduler = lambda opt: ReduceLROnPlateau(opt, mode='min',cooldown=4, patience=4, factor=0.5, min_lr=1e-8)

In [99]:
#model = model.cuda()
#next(model.parameters()).is_cuda

In [100]:
training_loop(data, 20, model, optimizer, scheduler, criterion = nn.MSELoss() )

Epoch 1 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0582516131
Validation acc: 90.48
Epoch 2 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0484080298
Validation acc: 91.83
Epoch 3 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0432753038
Validation acc: 92.62
Epoch 4 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0408415029
Validation acc: 93.11
Epoch 5 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0392788544
Validation acc: 93.58
Epoch 6 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0382047844
Validation acc: 93.93
Epoch 7 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0377776995
Validation acc: 94.06
Epoch 8 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0373802519
Validation acc: 94.22
Epoch 9 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0370323476
Validation acc: 94.36
Epoch 10 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0367863551
Validation acc: 94.44
Epoch 11 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0365364280
Validation acc: 94.54
Epoch 12 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0362710007
Validation acc: 94.66
Epoch 13 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0361250964
Validation acc: 94.69
Epoch 14 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0359380125
Validation acc: 94.77
Epoch 15 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0357825395
Validation acc: 94.82
Epoch 16 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0356444189
Validation acc: 94.89
Epoch 17 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0355648690
Validation acc: 94.92
Epoch 18 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0354338394
Validation acc: 94.97
Epoch 19 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0353856995
Validation acc: 94.99
Epoch 20 / 20


HBox(children=(IntProgress(value=0, max=3632), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Validation loss: 0.0353018380
Validation acc: 95.02


## Encodage des données

In [101]:
model.eval()
encoded_data = model.encode(data.float())
model.train()

AutoEncoder(
  (encoder): Encoder(
    (e0): AutoBlock(
      (linear): Linear(in_features=95, out_features=60, bias=True)
      (drop): Dropout(p=0.2)
    )
    (e1): AutoBlock(
      (linear): Linear(in_features=60, out_features=30, bias=True)
      (drop): Dropout(p=0)
    )
  )
  (decoder): Encoder(
    (e0): AutoBlock(
      (linear): Linear(in_features=30, out_features=60, bias=True)
      (drop): Dropout(p=0)
    )
    (e1): AutoBlock(
      (linear): Linear(in_features=60, out_features=95, bias=True)
      (drop): Dropout(p=0.2)
    )
  )
)

In [102]:
encoded_data

tensor([[0.1011, 0.2084, 0.3010,  ..., 0.1170, 0.6535, 0.3910],
        [0.9054, 0.5412, 0.7893,  ..., 0.4202, 0.6394, 0.5902],
        [0.2647, 0.4388, 0.4572,  ..., 0.4817, 0.1893, 0.7084],
        ...,
        [0.4508, 0.4588, 0.9132,  ..., 0.6692, 0.1601, 0.6828],
        [0.2751, 0.1743, 0.1757,  ..., 0.3442, 0.7506, 0.4767],
        [0.5727, 0.3510, 0.7768,  ..., 0.3717, 0.4938, 0.5927]],
       grad_fn=<SigmoidBackward>)

In [103]:
arr = encoded_data.data.cpu().numpy()
# write CSV
pd.DataFrame(np.savetxt('encodedata.csv',arr, delimiter=',',fmt='%1.4e'))

In [104]:
encoded_cov = pd.read_csv('C:/Users/Aitichou/Desktop/auto_encoder/encodedata.csv', sep=",", header = None)

In [105]:
encoded_cov.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.1011,0.2084,0.301,0.27274,0.944,0.72481,0.55207,0.27104,0.75414,0.73615,...,0.35619,0.49834,0.88032,0.6113,0.33137,0.89813,0.12584,0.11701,0.65347,0.39097
1,0.90539,0.54122,0.78925,0.28516,0.10432,0.36006,0.37076,0.39752,0.37923,0.13958,...,0.76342,0.47186,0.20421,0.746,0.24523,0.31252,0.47505,0.42016,0.63939,0.59022
2,0.2647,0.43878,0.45722,0.3757,0.9362,0.36738,0.91309,0.70503,0.69002,0.89619,...,0.33927,0.53375,0.72473,0.25437,0.57744,0.62609,0.41275,0.48173,0.18927,0.70841
3,0.28563,0.79135,0.79968,0.24959,0.29864,0.39132,0.5585,0.86992,0.69891,0.63053,...,0.20174,0.91439,0.30777,0.1638,0.27347,0.65232,0.87682,0.51585,0.11286,0.52895
4,0.12645,0.32377,0.88293,0.031443,0.084051,0.57219,0.7828,0.77925,0.65761,0.34458,...,0.1766,0.83645,0.39714,0.66414,0.52654,0.6344,0.62432,0.39543,0.18541,0.49022


In [106]:
encoded_cov.shape

(581012, 30)

In [107]:
encod_cov_lab= pd.concat([encoded_cov, labels], axis = 1)

In [108]:
encod_cov_lab.to_csv('encod_cov_lab.csv',sep=';', encoding='utf-8')


In [109]:
encod_cov_lab.shape

(581012, 31)

In [110]:
encod_cov_lab.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,labels
0,0.1011,0.2084,0.301,0.27274,0.944,0.72481,0.55207,0.27104,0.75414,0.73615,...,0.49834,0.88032,0.6113,0.33137,0.89813,0.12584,0.11701,0.65347,0.39097,1
1,0.90539,0.54122,0.78925,0.28516,0.10432,0.36006,0.37076,0.39752,0.37923,0.13958,...,0.47186,0.20421,0.746,0.24523,0.31252,0.47505,0.42016,0.63939,0.59022,1
2,0.2647,0.43878,0.45722,0.3757,0.9362,0.36738,0.91309,0.70503,0.69002,0.89619,...,0.53375,0.72473,0.25437,0.57744,0.62609,0.41275,0.48173,0.18927,0.70841,0
3,0.28563,0.79135,0.79968,0.24959,0.29864,0.39132,0.5585,0.86992,0.69891,0.63053,...,0.91439,0.30777,0.1638,0.27347,0.65232,0.87682,0.51585,0.11286,0.52895,0
4,0.12645,0.32377,0.88293,0.031443,0.084051,0.57219,0.7828,0.77925,0.65761,0.34458,...,0.83645,0.39714,0.66414,0.52654,0.6344,0.62432,0.39543,0.18541,0.49022,0


## Décodage des données

In [51]:
# récupérer les données décodées
model.eval()
decoded_data = model.decode(encoded_data.float())
model.train()

AutoEncoder(
  (encoder): Encoder(
    (e0): AutoBlock(
      (linear): Linear(in_features=95, out_features=60, bias=True)
      (drop): Dropout(p=0.4)
    )
    (e1): AutoBlock(
      (linear): Linear(in_features=60, out_features=30, bias=True)
      (drop): Dropout(p=0.25)
    )
  )
  (decoder): Encoder(
    (e0): AutoBlock(
      (linear): Linear(in_features=30, out_features=20, bias=True)
      (drop): Dropout(p=0.4)
    )
    (e1): AutoBlock(
      (linear): Linear(in_features=20, out_features=60, bias=True)
      (drop): Dropout(p=0.25)
    )
    (e2): AutoBlock(
      (linear): Linear(in_features=60, out_features=95, bias=True)
      (drop): Dropout(p=0)
    )
  )
)

In [52]:
decoded_data

tensor([[0.0000, 0.0971, 0.2132,  ..., 0.0000, 0.0000, 0.9998],
        [0.1437, 0.2578, 0.2101,  ..., 0.0000, 0.0000, 1.0001],
        [0.2041, 0.1993, 0.1721,  ..., 0.0000, 0.0000, 1.0001],
        ...,
        [0.2158, 0.2966, 0.1573,  ..., 0.0000, 0.0000, 1.0002],
        [0.1014, 0.2155, 0.2109,  ..., 0.0000, 0.0000, 1.0001],
        [0.8942, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.9995]],
       grad_fn=<ReluBackward0>)

In [181]:
arr2 = decoded_data.data.cpu().numpy()
# write CSV
pd.DataFrame(np.savetxt('decodedata.csv',arr2, delimiter=',',fmt='%1.1e'))

In [146]:
decoded_cov = decoded_data.detach().numpy()
decoded_cov = pd.DataFrame(decoded_cov)

In [147]:
decoded_cov.shape

(581012, 95)

In [148]:
decoded_cov.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,0.0,0.1811,0.158904,0.217313,0.212287,0.0,0.0,0.0,0.758279,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.799998
1,0.0,0.08013,0.547938,0.142227,0.0,0.0,0.0,0.0,0.0,0.687942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.800009
2,0.0,0.246872,0.173428,0.176758,0.134104,0.0,0.0,0.0,0.0,0.792693,...,0.110355,0.075169,0.112369,0.0,0.0,0.0,0.0,0.0,0.0,0.799999
3,0.0,0.177536,0.165615,0.191247,0.245765,0.763259,0.0,0.0,0.0,0.0,...,0.140828,0.187851,0.107857,0.0,0.0,0.0,0.0,0.0,0.0,0.8
4,0.0,0.115127,0.128481,0.262969,0.271259,0.0,0.0,0.0,0.613944,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8
5,0.79216,0.0,0.0,0.0,0.0,0.734493,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.799992
6,0.0,0.091128,0.5228,0.109607,0.0,0.0,0.0,0.0,0.79756,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.800007
7,0.0,0.171305,0.167456,0.149872,0.288268,0.0,0.0,0.653269,0.0,0.0,...,0.031644,0.187267,0.130736,0.0,0.0,0.0,0.0,0.0,0.0,0.8
8,0.0,0.147545,0.177419,0.213183,0.188293,0.731348,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.800001
9,0.0,0.133435,0.153418,0.18024,0.341658,0.0,0.0,0.710588,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8


In [154]:
decoded_cov_lab.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,labels
0,0.0,0.1811,0.158904,0.217313,0.212287,0.0,0.0,0.0,0.758279,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.799998,0
1,0.0,0.08013,0.547938,0.142227,0.0,0.0,0.0,0.0,0.0,0.687942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.800009,1
2,0.0,0.246872,0.173428,0.176758,0.134104,0.0,0.0,0.0,0.0,0.792693,...,0.075169,0.112369,0.0,0.0,0.0,0.0,0.0,0.0,0.799999,1
3,0.0,0.177536,0.165615,0.191247,0.245765,0.763259,0.0,0.0,0.0,0.0,...,0.187851,0.107857,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0
4,0.0,0.115127,0.128481,0.262969,0.271259,0.0,0.0,0.0,0.613944,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,1


In [155]:
decoded_cov_lab.to_csv('decoded_cov_lab.csv',sep=';', encoding='utf-8')
