In [1]:
import pandas as pd
import numpy as np
import re
#import tldextract
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
import math
import time

In [2]:
# Ensure reproducibility: seed Python's, NumPy's and PyTorch's random number
# generators with the same fixed value.
SEED = 6745

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN to use deterministic algorithms only.
torch.backends.cudnn.deterministic = True

In [3]:
# Load the titles previously exported with `traintitle.to_csv('traintae.csv')`.
# The first CSV column restores the index; the column labelled '0' is the
# Series that is tokenized below.
traintitle = pd.read_csv("traintae.csv", index_col=0)['0']

  mask |= (ar1 == a)


In [4]:


# Fit the vocabulary on every title, then encode each title as a list of
# integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(traintitle.tolist())
word_index = tokenizer.word_index
train_X = tokenizer.texts_to_sequences(traintitle)


In [5]:
# Most of the titles+texts contain very few words, while the longest posts are
# much longer than the average ones. This is probably because most posts only
# contain a title and no other content.
maxlen = 10
# Pad/truncate every sequence to exactly `maxlen` ids so that all inputs have
# the same length (texts_to_matrix would be too large). One call is enough:
# padding the already padded array a second time changes nothing.
train_X = pad_sequences(train_X, maxlen=maxlen)

In [6]:
# Convert the padded id array to an int64 (torch.long) tensor; the ids are
# later used as lookup indices by the encoder's nn.Embedding layers.
train_X = torch.as_tensor(train_X, dtype=torch.long)

In [7]:
# Run on the GPU when CUDA is available (originally a local RTX 2080),
# otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [27]:
# DataLoader settings: mini-batches of 1024 samples. Training batches are
# shuffled, evaluation batches keep the original order.
trn = dict(batch_size=1024, shuffle=True)
evl = dict(batch_size=1024, shuffle=False)

# Mini-batch iterator over the padded training tensor.
trainiter = DataLoader(train_X, **trn)

In [9]:
class EncC(nn.Module):
    """Single-block convolutional encoder for batches of token-id sequences.

    Token and position embeddings are summed, projected to ``hid_dim`` channels
    and passed through one GLU-gated 1-D convolution with a scaled residual
    connection.

    Args:
        input_dim: Number of rows of the token embedding table.
        emb_dim: Size of the token and position embeddings.
        hid_dim: Number of channels used by the convolution.
        kernel_size: Width of the convolution kernel. It is not validated here,
            but with a padding of ``(kernel_size - 1) // 2`` an even width
            changes the sequence length, so the residual addition only lines
            up for odd widths.
        dropout: Dropout probability.
        device: Stored as ``self.device`` for backward compatibility only. The
            forward pass takes the device from its input, so the module keeps
            working after ``.to(...)`` moves it somewhere else.
        maxlen: Number of rows of the position embedding table, i.e. the
            longest sequence length that can be encoded.
    """

    def __init__(self, input_dim,emb_dim,hid_dim,kernel_size,dropout,device,maxlen):
        super().__init__()

        self.device = device

        # sqrt(0.5) rescales the sum of the residual branch and its input.
        # Registered as a non-persistent buffer: it follows the module through
        # .to()/.cuda()/.cpu(), yet stays out of state_dict(), so checkpoints
        # saved from the earlier plain-attribute version still load.
        self.register_buffer('scale', torch.sqrt(torch.FloatTensor([0.5])),
                             persistent=False)

        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(maxlen, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)

        self.conv = nn.Conv1d(in_channels=hid_dim, out_channels=2 * hid_dim,
                              kernel_size=kernel_size,
                              padding=(kernel_size - 1) // 2)

        self.dropout = nn.Dropout(dropout)

    def forward(self, tin):
        """Encode a batch of token ids.

        Args:
            tin: Integer tensor of shape [batch size, seq len]; seq len must
                not exceed the ``maxlen`` given to the constructor.

        Returns:
            Tuple ``(tok_embedded, conved)``: the raw token embeddings
            [batch size, seq len, emb dim] and the encoded sequence
            [batch size, seq len, hid dim].
        """
        t_len = tin.shape[1]

        # Position ids 0..t_len-1, created on the same device as the input.
        pos = torch.arange(t_len, device=tin.device)

        tok_embedded = self.tok_embedding(tin)   # [batch size, seq len, emb dim]
        pos_embedded = self.pos_embedding(pos)   # [seq len, emb dim]

        # Combine the embeddings by element-wise summing; the position
        # embeddings broadcast over the batch dimension.
        embedded = self.dropout(tok_embedded + pos_embedded)

        # Conv1d expects the length in the last dim: [batch size, hid dim, seq len].
        conv_input = self.emb2hid(embedded).permute(0, 2, 1)

        # conved = [batch size, 2 * hid dim, seq len]
        conved = self.conv(self.dropout(conv_input))

        # GLU halves the channel dimension again: [batch size, hid dim, seq len].
        conved = F.glu(conved, dim=1)

        # Scaled residual connection.
        conved = (conved + conv_input) * self.scale

        # Back to [batch size, seq len, hid dim].
        conved = conved.permute(0, 2, 1)

        return tok_embedded, conved

class DecC(nn.Module):
    """Single-block convolutional decoder mapping hidden states back to embedding space.

    The input goes through one GLU-gated 1-D convolution with an (unscaled)
    residual connection and is then projected from ``hid_dim`` to ``emb_dim``.

    Args:
        emb_dim: Size of the reconstructed embeddings.
        hid_dim: Number of channels of the incoming hidden states.
        kernel_size: Width of the convolution kernel; must be odd.
        dropout: Dropout probability.
        device: Accepted but not used by this class.
    """

    def __init__(self, emb_dim,hid_dim,kernel_size,dropout,device):
        super().__init__()

        assert kernel_size % 2 == 1, "Kernel size must be odd!"

        # With an odd kernel this padding keeps the sequence length unchanged.
        same_padding = (kernel_size - 1) // 2

        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        self.conv = nn.Conv1d(hid_dim, 2 * hid_dim, kernel_size, padding=same_padding)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tin):
        """Decode ``tin`` [batch size, seq len, hid dim] into [batch size, seq len, emb dim]."""
        # Conv1d wants channels before length: [batch size, hid dim, seq len].
        channels_first = tin.permute(0, 2, 1)

        # Convolution doubles the channels, GLU gates them back to hid dim.
        gated = F.glu(self.conv(self.dropout(channels_first)), dim=1)

        # Residual connection, still [batch size, hid dim, seq len].
        residual = gated + channels_first

        # Restore [batch size, seq len, hid dim] and project to emb dim.
        return self.hid2emb(residual.permute(0, 2, 1))

class AEC(nn.Module):
    """Autoencoder wrapper that chains an encoder and a decoder.

    Args:
        encoder: Module returning a pair ``(embedded, outputs)`` for an input batch.
        decoder: Module turning the encoder's ``outputs`` into a reconstruction.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, tin):
        """Return ``(embedded, reconstruction)`` for the input batch ``tin``."""
        embedded, hidden = self.encoder(tin)
        return embedded, self.decoder(hidden)

In [10]:
# Model hyper-parameters.
INPUT_DIM = len(word_index) + 1   # every word in the tokenizer vocabulary plus one extra id
OUTPUT_DIM = maxlen
EMB_DIM = 300                     # embedding size
HID_DIM = 100                     # channels of the convolutional blocks
ENC_KERNEL_SIZE = DEC_KERNEL_SIZE = 3
ENC_DROPOUT = DEC_DROPOUT = 0

In [11]:
# Assemble the convolutional autoencoder and move it to the selected device.
enc = EncC(
    input_dim=INPUT_DIM,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    kernel_size=ENC_KERNEL_SIZE,
    dropout=ENC_DROPOUT,
    device=device,
    maxlen=maxlen,
)
dec = DecC(
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    kernel_size=DEC_KERNEL_SIZE,
    dropout=DEC_DROPOUT,
    device=device,
)

modelC = AEC(encoder=enc, decoder=dec).to(device)

In [12]:
# Adam with its default settings, optimizing every parameter of the autoencoder.
optimizer = optim.Adam(modelC.parameters())
# Training objective: mean squared error (default reduction, one scalar per batch).
criterion  = nn.MSELoss()

In [13]:
def train(model, iterator, optimizer, criterion, device, maxlen, clip=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module whose forward pass returns ``(target, reconstruction)``.
        iterator: Iterable of input batches (e.g. a DataLoader). It does not
            need to support ``len()``; batches are counted while iterating.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as ``criterion(reconstruction, target)``.
        device: Device every batch is moved to before the forward pass.
        maxlen: Unused; kept so existing calls keep working.
        clip: Optional maximum gradient norm. When given, gradients are clipped
            with ``torch.nn.utils.clip_grad_norm_`` before each optimizer step.
            The default ``None`` disables clipping (the previous behavior).

    Returns:
        The sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``iterator`` yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    n_batches = 0

    for batch in iterator:
        tin = batch.to(device)

        optimizer.zero_grad()

        # NOTE(review): with the EncC/AEC models in this notebook `target` is
        # the trainable token embedding and is not detached, so gradients also
        # flow into the target itself - confirm this is intended.
        target, recons = model(tin)

        loss = criterion(recons, target)
        loss.backward()

        if clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        raise ValueError("iterator yielded no batches")

    return epoch_loss / n_batches

In [14]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to integers.
    """
    span = end_time - start_time
    whole_mins = int(span / 60)
    # Seconds left over after removing the whole minutes.
    leftover_secs = int(span - (whole_mins * 60))
    return whole_mins, leftover_secs

In [18]:
N_EPOCHS = 3
CLIP = 1  # NOTE(review): defined but never passed to train(), so no clipping happens

for epoch in range(N_EPOCHS):
    # Time one full pass over the training data.
    start_time = time.time()
    train_loss = train(modelC, trainiter, optimizer, criterion,  device, maxlen)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Report duration and mean loss.
    # NOTE(review): "PPL" is exp(loss), but `criterion` is an MSE loss rather
    # than a cross-entropy, so this value is not a perplexity.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

Epoch: 01 | Time: 5m 24s
	Train Loss: 0.025 | Train PPL:   1.026
Epoch: 02 | Time: 5m 25s
	Train Loss: 0.017 | Train PPL:   1.017
Epoch: 03 | Time: 5m 28s
	Train Loss: 0.013 | Train PPL:   1.014


In [20]:
# Checkpoint the model weights after the first training run.
torch.save(modelC.state_dict(), 'aec4.pt')

In [None]:
# Changed the batch size to 1024 in the DataLoader cell, re-ran that cell, and trained for 3 more epochs below.

In [28]:
N_EPOCHS = 3
CLIP = 1  # NOTE(review): defined but never passed to train(), so no clipping happens

# Second training run: continues from the current weights of modelC.
for epoch in range(N_EPOCHS):
    # Time one full pass over the training data.
    start_time = time.time()
    train_loss = train(modelC, trainiter, optimizer, criterion,  device, maxlen)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Report duration and mean loss.
    # NOTE(review): "PPL" is exp(loss), but `criterion` is an MSE loss rather
    # than a cross-entropy, so this value is not a perplexity.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

Epoch: 01 | Time: 1m 31s
	Train Loss: 0.011 | Train PPL:   1.012
Epoch: 02 | Time: 1m 31s
	Train Loss: 0.010 | Train PPL:   1.011
Epoch: 03 | Time: 1m 32s
	Train Loss: 0.010 | Train PPL:   1.010


In [29]:
# Checkpoint the model weights after the second training run.
torch.save(modelC.state_dict(), 'aec7.pt')

In [30]:
# Inference: the per-sample errors have to line up with the rows of the input,
# so the data must not be shuffled - build a separate, ordered loader.
evl = dict(batch_size=1024, shuffle=False)

filteriter = DataLoader(train_X, **evl)

In [31]:
# reduction='none' keeps the element-wise squared errors, so they can be
# summed per sample instead of being averaged over the whole batch.
criterionf  = nn.MSELoss(reduction='none')

In [32]:
def flt(model, iterator, criterion, device, maxlen):
    """Compute the reconstruction error of every sample yielded by ``iterator``.

    Args:
        model: Module whose forward pass returns ``(target, reconstruction)``,
            both 3-D tensors with the batch in the first dimension.
        iterator: Iterable of input batches; iterate it unshuffled so the
            result lines up with the input order.
        criterion: Element-wise loss (e.g. ``nn.MSELoss(reduction='none')``),
            called as ``criterion(target, reconstruction)``. This argument is
            what gets used, not a global criterion.
        device: Device every batch is moved to before the forward pass.
        maxlen: Unused; kept so existing calls keep working.

    Returns:
        1-D float64 NumPy array. Element 0 is a placeholder ``0.0`` (kept for
        backward compatibility - callers drop it with ``[1:]``); the remaining
        elements are the per-sample errors summed over dimensions 1 and 2.
    """
    model.eval()

    # Collect one array per batch and join once at the end; calling np.append
    # inside the loop would re-copy the whole result for every batch.
    chunks = [np.zeros(1)]

    with torch.no_grad():
        for batch in iterator:
            tin = batch.to(device)

            target, recons = model(tin)

            # Sum the element-wise errors over the sequence and feature dims.
            loss = torch.sum(criterion(target, recons), [1, 2])

            chunks.append(loss.cpu().numpy())

    return np.concatenate(chunks)

In [33]:
# Per-sample reconstruction errors for the whole dataset, in dataset order
# (element 0 of the result is a placeholder added by flt).
losst = flt(modelC, filteriter, criterionf, device, maxlen)

In [34]:
# Drop the placeholder first element returned by flt and label each
# per-sample error with the index of the corresponding title.
error = pd.DataFrame(losst[1:], index=traintitle.index.values)

In [35]:
# Index labels of the 50 samples with the largest reconstruction error.
toshow = (
    error
    .sort_values(by=[0], ascending=False)
    .iloc[0:50]
    .index
    .values
)

In [36]:
# Display the titles with the highest reconstruction error.
traintitle.loc[toshow]

318932     akcesoria elektryknet poszukujesz porządnie wy...
1756484    brainwashing begin brainwashing begin real tal...
285304     oryginalna srebrna nasze produkty autorskie po...
2334546    sfesgrtugiyhi rer5t6y7hu8hjy7e##rt6y##ujhy7gt#...
319994     monday development stolica wielkopolski szybko...
232808     nakit ogledlce nakit online ogledalce nudi vel...
296625     http echipamente ortopedice este magazin onlin...
2211127    money spell recieve money account money flow a...
677822     promo code free io app download download tell ...
1105807    decryption challenge dwi##tytyfn8yqv+3mo2i+1we...
1465770    dbhfyjnn dhdrhjf ddhddwe sdd sddss shfgd hsbee...
1694752    kirtasi̇ye kopyalama baski kopyalama sektörün ...
1473989    sha家族 hash algorithm，缩写为sha）是一个密码散列函数家族，是fips所...
1463853    meetup use massively long identifier email ver...
331794     http katalog budowlany budowlane przetargi bud...
1284403    yurtlar evimiz karşılaştığı yurt arama problem...
167662     halı altı cam