In [None]:
%pwd

In [1]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import re, string, unicodedata
import nltk
from nltk.corpus import stopwords
from textblob import Word
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def cleaning_smsspam_dataset(df_data):
    # covert uppercase letters to lowercase letters
    df_data['text'] = df_data['text'].apply(lambda x: ' '.join(
        x.lower() for x in x.split()
    ))

    # delete puctuation marks
    df_data['text'] = df_data['text'].str.replace('[^\w\s]', '')

    # delete numbers from texts
    df_data['text'] = df_data['text'].str.replace('\d', '')

    # delete stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    punctuation = list(string.punctuation)
    stop_words.update(punctuation)
    df_data['text'] = df_data['text'].apply(lambda x: ' '.join(
        x for x in x.split() if x not in stop_words
    ))

    # lemmatization and get the roots of the words
    df_data['text'] = df_data['text'].apply(lambda x: ' '.join(
        [Word(word).lemmatize() for word in x.split()]
    ))

    # remove words less than 3 letters
    df_data['text'] = df_data['text'].apply(lambda x: ' '.join(
        [x for x in x.split() if len(x) > 3]
    ))
    
    return df_data


def preprocessing_smsspam_dataset(datafile):
    # load dataset
    df = pd.read_csv(datafile, encoding='ISO-8859-1', 
                   engine='python')
    #rename dataset columns
    df.rename(columns = {"v1": "target", "v2": "text"}, inplace = True)

    #drop unnecessary columns
    df.drop(["Unnamed: 2","Unnamed: 3", "Unnamed: 4"], axis = 1, inplace = True)

    # drop duplicate data
    df.drop_duplicates(inplace = True)

    # cleaning data
    df = cleaning_smsspam_dataset(df)

    df['target'].replace({'ham': 0, 'spam': 1}, inplace=True)

    print("+++ httpparams dataset: +++")
    print("\tNumber of normal requests: ", len(df[df['target'] == 0]))
    print("\tNumber of anomalous requests: ", len(df[df['target'] == 1]))
    print("\tNumber of total requests: ", df.shape[0])
    return df

In [84]:
def smsspam_load_data(test_prob=0.2, max_len=71):
    data_file = './datasets/smsspam/spam.csv'
    data = preprocessing_smsspam_dataset(data_file)

    x_data = data['text'].values
    y_data = data['target'].values

    tokenizer = get_tokenizer(tokenizer='spacy', language='en_core_web_sm')
    
    # print(x_data)
    vocab = build_vocab_from_iterator([tokenizer(text) for text in x_data])

    print(vocab.__len__())
    text_pipeline = lambda x: vocab(tokenizer(x))
    
    temp_x_data = [text_pipeline(text) for text in x_data]
    temp_x_data = [np.asarray(sample, dtype=np.int32) for sample in temp_x_data]
    temp_x_data = [np.pad(sample, (0, max(0, max_len-len(sample))), mode='constant', constant_values=0) for sample in temp_x_data]
    temp_x_data = [sample[:max_len] for sample in temp_x_data]
    x_data = np.asarray(temp_x_data, dtype=np.int32)
    
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_prob,
                                                        shuffle=True, random_state=120124)

    return x_train, y_train, x_test, y_test

# Prepare Dataloader

In [85]:
import torch
from torch.utils.data import TensorDataset, DataLoader


batch_size = 64

x_train, y_train, x_test, y_test = smsspam_load_data()
x_train = torch.from_numpy(x_train)
y_train = torch.from_numpy(y_train)

train_dataset = TensorDataset(x_train, y_train)
data_loader = DataLoader(train_dataset,
                        shuffle=True,
                        batch_size=batch_size)

  df_data['text'] = df_data['text'].str.replace('[^\w\s]', '')
  df_data['text'] = df_data['text'].str.replace('\d', '')


+++ httpparams dataset: +++
	Number of normal requests:  4516
	Number of anomalous requests:  653
	Number of total requests:  5169
6972


# Define BIDIRECT LSTM Model

In [146]:
import torch.nn as nn
import torch


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

class LSTMNet(nn.Module):
    
    def __init__(self, vocab_size=6972, embed_dim=128, hidden_dim=32, nb_classes=2, n_layers=2):
        super(LSTMNet,self).__init__()
        
        self.emded_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        # Embedding layer converts integer sequences to vector sequences
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.emded_dim, padding_idx=0)
        
        # LSTM layer process the vector sequences 
        self.lstm_1 = nn.LSTM(input_size=self.emded_dim,
                            hidden_size=self.hidden_dim,
                            num_layers=self.n_layers,
                            bidirectional=True,
                            dropout=0.2,
                            batch_first=True
                           )
        self.tanh = nn.Tanh()

        self.lstm_2 = nn.LSTM(input_size=self.hidden_dim*2,
                            hidden_size=self.hidden_dim,
                            num_layers=self.n_layers,
                            bidirectional=True,
                            batch_first=True
                           )

        self.fc = nn.Linear(in_features=self.hidden_dim*2, out_features=nb_classes) # 2 for bidirection
        # Prediction activation function
        self.softmax = nn.Softmax(dim=1)
        
    
    def forward(self, x_batch):
        embedded = self.embedding(x_batch)
        # print(embedded.shape)
        hidden_0 = torch.randn(2*self.n_layers, len(x_batch), self.hidden_dim).to(device) # 2 for bidirection
        carry_0 = torch.randn(2*self.n_layers, len(x_batch), self.hidden_dim).to(device)
        # print(hidden_0.shape)
        output, (hidden_1, carry_1) = self.lstm_1(embedded, (hidden_0, carry_0))
        # print(output.shape)
        # print(hidden_1.shape)
        output = self.tanh(output)
        output, (hidden_2, carry_2) = self.lstm_2(output, (hidden_1, carry_1))
        #Final activation function
        output = self.fc(output[:, -1, :])
        output = self.softmax(output)
        
        # print(output.shape)
        return output

In [138]:
import torch

sample_data = torch.randint(0, 6972, (32, 71)).to(device)
model = LSTMNet().to(device)

pytorch_total_params = sum(p.numel() for p in model.parameters())

print(pytorch_total_params)
output = model(sample_data)

546690


# Train Model

In [144]:
from tek4fed.decorator import timer
from torch.optim import Adam
from torch import nn
import gc


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')



model = LSTMNet()
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


@timer
def train():
    model.to(device)
    # set the model_lib in training mode
    model.train()
    
    total_test_loss = 0
    test_correct = 0
    for (x_batch, y_batch) in data_loader:
        # send the input to the device
        (x_batch, y_batch) = (x_batch.to(device),
                                y_batch.long().to(device))
        
        # perform a forward pass and calculate the training loss
        pred = model(x_batch)
        loss = criterion(pred, y_batch)

        # zero out the gradients, perform the backpropagation step,
        # and update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_test_loss = total_test_loss + loss
        test_correct = test_correct + (pred.argmax(1) == y_batch).type(
            torch.float
        ).sum().item()

    avg_test_loss = total_test_loss / len(data_loader)
    test_correct = test_correct / len(x_train)

    results_dict = {
        'loss': avg_test_loss.cpu().detach().item(),
        'accuracy': test_correct
    }


    print(results_dict)
    gc.collect()

In [145]:
for e in range(20):
    print('Epoch: {}'.format(e))
    train()

Epoch: 0
{'loss': 0.49329206347465515, 'accuracy': 0.8727932285368802}
		 Total time taken to train: 1.0474s
Epoch: 1
{'loss': 0.4405587315559387, 'accuracy': 0.8727932285368802}
		 Total time taken to train: 0.8752s
Epoch: 2
{'loss': 0.44076070189476013, 'accuracy': 0.8727932285368802}
		 Total time taken to train: 0.8041s
Epoch: 3
{'loss': 0.44086816906929016, 'accuracy': 0.8727932285368802}
		 Total time taken to train: 0.8158s
Epoch: 4
{'loss': 0.44052964448928833, 'accuracy': 0.8727932285368802}
		 Total time taken to train: 0.7930s
Epoch: 5
{'loss': 0.44065067172050476, 'accuracy': 0.8727932285368802}
		 Total time taken to train: 0.7886s
Epoch: 6
{'loss': 0.44037896394729614, 'accuracy': 0.8727932285368802}
		 Total time taken to train: 0.7974s
Epoch: 7
{'loss': 0.3937194347381592, 'accuracy': 0.9172914147521161}
		 Total time taken to train: 0.7811s
Epoch: 8
{'loss': 0.3393701910972595, 'accuracy': 0.973881499395405}
		 Total time taken to train: 0.7876s
Epoch: 9
{'loss': 0.331

In [92]:
print(torch.cuda.is_available())

True
