In [1]:
import torch
import pandas as pd

# nlp library of Pytorch
from torchtext import data

import warnings as wrn
wrn.filterwarnings('ignore')
SEED = 2021

torch.manual_seed(SEED)
torch.backends.cuda.deterministic = True

file_path = '/Users/mobin/Documents/quant finance Interview/My interviews/Kaggle/'
data_ = pd.read_csv(file_path+'/sms_spam.csv')
data_.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader  # For creating custom datasets and data loading
import spacy  # Import spaCy for natural language processing tasks

nlp = spacy.load("en_core_web_sm")  # Load a pre-trained English language model for tokenization

def tokenizer(text):
    return [token.text for token in nlp(text)]  # Use spaCy to tokenize text into individual words

class SMSDataset(Dataset):
    def __init__(self, data):
        self.data = data  # Store the input data
    
    def __len__(self):
        return len(self.data) 
    
    def __getitem__(self, idx):
        return self.data.iloc[idx]['type'], self.data.iloc[idx]['text']  # Return a single sample (label and text)


file_path = '/Users/mobin/Documents/quant finance Interview/My interviews/Kaggle/'
data_ = pd.read_csv(file_path + '/sms_spam.csv')  
dataset = SMSDataset(data_)  # Instantiate our custom dataset with the loaded data

from collections import Counter 

def build_vocab(dataset, min_freq=2):
    counter = Counter()  # Initialize a counter for word frequencies
    for _, text in dataset:
        counter.update(tokenizer(text))  # Count word occurrences across all texts
    return {word: i+1 for i, (word, count) in enumerate(counter.items()) if count >= min_freq}  # Create word-to-index mapping

vocab = build_vocab(dataset)  # Build the vocabulary from our dataset
vocab['<unk>'] = 0  # Add an 'unknown' token to handle words not in the vocabulary

# Define text pipeline
text_pipeline = lambda x: [vocab.get(word, vocab['<unk>']) for word in tokenizer(x)]  # Convert text to sequence of indices
label_pipeline = lambda x: 1 if x.lower() == 'spam' else 0  # Convert label to binary (1 for spam, 0 for ham)

# Collate function for DataLoader
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))  # Process labels
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)  # Convert text to tensor of indices
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))  # Store length of each sequence
    label_list = torch.tensor(label_list)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)  # Pad sequences to same length
    return label_list.float(), text_list, torch.tensor(lengths)

# Create DataLoader
BATCH_SIZE = 64
train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)  # Create DataLoader for efficient batching

# Print an example
for label, text, lengths in train_dataloader:
    print(f"Label: {label[0].item()}")  # Print the label of the first item in the batch
    print(f"Text: {' '.join([list(vocab.keys())[list(vocab.values()).index(i)] for i in text[0] if i != 0])}")  # Reconstruct and print the text
    print(f"Length: {lengths[0].item()}")  # Print the length of the first sequence
    break  # Only print the first batch

Label: 0.0
Text: Should i send you naughty pix ? :)
Length: 8


In [3]:
# Pytorch's nn module has lots of useful feature
import torch.nn as nn

class LSTMNet(nn.Module):
    
    def __init__(self,vocab_size,embedding_dim,hidden_dim,output_dim,n_layers,bidirectional,dropout):
        
        super(LSTMNet,self).__init__()
        
        # Embedding layer converts integer sequences to vector sequences
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        
        # LSTM layer process the vector sequences 
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = dropout,
                            batch_first = True
                           )
        
        # Dense layer to predict 
        self.fc = nn.Linear(hidden_dim * 2,output_dim)
        # Prediction activation function
        self.sigmoid = nn.Sigmoid()
        
    
    def forward(self,text,text_lengths):
        embedded = self.embedding(text)
        
        # Thanks to packing, LSTM don't see padding tokens 
        # and this makes our model better
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(),batch_first=True)
        
        packed_output,(hidden_state,cell_state) = self.lstm(packed_embedded)
        
        # Concatenating the final forward and backward hidden states
        hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
        
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.sigmoid(dense_outputs)
        
        return outputs

In [4]:
SIZE_OF_VOCAB = len(TEXT.vocab)
EMBEDDING_DIM = 100
NUM_HIDDEN_NODES = 64
NUM_OUTPUT_NODES = 1
NUM_LAYERS = 2
BIDIRECTION = True
DROPOUT = 0.2
model = LSTMNet(SIZE_OF_VOCAB,
                EMBEDDING_DIM,
                NUM_HIDDEN_NODES,
                NUM_OUTPUT_NODES,
                NUM_LAYERS,
                BIDIRECTION,
                DROPOUT
               )
import torch.optim as optim
model = model.to(device)
optimizer = optim.Adam(model.parameters(),lr=1e-4)
criterion = nn.BCELoss()
criterion = criterion.to(device)
model

NameError: name 'TEXT' is not defined