THE PURPOSE OF THIS PROJECT IS TO BUILD AN LSTM MODEL THAT READS SMS TEXT MESSAGES AND CLASSIFIES EACH ONE AS EITHER HAM OR SPAM.
DATA PREPROCESSING IS DONE WITH TORCHTEXT.

In [None]:
# Necessary imports 
import pandas as pd
import numpy as np
import torch
from torch import nn,optim
import torchtext
from torchtext import data

In [None]:
# Load the tab-separated SMS collection into a DataFrame. Column names are
# supplied explicitly, so pandas treats the first line of the file as data.
df = pd.read_csv(
    "C:\\Datasets\\smsspamcollection\\SMSSpamCollection",
    sep='\t',
    names=["label", "message"],
)

In [None]:
# Preview the first rows to confirm the label/message columns loaded as expected.
df.head()

In [None]:
# Character count of every message, stored as a new 'length' column.
df['length'] = df['message'].map(len)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of message lengths, drawn on a wide 20x8 figure.
plt.figure(figsize=(20, 8))
sns.histplot(x='length', data=df, color='black')
plt.show()

In [None]:
# Print column dtypes and non-null counts for the DataFrame.
df.info()

In [None]:
# TEXT describes how the message column is tokenized and batched;
# LABEL describes how the ham/spam column is turned into a float target.
import spacy  # torchtext's 'spacy' tokenizer needs spaCy and its English model installed

# Pass the tokenizer by name so torchtext builds a str -> list[str] tokenizer;
# handing it the loaded spaCy pipeline would yield Token objects, not strings,
# and rebinding the name `spacy` would shadow the module.
# include_lengths=True makes `batch.text` a (tokens, lengths) pair, which the
# training code unpacks and the model uses to pack the padded sequences.
TEXT = data.Field(tokenize='spacy', tokenizer_language='en',
                  batch_first=True, include_lengths=True)
# dtype must be the torch dtype object itself (torch.float), not a call to it.
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

In [None]:
# Column order in the file is label first, message second.
fields = [("type", LABEL), ("text", TEXT)]
# The file is tab-separated and has no header row (see the pandas read above,
# which passes sep='\t' and supplies the column names itself), so parse it as
# TSV and keep the first line: it is a real sample, not a header.
training_data = data.TabularDataset(
    path="C:\\Datasets\\smsspamcollection\\SMSSpamCollection",
    format='tsv',
    fields=fields,
    skip_header=False,
)

In [None]:
# Split the dataset into 75% training and 25% validation examples.
import random

# The split is a method of the dataset, not of the `random` module. Seed the
# generator and hand over its state so the shuffle is reproducible.
random.seed(101)
train_data, valid_data = training_data.split(split_ratio=0.75,
                                             random_state=random.getstate())


In [None]:
# Build the vocabularies from the training split only, so no validation
# tokens leak in; words seen fewer than 5 times map to the unknown token.
TEXT.build_vocab(train_data, min_freq=5)
LABEL.build_vocab(train_data)

# Device-agnostic setup. is_available must be *called*: the bare function
# object is always truthy and would select 'cuda' even on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64

# BucketIterator groups examples of similar length to minimise padding.
# `splits` (plural) builds one iterator per dataset. sort_within_batch=True
# orders each batch by descending length, as packed sequences expect.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=batch_size,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device,
)

# LSTM MODEL :: THE IDEA OF EMBEDDINGS+LSTM

In [None]:
class TextClassifier(nn.Module):
    """Embedding + (optionally bidirectional) LSTM binary text classifier.

    Token ids are embedded, run through an LSTM as a packed sequence so the
    padding positions are skipped, and the final hidden state of the top
    layer is mapped through a linear layer and a sigmoid.

    Args:
        vocabsize: Number of rows in the embedding table.
        emb_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output units.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability applied between stacked LSTM layers.
    """

    def __init__(self, vocabsize, emb_dim, hidden_dim, output_dim,
                 num_layers, bidirectional, dropout):
        # nn.Module must be initialised before sub-modules can be registered.
        super().__init__()
        # Attribute name kept as-is for compatibility; it holds the embedding layer.
        self.embedding_dim = nn.Embedding(vocabsize, emb_dim)
        # batch_first=True because forward() receives (batch, seq) token ids.
        # Inter-layer dropout is meaningless with one layer and only triggers
        # a warning there, so it is disabled in that case.
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if num_layers > 1 else 0.0,
                            batch_first=True)
        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        # Sigmoid turns the logits into probabilities in [0, 1].
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, textlength):
        """Return probabilities of shape (batch, output_dim).

        Args:
            text: Integer tensor of token ids, shape (batch, seq_len).
            textlength: 1-D tensor with the unpadded length of each sequence.
        """
        embedded = self.embedding_dim(text)
        # Pack so the LSTM stops at each sequence's true length instead of
        # consuming padding. Lengths must live on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, textlength.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden_state, _) = self.lstm(packed_embedded)
        # hidden_state is (num_layers * num_directions, batch, hidden_dim),
        # regardless of batch_first. The last entries belong to the top layer.
        if self.lstm.bidirectional:
            # Concatenate the top layer's forward and backward final states.
            hidden = torch.cat((hidden_state[-2], hidden_state[-1]), dim=1)
        else:
            hidden = hidden_state[-1]
        dense_output = self.fc(hidden)
        return self.sigmoid(dense_output)
        
    

# HYPERPARAMETERS

In [None]:
# Settings for the classifier constructed in the training section below.
SIZE_OF_VOCAB = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100              # size of each token embedding
NUM_HIDDEN_DIMS = 64             # LSTM hidden size
NUM_OUTPUT_DIMS = 1              # single output unit
NUM_LAYERS = 2                   # stacked LSTM layers
BIDIRECTION = True               # run the LSTM in both directions
DROPOUT = 0.2                    # presumably the LSTM dropout probability; confirm at the call site


# TRAINING AND TESTING OUR MODEL

In [None]:
# Build the classifier from the hyperparameters defined above.
model = TextClassifier(SIZE_OF_VOCAB, EMBEDDING_DIM, NUM_HIDDEN_DIMS,
                       NUM_OUTPUT_DIMS, NUM_LAYERS, BIDIRECTION, DROPOUT)
# Adam over all trainable weights.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# The model already applies a sigmoid, so plain BCELoss (not the
# with-logits variant) is the matching criterion.
criterion = nn.BCELoss()


In [None]:
# function that returns the accuracy
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    Args:
        preds: Tensor of probabilities; values are rounded to 0 or 1.
        y: Tensor of 0/1 targets with the same number of elements as ``preds``.

    Returns:
        A 0-d float tensor holding the accuracy in [0, 1].
    """
    # Flatten both sides so a (batch, 1) prediction cannot silently broadcast
    # against a (batch,) target, and so a 0-d input is handled too.
    rounded_preds = torch.round(preds).reshape(-1)
    correct = (rounded_preds == y.reshape(-1)).float()
    return correct.mean()
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean loss and accuracy.

    Args:
        model: Module called as ``model(text, textlength)``.
        iterator: Sized iterable of batches. Each batch exposes ``text`` as a
            ``(tokens, lengths)`` pair and ``type`` as the target tensor.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, targets)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches, or
        ``(0.0, 0.0)`` when the iterator is empty.
    """
    num_batches = len(iterator)
    if num_batches == 0:
        return 0.0, 0.0

    epoch_acc = 0.0
    epoch_loss = 0.0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text, textlength = batch.text
        # Drop only the trailing output dimension; a bare squeeze() would also
        # collapse a batch of size one and break the loss shape check.
        predictions = model(text, textlength).squeeze(-1)
        loss = criterion(predictions, batch.type)
        loss.backward()
        acc = binary_accuracy(predictions, batch.type)
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    # Return after the loop so every batch contributes to the epoch.
    return epoch_loss / num_batches, epoch_acc / num_batches
def evaluate(model, iterator, optimizer, criterion):
    """Evaluate the model over an iterator without updating its weights.

    Args:
        model: Module called as ``model(text, textlength)``.
        iterator: Sized iterable of batches. Each batch exposes ``text`` as a
            ``(tokens, lengths)`` pair and ``type`` as the target tensor.
        optimizer: Unused; kept so existing callers keep working.
        criterion: Loss called as ``criterion(predictions, targets)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches, or
        ``(0.0, 0.0)`` when the iterator is empty.
    """
    num_batches = len(iterator)
    if num_batches == 0:
        return 0.0, 0.0

    epoch_loss = 0.0
    epoch_acc = 0.0
    model.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            text, textlength = batch.text
            # Drop only the trailing output dimension so a batch of size one
            # keeps its batch axis.
            predictions = model(text, textlength).squeeze(-1)
            loss = criterion(predictions, batch.type)
            acc = binary_accuracy(predictions, batch.type)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    # Return after the loop so every batch contributes to the averages.
    return epoch_loss / num_batches, epoch_acc / num_batches
            
            

In [None]:
epochs = 20
for epoch in range(epochs):
    # Train on the training batches and evaluate on the held-out batches;
    # each split has its own iterator.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, optimizer, criterion)
    # Accuracies are fractions in [0, 1]; scale them to match the '%' sign.
    print(f"The train loss is {train_loss}|the train accuracy is {train_acc * 100:.2f}%")
    print(f"The validation loss is {valid_loss}|the validation accuracy is {valid_acc * 100:.2f}%")