<a href="https://colab.research.google.com/github/matthewreader/continuous-learning/blob/main/projects/manning-liveprojects/spam-sms-bert/Spam_SMS_Detection_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1 - A cell containing imports of all the libraries.
!pip install transformers
!pip install pytorch_lightning

import pandas as pd
import regex as re
from google.colab import drive
import sklearn
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from google.colab import data_table
data_table.enable_dataframe_formatter()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# 2 - A cell containing code for mounting your Google Drive in Colab.
# Anchor the data directory to the absolute mount point so that reads keep
# working even if the notebook's working directory is changed (e.g. via %cd).
GDRIVE_MOUNT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT)
GDRIVE_DATA = GDRIVE_MOUNT + "/My Drive/Colab Notebooks/Data/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# 3 - A cell containing code for reading the data using pandas.
# Build the CSV location from the data directory, load it, and preview the
# first rows.
csv_path = GDRIVE_DATA + "Khilnani_LP_spam_detection_data.csv"
data = pd.read_csv(csv_path)
data.head(20)

Unnamed: 0.1,Unnamed: 0,SMS,label
0,0,"Go until jurong point, crazy.. Available only ...",0
1,1,Ok lar... Joking wif u oni...\n,0
2,2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,3,U dun say so early hor... U c already then say...,0
4,4,"Nah I don't think he goes to usf, he lives aro...",0
5,5,FreeMsg Hey there darling it's been 3 week's n...,1
6,6,Even my brother is not like to speak with me. ...,0
7,7,As per your request 'Melle Melle (Oru Minnamin...,0
8,8,WINNER!! As a valued network customer you have...,1
9,9,Had your mobile 11 months or more? U R entitle...,1


In [None]:
# 4 - Cells containing code for writing a function to clean the text data, 
# and then applying that function on the dataset.
# Materialize the English stop words as a set once, so membership tests in
# clean_sms are constant-time lookups.
_english_stopwords = stopwords.words('english')
STOPWORDS = set(_english_stopwords)

def clean_sms(sms, stop_words=None):
    """Normalize a raw SMS message before tokenization.

    The text is lower-cased, every whitespace run (including newlines and
    tabs) is collapsed to a single space, all characters other than ASCII
    letters and spaces are deleted, and stop words are dropped.

    Args:
        sms: Raw message text. Non-string values (for example the NaN that
            pandas uses for missing cells) yield an empty string.
        stop_words: Optional collection of lower-case words to remove.
            Defaults to the module-level ``STOPWORDS`` set.

    Returns:
        The cleaned message, with words separated by single spaces.
    """
    if not isinstance(sms, str):
        return ''
    if stop_words is None:
        stop_words = STOPWORDS
    sms = sms.lower().strip()
    # Turn newlines/tabs into spaces first; otherwise the next substitution
    # would delete them and glue the neighbouring words together.
    sms = re.sub(r'\s+', ' ', sms)
    sms = re.sub(r'[^a-zA-Z ]+', '', sms)
    return ' '.join(word for word in sms.split() if word not in stop_words)
# Clean every message and store the result next to the raw text.
cleaned_messages = data["SMS"].apply(clean_sms)
data["SMS Clean"] = cleaned_messages
data.head(20)

Unnamed: 0.1,Unnamed: 0,SMS,label,SMS Clean
0,0,"Go until jurong point, crazy.. Available only ...",0,go jurong point crazy available bugis n great ...
1,1,Ok lar... Joking wif u oni...\n,0,ok lar joking wif u oni
2,2,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry wkly comp win fa cup final tkts st ...
3,3,U dun say so early hor... U c already then say...,0,u dun say early hor u c already say
4,4,"Nah I don't think he goes to usf, he lives aro...",0,nah dont think goes usf lives around though
5,5,FreeMsg Hey there darling it's been 3 week's n...,1,freemsg hey darling weeks word back id like fu...
6,6,Even my brother is not like to speak with me. ...,0,even brother like speak treat like aids patent
7,7,As per your request 'Melle Melle (Oru Minnamin...,0,per request melle melle oru minnaminunginte nu...
8,8,WINNER!! As a valued network customer you have...,1,winner valued network customer selected receiv...
9,9,Had your mobile 11 months or more? U R entitle...,1,mobile months u r entitled update latest colou...


In [None]:
# 5 - Cells containing code for splitting the data into train and validation
# sets using sklearn.
x = data["SMS Clean"].values
y = data["label"].values

# Fix the seed so the split is reproducible across runs, and stratify on the
# label so train and validation keep the same class proportions.
train, val, train_labels, val_labels = train_test_split(
    x, y, random_state=42, stratify=y
)

In [None]:
# 6 - Cells containing code for loading the pre-trained DistilBertTokenizer and
# tokenizing the data.

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are encoded with identical settings: PyTorch tensors, padding
# enabled, and truncation to at most 64 tokens.
encode_options = dict(return_tensors="pt", padding=True, truncation=True, max_length=64)
train_tokens = tokenizer(list(train), **encode_options)
val_tokens = tokenizer(list(val), **encode_options)

train_tokens

{'input_ids': tensor([[  101,  8840,  2140,  ...,     0,     0,     0],
        [  101, 20996, 10258,  ...,     0,     0,     0],
        [  101,  5796,  2290,  ...,     0,     0,     0],
        ...,
        [  101,  7929,   102,  ...,     0,     0,     0],
        [  101,  7696,  3042,  ...,     0,     0,     0],
        [  101,  7632,  4830,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# 7 - A cell containing the code for DataModule class built in PyTorch lightning.
BATCH_SIZE = 32


class ClassificationData(pl.LightningDataModule):
    """Lightning data module serving the tokenized train/validation splits.

    Each dataset item is built from three tensors: input ids, attention mask
    and label. Labels are read from the module-level ``train_labels`` and
    ``val_labels`` arrays.

    Args:
        train_tokens: Tokenizer output for the training texts; must provide
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        val_tokens: Tokenizer output for the validation texts, same layout.
        batch_size: Number of samples per batch (defaults to ``BATCH_SIZE``).
        device: Device the tensors are moved to. When omitted, CUDA is used
            if it is available and the CPU otherwise.
    """

    def __init__(self, train_tokens, val_tokens, batch_size=BATCH_SIZE, device=None):
        super().__init__()

        if device is None:
            # Resolve the device here instead of relying on a global
            # ``device`` variable having been defined before this class runs.
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        trn = [
            train_tokens["input_ids"].to(device),
            train_tokens["attention_mask"].to(device),
            torch.tensor(train_labels).to(device),
        ]
        val = [
            val_tokens["input_ids"].to(device),
            val_tokens["attention_mask"].to(device),
            torch.tensor(val_labels).to(device),
        ]

        # Reshuffle the training samples every epoch; validation order stays
        # fixed.
        self.trn = DataLoader(TensorDataset(*trn), batch_size=batch_size, shuffle=True)
        self.val = DataLoader(TensorDataset(*val), batch_size=batch_size)

    def train_dataloader(self):
        """Return the training ``DataLoader``."""
        return self.trn

    def val_dataloader(self):
        """Return the validation ``DataLoader``."""
        return self.val

# Build the data module and pull one training batch as a quick sanity check.
dls = ClassificationData(train_tokens=train_tokens, val_tokens=val_tokens)
first_batch_iter = iter(dls.train_dataloader())
next(first_batch_iter)

[tensor([[  101,  8840,  2140,  ...,     0,     0,     0],
         [  101, 20996, 10258,  ...,     0,     0,     0],
         [  101,  5796,  2290,  ...,     0,     0,     0],
         ...,
         [  101,  2699,  3967,  ...,     0,     0,     0],
         [  101,  2590,  2592,  ...,     0,     0,     0],
         [  101,  9805,  2361,  ...,     0,     0,     0]], device='cuda:0'),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 tensor([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 1, 1, 1, 0], device='cuda:0')]