<a href="https://colab.research.google.com/github/phaniparsa/NLP/blob/main/sms_classification_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import string
import pandas as pd

import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn import naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import hstack, csr_matrix

In [None]:
# Mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load dataset
df_sms = pd.read_csv('drive/My Drive/Master\'s in NLP/Natural Language Processing Concepts/sms_classification_project/spamdata.csv')

In [None]:
df_sms.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
print("Number of SMSs in the dataset: ", len(df_sms))
print("Spam vs Ham Frequency Distribution: ")
print(df_sms["label"].value_counts(dropna=False))

Number of SMSs in the dataset:  5572
Spam vs Ham Frequency Distribution: 
ham     4825
spam     747
Name: label, dtype: int64


- The dataset contains a total of 5572 SMSs.
- The dataset does not contain any Null values under the label column.
- The dataset is imbalanced with spam samples being the under-represented class.

# Data Cleaning

In [11]:
# Creating Punctuations String for SMS text Cleaning
punctuations = string.punctuation

# Creating Spacy English Class Object
nlp = English()

def clean_sms_text(sms_text):
    # Converting the SMS text to lower case 
    sms_text = sms_text.lower()
    
    # Removing Punctuation Characters from SMS text
    sms_text = "".join(char for char in sms_text if char not in punctuations)
    
    # Tokenizing SMS text 
    my_doc = nlp(sms_text)
    token_list = []
    for token in my_doc:
        token_list.append(token.text)
    
    # Removing Stop words from SMS Text
    filtered_sentence =[] 

    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word)
    
    # Joining the tokens to obtain the SMS Text
    sms_text = " ".join(filtered_sentence)
    
    return sms_text

In [13]:
df_sms["cleaned_text"] = df_sms["text"].apply(clean_sms_text)

In [14]:
df_sms.head()

Unnamed: 0,label,text,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


# Feature Engineering

In [15]:
df_sms["sms_text_num_of_words"] = df_sms["text"].apply(lambda x : len(x.split()))

df_sms["cleaned_sms_text_num_of_words"] = df_sms["cleaned_text"].apply(lambda x : len(x.split()))
df_sms["cleaned_sms_text_num_of_chars"] = df_sms["cleaned_text"].apply(lambda x : len(x))
df_sms["cleaned_sms_text_num_of_chars_without_spaces"] = df_sms["cleaned_text"].apply(lambda x : len(x.replace(" ","")))
df_sms["cleaned_sms_text_num_of_digits"] = df_sms["cleaned_text"].apply(lambda x :  sum([1 if w.isdigit() else 0 for w in x.split()]))

In [16]:
df_sms.head()

Unnamed: 0,label,text,cleaned_text,sms_text_num_of_words,cleaned_sms_text_num_of_words,cleaned_sms_text_num_of_chars,cleaned_sms_text_num_of_chars_without_spaces,cleaned_sms_text_num_of_digits
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


In [17]:
pos_dic = {"noun" : ["NNP", "NN", "NNS", "NNPS"], "verb" : ["VBZ", "VB", "VBD","VBG", "VBN"]}

nlp = spacy.load("en_core_web_sm")

def count_pos(sms_text, family):

    # spacy document
    my_doc = nlp(sms_text)
    
    all_tags = []

    # Get pos tag
    for word in my_doc:
        all_tags.append(word.tag_)
    
    count = 0

    # Count number of nouns and verbs
    for tag in all_tags:
        if tag in pos_dic[family]:
            count += 1

    return count

In [18]:
df_sms["noun_count"] = df_sms["cleaned_text"].apply(lambda x : count_pos(x, "noun"))
df_sms["verb_count"] = df_sms["cleaned_text"].apply(lambda x : count_pos(x, "verb"))

# Modeling

In [21]:
target = df_sms["label"].values
y = LabelEncoder().fit_transform(target)

In [22]:
X = df_sms[["sms_text_num_of_words", 
            "cleaned_sms_text_num_of_words", 
            "cleaned_sms_text_num_of_chars", 
            "cleaned_sms_text_num_of_chars_without_spaces", 
            "cleaned_sms_text_num_of_digits", 
            "noun_count", 
            "verb_count"]]

In [23]:
x_train, x_valid, y_train, y_valid = train_test_split(X, y, random_state=20, stratify=y)

In [24]:
# Train and Validatio dataset
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((4179, 7), (4179,)), ((1393, 7), (1393,)))

In [25]:
# Multinomial naive bayes
model = naive_bayes.MultinomialNB()

In [26]:
# Fit model on training data
model.fit(x_train, y_train)

MultinomialNB()

In [27]:
# Prediction on training data
pred_train = model.predict(x_train)

# Prediction on validation data
pred_valid = model.predict(x_valid)

In [28]:
# Training accuracy
accuracy_score(y_train, pred_train)

0.9430485762144054

In [29]:
# Validation accuracy
accuracy_score(y_valid, pred_valid)

0.9382627422828428