# Done by Mohammad Navid Nayyem

## Import Libraries and Downloads for SMS Text Processing and Bag-of-Words (BoW) Model

In [1]:
import pandas as pd  # Required for handling and analyzing data in tabular format
import re  # Required for working with regular expressions
import nltk  # Required for natural language processing tasks like tokenization, stemming, and more.
from nltk.tokenize import word_tokenize # Required for tokenization
from nltk.corpus import stopwords # Required for removing the stopword
nltk.download('stopwords') # Download NLTK stopwords
nltk.download('punkt') # Download NLTK punctuations
from sklearn.feature_extraction.text import CountVectorizer # Required for training BoW model

[nltk_data] Downloading package stopwords to C:\Users\Mohammad
[nltk_data]     Navid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Mohammad
[nltk_data]     Navid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load the dataset

In [2]:
# B1. Load the dataset
data_SMS = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=["Class", "SMS_Text"])
#print(data_SMS)

## Perform case normalization (lower case) to the text data

In [3]:
# B2. Perform case normalization (lower case) to the text data
data_SMS['PREPROCESSED_SMS'] = data_SMS['SMS_Text'].str.lower()
#print(data_SMS['PREPROCESSED_SMS'])

## Remove the punctuations and digits from the text data

In [4]:
# B3. Remove the punctuations and digits from the text data
data_SMS['PREPROCESSED_SMS'] = data_SMS['PREPROCESSED_SMS'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
#print(data_SMS['PREPROCESSED_SMS'])

## Tokenize the text data

In [5]:
# B4. Tokenize the text data
data_SMS['PREPROCESSED_SMS'] = data_SMS['PREPROCESSED_SMS'].apply(word_tokenize)
#print(data_SMS['PREPROCESSED_SMS'])

## Remove the stopwords from text data

In [6]:
# B5. Remove the stopwords from text data
StopwordsEnglish = stopwords.words('english')
data_SMS['StopwordsRemoved'] = data_SMS['PREPROCESSED_SMS'].apply(lambda x: [word for word in x if word not in StopwordsEnglish])
#print(data_SMS['StopwordsRemoved'])

## Train the BoW model

In [7]:
# B6. Train the BoW model
SMS_vectorizer = CountVectorizer()
SMS_bow = SMS_vectorizer.fit_transform(data_SMS['StopwordsRemoved'].apply(lambda x: ' '.join(x)))
SMS_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Print out the names of the features

In [8]:
# B7. Print out the names of the features
print(SMS_vectorizer.get_feature_names_out())

['aa' 'aah' 'aaniye' ... 'zouk' 'zs' 'zyada']


## Transform BoW matrix into Pandas DataFrame

In [9]:
# B8. Transform BoW matrix into Pandas DataFrame
bow_df = pd.DataFrame(SMS_bow.toarray(), columns=SMS_vectorizer.get_feature_names_out())
bow_df

Unnamed: 0,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,abeg,abel,...,zeros,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zs,zyada
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
