# Done by Mohammad Navid Nayyem

## Import Libraries and Downloads for SMS Text Processing and Bag-of-Words (BoW) Model

In [7]:
import pandas as pd  # Required for handling and analyzing data in tabular format
import re  # Required for working with regular expressions
import nltk  # Required for natural language processing tasks like tokenization, stemming, and more.
from nltk.tokenize import word_tokenize # Required for tokenization
from nltk.corpus import stopwords # Required for removing the stopword
nltk.download('stopwords') # Download NLTK stopwords
nltk.download('punkt') # Download NLTK punctuations
from sklearn.feature_extraction.text import CountVectorizer # Required for training BoW model

[nltk_data] Downloading package stopwords to C:\Users\Mohammad
[nltk_data]     Navid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Mohammad
[nltk_data]     Navid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Define Functions

In [8]:
# Define functions for each data preprocessing step

def load_data():
    # B1. Load the dataset
    data_SMS = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=["Class", "SMS_Text"])
    #print(data_SMS)
    return data_SMS

def preprocess_text(data_SMS):
    # B2. Perform case normalization (lower case) to the text data
    data_SMS['PREPROCESSED_SMS'] = data_SMS['SMS_Text'].str.lower()
    #print(data_SMS['PREPROCESSED_SMS'])
    return data_SMS

def remove_punctuations_digits(data_SMS):
    # B3. Remove the punctuations and digits from the text data
    data_SMS['PREPROCESSED_SMS'] = data_SMS['PREPROCESSED_SMS'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
    #print(data_SMS['PREPROCESSED_SMS'])
    return data_SMS

def tokenize_text(data_SMS):
    # B4. Tokenize the text data
    data_SMS['PREPROCESSED_SMS'] = data_SMS['PREPROCESSED_SMS'].apply(word_tokenize)
    #print(data_SMS['PREPROCESSED_SMS'])
    return data_SMS

def remove_stopwords(data_SMS):
    # B5. Remove the stopwords from text data
    StopwordsEnglish = stopwords.words('english')
    data_SMS['StopwordsRemoved'] = data_SMS['PREPROCESSED_SMS'].apply(lambda x: [word for word in x if word not in StopwordsEnglish])
    #print(data_SMS['StopwordsRemoved'])
    return data_SMS

def train_bow_model(data_SMS):
    # B6. Train the BoW model
    SMS_vectorizer = CountVectorizer()
    SMS_bow = SMS_vectorizer.fit_transform(data_SMS['StopwordsRemoved'].apply(lambda x: ' '.join(x)))
    SMS_bow.toarray()
    return SMS_bow, SMS_vectorizer

def print_feature_names(SMS_vectorizer):
    # B7. Print out the names of the features
    print(SMS_vectorizer.get_feature_names_out())

def convert_bow_matrix_to_df(SMS_bow, SMS_vectorizer):
    # B8. Transform BoW matrix into Pandas DataFrame
    bow_df = pd.DataFrame(SMS_bow.toarray(), columns=SMS_vectorizer.get_feature_names_out())
    print(bow_df)
    return bow_df

## Main Function Call

In [9]:
# Main function to execute the data preprocessing and transformation steps
def main():
    data_SMS = load_data()
    preprocessed_data = preprocess_text(data_SMS)
    punctuations_digits_removed_data=remove_punctuations_digits(preprocessed_data)
    tokenized_data = tokenize_text(punctuations_digits_removed_data)
    stopword_removed_data = remove_stopwords(tokenized_data)
    SMS_bow, SMS_vectorizer = train_bow_model(stopword_removed_data)
    print_feature_names(SMS_vectorizer)
    bow_df = convert_bow_matrix_to_df(SMS_bow, SMS_vectorizer)
    
if __name__ == '__main__':
    main()

['aa' 'aah' 'aaniye' ... 'zouk' 'zs' 'zyada']
      aa  aah  aaniye  aaooooright  aathi  ab  abbey  abdomen  abeg  abel  \
0      0    0       0            0      0   0      0        0     0     0   
1      0    0       0            0      0   0      0        0     0     0   
2      0    0       0            0      0   0      0        0     0     0   
3      0    0       0            0      0   0      0        0     0     0   
4      0    0       0            0      0   0      0        0     0     0   
...   ..  ...     ...          ...    ...  ..    ...      ...   ...   ...   
5567   0    0       0            0      0   0      0        0     0     0   
5568   0    0       0            0      0   0      0        0     0     0   
5569   0    0       0            0      0   0      0        0     0     0   
5570   0    0       0            0      0   0      0        0     0     0   
5571   0    0       0            0      0   0      0        0     0     0   

      ...  zeros  zf  zhong  