In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv", encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [4]:
# Your code
from sklearn.model_selection import train_test_split

# Assuming 'label' is the target column based on the context of SPAM/HAM classification
X_train, X_val, y_train, y_val = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2, # 80% train, 20% validation
    random_state=42, 
    stratify=data['label'] # Important for balanced classes in NLP/classification
)

# Recombine into DataFrames for easier feature engineering later
data_train = pd.DataFrame({'text': X_train, 'label': y_train}).reset_index(drop=True)
data_val = pd.DataFrame({'text': X_val, 'label': y_val}).reset_index(drop=True)

print(f"Training set shape: {data_train.shape}")
print(f"Validation set shape: {data_val.shape}")

Training set shape: (800, 2)
Validation set shape: (200, 2)


## Data Preprocessing

In [5]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [6]:
# Your code
import re

def clean_html(text):
    # 1. Remove inline JavaScript/CSS (anything inside <script> and <style> tags)
    text = re.sub(r'<script\b[^>]*>.*?</script>', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<style\b[^>]*>.*?</style>', '', text, flags=re.IGNORECASE | re.DOTALL)
    
    # 2. Remove html comments
    text = re.sub(r'', '', text, flags=re.DOTALL)
    
    # 3. Remove the remaining tags (e.g., <div>, <p>, <a>)
    text = re.sub(r'<[^>]*>', '', text)
    
    return text

# Apply the cleaning to both datasets
data_train['text_cleaned'] = data_train['text'].apply(clean_html)
data_val['text_cleaned'] = data_val['text'].apply(clean_html)

print(data_train[['text', 'text_cleaned']].head())

                                                text  \
0  Dear=2C Good day hope fine=2Cdear am writting ...   
1  FROM MR HENRY KABORETHE CHIEF AUDITOR INCHARGE...   
2                                           Will do.   
3  FROM THE DESK OF DR.ADAMU  ISMALERAUDITING AND...   
4  Dear Friend, My name is LOI C.ESTRADA,The wife...   

                                        text_cleaned  
0  Dear=2C Good day hope fine=2Cdear am writting ...  
1  FROM MR HENRY KABORETHE CHIEF AUDITOR INCHARGE...  
2                                           Will do.  
3  FROM THE DESK OF DR.ADAMU  ISMALERAUDITING AND...  
4  Dear Friend, My name is LOI C.ESTRADA,The wife...  


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [7]:
# Your code
def normalize_text(text):
    # Convert to Lowercase
    text = text.lower()
    
    # Remove prefixed 'b' (often seen in raw byte strings, though maybe not needed here)
    text = re.sub(r"^b\\'", '', text)

    # Remove all the special characters (keeping only letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Remove numbers
    text = re.sub(r'\d+', ' ', text)
    
    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  

    # Remove single characters from the start
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text) 

    return text

from re import sub # Import re's sub function directly for brevity

data_train['preprocessed_text'] = data_train['text_cleaned'].apply(normalize_text)
data_val['preprocessed_text'] = data_val['text_cleaned'].apply(normalize_text)

print(data_train[['text_cleaned', 'preprocessed_text']].head())

                                        text_cleaned  \
0  Dear=2C Good day hope fine=2Cdear am writting ...   
1  FROM MR HENRY KABORETHE CHIEF AUDITOR INCHARGE...   
2                                           Will do.   
3  FROM THE DESK OF DR.ADAMU  ISMALERAUDITING AND...   
4  Dear Friend, My name is LOI C.ESTRADA,The wife...   

                                   preprocessed_text  
0  dear good day hope fine cdear am writting this...  
1  from mr henry kaborethe chief auditor incharge...  
2                                            will do  
3  from the desk of dr adamu ismalerauditing and ...  
4  dear friend my name is loi estrada the wife of...  


## Now let's work on removing stopwords
Remove the stopwords.

In [8]:
# Your code
def remove_stopwords(text):
    # Tokenize the text (split into words)
    words = text.split()
    
    # Get the English stopwords list
    stop_words = set(stopwords.words("english"))
    
    # Remove stopwords
    words = [w for w in words if w not in stop_words]
    
    # Rejoin the words into a single string
    return " ".join(words)

data_train['preprocessed_text'] = data_train['preprocessed_text'].apply(remove_stopwords)
data_val['preprocessed_text'] = data_val['preprocessed_text'].apply(remove_stopwords)

print(data_train['preprocessed_text'].head())

0    dear good day hope fine cdear writting mail du...
1    mr henry kaborethe chief auditor inchargeforei...
2                                                     
3    desk dr adamu ismalerauditing accounting manag...
4    dear friend name loi estrada wife mr josephest...
Name: preprocessed_text, dtype: object


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [9]:
# Your code
def stem_text(text):
    # Tokenize the text
    words = text.split()
    
    # Apply stemming
    stemmed_words = [snowball.stem(w) for w in words]
    
    # Rejoin the words
    return " ".join(stemmed_words)

data_train['preprocessed_text'] = data_train['preprocessed_text'].apply(stem_text)
data_val['preprocessed_text'] = data_val['preprocessed_text'].apply(stem_text)

print(data_train['preprocessed_text'].head())

0    dear good day hope fine cdear writ mail due re...
1    mr henri kaboreth chief auditor inchargeforeig...
2                                                     
3    desk dr adamu ismaleraudit account manag bank ...
4    dear friend name loi estrada wife mr josephest...
Name: preprocessed_text, dtype: object


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [10]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

# 1. Separate data by label
spam_text = " ".join(data_train[data_train['label'] == 1]['preprocessed_text'])
ham_text = " ".join(data_train[data_train['label'] == 0]['preprocessed_text'])

# 2. Use CountVectorizer to get word counts
vectorizer = CountVectorizer(stop_words=None, max_features=5000) # Simple counts

# Fit on all text combined
vectorizer.fit([spam_text, ham_text])

# Transform and get feature names
spam_counts = vectorizer.transform([spam_text])
ham_counts = vectorizer.transform([ham_text])
feature_names = vectorizer.get_feature_names_out()

# 3. Get top 10 words for Spam
spam_word_counts = pd.Series(spam_counts.toarray()[0], index=feature_names)
top_10_spam = spam_word_counts.nlargest(10)
print("--- Top 10 Words in SPAM Messages ---")
print(top_10_spam)

# 4. Get top 10 words for Ham
ham_word_counts = pd.Series(ham_counts.toarray()[0], index=feature_names)
top_10_ham = ham_word_counts.nlargest(10)
print("\n--- Top 10 Words in HAM Messages ---")
print(top_10_ham)

--- Top 10 Words in SPAM Messages ---
money       756
account     701
bank        685
fund        606
us          597
transact    437
transfer    433
busi        415
countri     397
foreign     394
dtype: int64

--- Top 10 Words in HAM Messages ---
presid     97
state      97
work       97
call       94
would      92
mr         85
obama      82
percent    80
time       73
one        69
dtype: int64


## Extra features

In [11]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","â‚¬",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

Unnamed: 0,text,label,text_cleaned,preprocessed_text,money_mark,suspicious_words,text_len
0,Dear=2C Good day hope fine=2Cdear am writting ...,1,Dear=2C Good day hope fine=2Cdear am writting ...,dear good day hope fine cdear writ mail due re...,1,1,905
1,FROM MR HENRY KABORETHE CHIEF AUDITOR INCHARGE...,1,FROM MR HENRY KABORETHE CHIEF AUDITOR INCHARGE...,mr henri kaboreth chief auditor inchargeforeig...,0,1,1710
2,Will do.,0,Will do.,,0,0,0
3,FROM THE DESK OF DR.ADAMU ISMALERAUDITING AND...,1,FROM THE DESK OF DR.ADAMU ISMALERAUDITING AND...,desk dr adamu ismaleraudit account manag bank ...,1,1,347
4,"Dear Friend, My name is LOI C.ESTRADA,The wife...",1,"Dear Friend, My name is LOI C.ESTRADA,The wife...",dear friend name loi estrada wife mr josephest...,1,1,1304


## How would work the Bag of Words with Count Vectorizer concept?

In [12]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit and transform the training data
X_train_bow = count_vectorizer.fit_transform(data_train['preprocessed_text'])

# Transform the validation data using the SAME fitted vectorizer
X_val_bow = count_vectorizer.transform(data_val['preprocessed_text'])

print("--- CountVectorizer Shape ---")
print(f"Training BOW shape: {X_train_bow.shape}")
print(f"Validation BOW shape: {X_val_bow.shape}")

--- CountVectorizer Shape ---
Training BOW shape: (800, 26335)
Validation BOW shape: (200, 26335)


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [13]:
# Your code
# TfidfVectorizer was already imported in Cell 3

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(data_train['preprocessed_text'])

# Transform the validation data using the SAME fitted vectorizer
X_val_tfidf = tfidf_vectorizer.transform(data_val['preprocessed_text'])

print("--- TfidfVectorizer Shape ---")
print(f"Training TF-IDF shape: {X_train_tfidf.shape}")
print(f"Validation TF-IDF shape: {X_val_tfidf.shape}")

--- TfidfVectorizer Shape ---
Training TF-IDF shape: (800, 26335)
Validation TF-IDF shape: (200, 26335)


## And the Train a Classifier?

In [14]:
# Your code
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Classifier must be MultinomialNB with default parameters
nb_classifier = MultinomialNB()

# Train the model using TF-IDF features
nb_classifier.fit(X_train_tfidf, data_train['label'])

# Predict on the validation set
y_val_pred = nb_classifier.predict(X_val_tfidf)

# Evaluate performance
accuracy = accuracy_score(data_val['label'], y_val_pred)

print("--- MultinomialNB (TF-IDF) Performance ---")
print(f"Validation Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(data_val['label'], y_val_pred))

--- MultinomialNB (TF-IDF) Performance ---
Validation Accuracy: 0.9350

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.94       112
           1       0.87      1.00      0.93        88

    accuracy                           0.94       200
   macro avg       0.94      0.94      0.93       200
weighted avg       0.94      0.94      0.94       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [15]:
# Your code
from scipy.sparse import hstack
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# --- Option 4: TF-IDF + Extra Flags (The best combination for the extra task) ---

# 1. Vectorize text (already done in Cell 25, just reuse the matrices)
# X_train_tfidf, X_val_tfidf

# 2. Extract extra features
# Note: Extra features were created in Cell 21. We access them here.
extra_features_train = data_train[['money_mark', 'suspicious_words', 'text_len']]
extra_features_val = data_val[['money_mark', 'suspicious_words', 'text_len']]

# 3. Horizontally stack the TF-IDF matrix with the extra features
X_train_combined = hstack([X_train_tfidf, extra_features_train.values])
X_val_combined = hstack([X_val_tfidf, extra_features_val.values])

# 4. Train the Multinomial Naive Bayes Classifier
nb_classifier_combined = MultinomialNB()
nb_classifier_combined.fit(X_train_combined, data_train['label'])

# 5. Predict and Evaluate
y_val_pred_combined = nb_classifier_combined.predict(X_val_combined)
accuracy_combined = accuracy_score(data_val['label'], y_val_pred_combined)

print("--- Performance (TF-IDF + Extra Flags) ---")
print(f"Validation Accuracy (Combined Features): {accuracy_combined:.4f}")
print("\nClassification Report (Combined Features):")
print(classification_report(data_val['label'], y_val_pred_combined))

--- Performance (TF-IDF + Extra Flags) ---
Validation Accuracy (Combined Features): 0.6100

Classification Report (Combined Features):
              precision    recall  f1-score   support

           0       1.00      0.30      0.47       112
           1       0.53      1.00      0.69        88

    accuracy                           0.61       200
   macro avg       0.77      0.65      0.58       200
weighted avg       0.79      0.61      0.57       200

