In [1]:
from IPython.display import display, HTML
from keras.src.ops import dtype

display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [4]:
data.head()

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0


### Let's divide the training and test set into two partitions

In [5]:
from sklearn.model_selection import train_test_split

X = data[['text']]
y = data[['label']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [6]:
print(X_train.shape)
print(X_train.head())
print(X_test.shape)
print(X_test.head())
print(y_train.shape)
print(y_test.shape)


(800, 1)
                                                  text
29   ----------- REGARDS, MR NELSON SMITH.KINDLY RE...
535  I have not been able to reach oscar this am. W...
695  ; Huma Abedin B6I'm checking with Pat on the 5...
557  I can have it announced here on Monday - can't...
836      BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...
(200, 1)
                                                  text
521  Dear Sir=2C I wish you go through this offer t...
737  To take your mind off the Balkans for a second...
740                       Pls keep the updates coming!
660  </STRONG><STRONG>CHRIST BETHEL HOSPITAL<BR>11 ...
411  sbwhoeopFriday February 5 2010 7:11 AMHRe: Bra...
(800, 1)
(200, 1)


## Data Preprocessing

In [7]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [8]:
import re
import html

def remove_html_javascript(text):
    # Remove <script> or <style> blocks
    text = re.sub(r'<(script|style).*?>.*?</\1>', '', text, flags=re.DOTALL)

    #Unescape HTML entities (convert &nbsp; → space, &amp; → &, etc.)
    text = html.unescape(text)

    # Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove remaining HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    return text

X_train['text'] = X_train['text'].apply(remove_html_javascript)
X_test['text'] = X_test['text'].apply(remove_html_javascript)

In [9]:
X_train.head()

Unnamed: 0,text
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE..."
535,I have not been able to reach oscar this am. W...
695,; Huma Abedin B6I'm checking with Pat on the 5...
557,I can have it announced here on Monday - can't...
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [10]:
import re

def clean_text(text):
    # Remove all special characters except letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Remove numbers (digits \d)
    text = re.sub(r'\d+', ' ', text)

    # Remove all single characters (isolated letters)
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Remove single characters from the start
    text = re.sub(r'^[a-zA-Z]\s+', '', text)

    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove prefixed 'b' (e.g., b'text')
    text = re.sub(r"^b\s+", '', text)
    text = re.sub(r"^b'(.*)'$", r'\1', text)
    text = re.sub(r'^b"(.*)"$', r'\1', text)

    # Convert to lowercase
    text = text.lower().strip()

    return text

X_train['text'] = X_train['text'].apply(clean_text)
X_test['text'] = X_test['text'].apply(clean_text)

In [11]:
print(X_train.head)

<bound method NDFrame.head of                                                   text
29   regards mr nelson smith kindly reply me on my ...
535  have not been able to reach oscar this am we a...
695  huma abedin checking with pat on the will work...
557     can have it announced here on monday can today
836  bank of africaagence san pedro bp san pedro co...
..                                                 ...
106  adama ibrahim tout savoir sur la curit de votr...
270              what does that mean for our schedules
860  dear friend my compliment to you guess this le...
435  dear president fdirector my name is mr micheal...
102  let me know if today or tomorrow works for you...

[800 rows x 1 columns]>


## Now let's work on removing stopwords
Remove the stopwords.

In [12]:
stop_words = stopwords.words("english")

def clean_stopwords(text):
    words = [w for w in text.split() if w not in stop_words]
    return ' '.join(words)

X_train['text'] = X_train['text'].apply(clean_stopwords)
X_test['text'] = X_test['text'].apply(clean_stopwords)

In [13]:
print(X_train.head)
print(X_test.head)

<bound method NDFrame.head of                                                   text
29   regards mr nelson smith kindly reply private e...
535         able reach oscar supposed send pdb receive
695  huma abedin checking pat work jack jake rest a...
557                             announced monday today
836  bank africaagence san pedro bp san pedro cote ...
..                                                 ...
106  adama ibrahim tout savoir sur la curit de votr...
270                                     mean schedules
860  dear friend compliment guess letter may come s...
435  dear president fdirector name mr micheal ipenz...
102  let know today tomorrow works would rather fin...

[800 rows x 1 columns]>
<bound method NDFrame.head of                                                   text
521  dear sir wish go offer consider partner ei mr ...
737  take mind balkans second see great plug global...
740                            pls keep updates coming
660  christ bethel hospital rue abo

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [14]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

X_train['text'] = X_train['text'].apply(lemmatizer.lemmatize)
X_test['text'] = X_test['text'].apply(lemmatizer.lemmatize)

In [15]:

print("X_train_lemmatized")
print(X_train.head())
print("X_test_lemmatized")
print(X_test.head())

X_train_lemmatized
                                                  text
29   regards mr nelson smith kindly reply private e...
535         able reach oscar supposed send pdb receive
695  huma abedin checking pat work jack jake rest a...
557                             announced monday today
836  bank africaagence san pedro bp san pedro cote ...
X_test_lemmatized
                                                  text
521  dear sir wish go offer consider partner ei mr ...
737  take mind balkans second see great plug global...
740                            pls keep updates coming
660  christ bethel hospital rue abobote abidjanivor...
411  sbwhoeopfriday february amhre bravo brava issu...


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [16]:
from collections import Counter
import pandas as pd

def get_top_n_words(text_series, n=10):
    all_text = ' '.join(text_series)
    words = all_text.split()
    word_counts = Counter(words)
    return word_counts.most_common(n)

# Convert X_train and y_train to DataFrame
df_train = pd.concat([X_train['text'], y_train['label']], axis=1)
df_test = pd.concat([X_test['text'], y_test['label']], axis=1)
df_all = pd.concat([df_train, df_test])

# Top 10 words in ham messages
top_10_ham = get_top_n_words(df_all[df_all['label'] == 0]['text'])
top_10_spam = get_top_n_words(df_all[df_all['label'] == 1]['text'])

top_10_ham = pd.DataFrame(top_10_ham, columns=['words', 'count'])
top_10_spam = pd.DataFrame(top_10_spam, columns=['words', 'count'])

print("Top 10 Ham Words:")
print(top_10_ham)
print("Top 10 Spam Words:")
print(top_10_spam)



Top 10 Ham Words:
       words  count
0         pm    127
1      state    113
2      would    107
3  president     98
4         mr     86
5      obama     84
6    percent     81
7       call     78
8  secretary     76
9         us     76
Top 10 Spam Words:
         words  count
0        money    981
1      account    836
2         bank    780
3           us    734
4  transaction    538
5     business    514
6      country    475
7         fund    474
8      million    439
9     transfer    421


## Extra features

In [17]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

X_train['money_mark'] = X_train['text'].str.contains(money_simbol_list)*1
X_train['suspicious_words'] = X_train['text'].str.contains(suspicious_words)*1
X_train['text_len'] = X_train['text'].apply(lambda x: len(x))

X_test['money_mark'] = X_test['text'].str.contains(money_simbol_list)*1
X_test['suspicious_words'] = X_test['text'].str.contains(suspicious_words)*1
X_test['text_len'] = X_test['text'].apply(lambda x: len(x))

X_train.head()

Unnamed: 0,text,money_mark,suspicious_words,text_len
29,regards mr nelson smith kindly reply private e...,0,0,80
535,able reach oscar supposed send pdb receive,0,0,42
695,huma abedin checking pat work jack jake rest a...,0,0,76
557,announced monday today,0,0,22
836,bank africaagence san pedro bp san pedro cote ...,1,1,1062


## How would work the Bag of Words with Count Vectorizer concept?

In [18]:
# Your code

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [19]:
# Your code

## And the Train a Classifier?

In [20]:
# Your code

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [21]:
# Your code