In [59]:
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [60]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import drive


In [61]:
# Mount Google Drive under /content/drive/ so the training CSV stored in MyDrive can be read.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [62]:
## Read Data for the Fraudulent Email Kaggle Challenge
DATA_PATH = "/content/drive/MyDrive/Lab Procesing/kg_train.csv"

# Reduce the training set to speed up development.
# Modify for final system (None reads every row of the file).
N_ROWS = 1000

# `nrows` stops parsing after the first N_ROWS records, so the whole file is
# never loaded just to be truncated afterwards.
data = pd.read_csv(DATA_PATH, encoding='latin-1', nrows=N_ROWS)

# Replace missing values with empty strings so the text-cleaning functions
# always receive `str`. A non-mutating fillna is used instead of
# `inplace=True` on a `head()` slice, which can raise SettingWithCopyWarning.
data = data.fillna("")
print(data.shape)

(1000, 2)


### Let's divide the dataset into training and test partitions

In [63]:
from sklearn.model_selection import train_test_split

In [64]:
# Show the column names of the loaded frame to confirm which columns hold the text and the label.
print(data.columns)

Index(['text', 'label'], dtype='object')


In [65]:

# Features are the raw message text; the target is the label column.
X, y = data["text"], data["label"]

# Now split the dataset: 80/20, fixed seed, class proportions preserved.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (800,)
Test set size: (200,)


## Data Preprocessing

In [66]:
import nltk
# Fetch the NLTK stopword corpus; `stopwords.words('english')` used below reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
# Imported in this cell because the notebook otherwise first imports
# `stopwords` in the following cell, which makes this line raise NameError on a
# fresh "Restart & Run All".
from nltk.corpus import stopwords

# English stopwords as a set for O(1) membership tests during filtering.
stop_words = set(stopwords.words('english'))

In [68]:
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Peek at the characters treated as punctuation and at a slice of the English stopword list.
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance.
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now we have to strip the HTML code out of the text

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [69]:
import re

def clean_html(text):
    """Strip HTML markup from ``text`` and collapse whitespace.

    Steps, in order:
      1. Drop ``<script>`` / ``<style>`` elements together with their contents.
      2. Drop HTML comments (done before generic tag removal because a
         comment may itself contain ``>``).
      3. Drop every remaining tag, keeping the text between tags.
      4. Collapse runs of whitespace to one space and trim the ends.

    Args:
        text: Raw message body (``str``).

    Returns:
        The message text with markup removed.
    """
    # 1. Remove inline JavaScript/CSS. IGNORECASE is needed because HTML tag
    #    names are case-insensitive: without it the contents of <SCRIPT>/<STYLE>
    #    blocks survive and pollute the vocabulary. `\b` stops the pattern from
    #    firing on tags that merely start with "script"/"style".
    text = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>',
        '',
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # 2. Remove HTML comments (they can contain ">")
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # 3. Remove remaining HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # 4. Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [70]:
def clean_text(text):
    """Normalise ``text`` to lowercase alphabetic words separated by single spaces.

    Steps, in order:
      1. Drop a leading ``b`` that is directly followed by a quote, i.e. the
         prefix left by ``str(bytes_obj)`` (``b'...'``).
      2. Remove every character that is not an ASCII letter or whitespace
         (this also removes digits).
      3. Remove every stand-alone single letter.
      4. Collapse whitespace, lowercase and trim.

    Args:
        text: Input string (HTML already stripped).

    Returns:
        The cleaned string.
    """
    # 1. Byte-literal prefix. This must run before punctuation is stripped:
    #    once the quote is gone, "b'free" becomes "bfree" and the prefix can
    #    no longer be told apart from the word.
    text = re.sub(r'^b(?=[\'"])', '', text)

    # 2. Keep ASCII letters and whitespace only (digits are removed here too).
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 3. Remove isolated single letters. Word boundaries are used instead of
    #    surrounding whitespace so that consecutive single letters ("a b c")
    #    and a single letter at either end of the text are all removed.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # 4. Collapse whitespace, lowercase, trim.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

In [71]:
# Clean every message: strip HTML, normalise the text, then drop stopwords.
# The stopword filter is written inline, using the `stop_words` set built in an
# earlier cell, because `remove_stopwords` is only defined in a later cell;
# calling it here raises NameError on a fresh "Restart & Run All".
data['clean_text'] = (
    data['text']
    .apply(clean_html)
    .apply(clean_text)
    .apply(lambda text: ' '.join(word for word in text.split() if word not in stop_words))
)

## Now let's work on removing stopwords
Remove the stopwords.

In [72]:
# Typo fix: this set used to be bound to `top_words`, a name nothing reads,
# while the function below looks up `stop_words`.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stop_words`` removed.

    Tokens are compared as-is (case-sensitive) and the kept tokens are
    re-joined with single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [73]:
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused for every message.
lemmatizer = WordNetLemmatizer()


def apply_lemmatization(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join them with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

In [74]:
# Download the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download('wordnet')
# Open Multilingual WordNet data; presumably required alongside 'wordnet' by the installed NLTK version — TODO confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [75]:
# Lemmatize the cleaned text; note this overwrites the 'clean_text' column in place.
data['clean_text'] = data['clean_text'].apply(apply_lemmatization)


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [77]:
# Cleaned messages per class: label 0 is treated as ham, label 1 as spam.
ham_texts = data.loc[data['label'] == 0, 'clean_text']
spam_texts = data.loc[data['label'] == 1, 'clean_text']


In [78]:
# Bag-of-words counter that also drops scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')


In [79]:
# Fit the vocabulary on the ham messages and count every term.
X_ham = vectorizer.fit_transform(ham_texts)

# Column sums give one total per vocabulary term.
ham_vocabulary = vectorizer.get_feature_names_out()
ham_totals = X_ham.sum(axis=0).A1
ham_words = pd.DataFrame({'word': ham_vocabulary, 'count': ham_totals})

# Ten most frequent terms.
top_ham = ham_words.sort_values(by='count', ascending=False).head(10)
print("Top 10 words in ham emails:", top_ham, sep="\n")


Top 10 words in ham emails:
           word  count
4539         pm    115
5710      state    103
4691  president     94
6160       time     84
4443    percent     77
5365  secretary     76
6808       work     73
3895         mr     71
5269       said     62
241    american     62


In [80]:
# Re-fit the same vectorizer on the spam messages (this replaces the ham vocabulary).
X_spam = vectorizer.fit_transform(spam_texts)

# Column sums give one total per vocabulary term.
spam_vocabulary = vectorizer.get_feature_names_out()
spam_totals = X_spam.sum(axis=0).A1
spam_words = pd.DataFrame({'word': spam_vocabulary, 'count': spam_totals})

# Ten most frequent terms.
top_spam = spam_words.sort_values(by='count', ascending=False).head(10)
print("\nTop 10 words in spam emails:", top_spam, sep="\n")



Top 10 words in spam emails:
              word  count
8196         money    920
143        account    794
1454          bank    745
5399          fund    703
2047      business    473
12982  transaction    416
3152       country    406
13051     transfer    392
8060       million    385
2709       company    365


## Extra features

In [87]:
# NOTE(review): `data_train` is not created in any earlier cell visible in this notebook;
# presumably it came from a cell that was removed — confirm, or define it before this point.
print(data_train.columns)

Index(['text'], dtype='object')


In [88]:
# Lowercase the raw text and replace every non-word character with a space.
# This removes punctuation as well as currency symbols such as '$' and '€'.
data_train['preprocessed_text'] = data_train['text'].str.lower().str.replace(r'\W', ' ', regex=True)


In [89]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x))

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x))

data_train.head()

NameError: name 'data_val' is not defined

## How does the Bag of Words concept work with CountVectorizer?

In [90]:
from sklearn.feature_extraction.text import CountVectorizer

# Two tiny documents to illustrate the bag-of-words representation.
corpus = ["I love NLP", "NLP is amazing"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# First the learned vocabulary (['amazing', 'is', 'love', 'nlp']),
# then the dense count matrix with one row per document.
for view in (vectorizer.get_feature_names_out(), X.toarray()):
    print(view)

['amazing' 'is' 'love' 'nlp']
[[0 0 1 1]
 [1 1 0 1]]


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [92]:
# TF-IDF vectorizer: drops English stop words and caps the vocabulary at 5000 terms.
# Note this rebinds `vectorizer`, which previously held a CountVectorizer.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)


In [93]:
# Learn the vocabulary and IDF weights from ALL rows of `data` and vectorize them.
# NOTE(review): any model evaluated on a train/test split of X_tfidf has already seen
# test-set statistics; prefer fitting the vectorizer on the training partition only.
X_tfidf = vectorizer.fit_transform(data['clean_text'])


In [94]:
# Report the matrix dimensions as (number of documents, number of TF-IDF features).
print("Shape of TF-IDF matrix:", X_tfidf.shape)


Shape of TF-IDF matrix: (1000, 5000)


## And then train a classifier?

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [97]:
y = data['label']

# Split the cleaned text FIRST, then fit TF-IDF on the training rows only.
# Fitting the vectorizer on the whole dataset before splitting lets the
# vocabulary and IDF weights absorb information from the test rows (data
# leakage) and inflates the evaluation scores.
text_train, text_test, y_train, y_test = train_test_split(
    data['clean_text'], y, test_size=0.2, random_state=42, stratify=y
)

# Assumes `vectorizer` is still the TfidfVectorizer configured above.
# The feature count can be below max_features if the training vocabulary is smaller.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

print("Training set:", X_train.shape)
print("Test set:", X_test.shape)


Training set: (800, 5000)
Test set: (200, 5000)


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed!!! It must be MultinomialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
# Your code