In [None]:
# Install NLTK into the environment of the running kernel.
# NOTE(review): the version is not pinned, and the other packages imported in
# later cells (pandas, matplotlib, seaborn, scikit-learn) are assumed to be present.
%pip install nltk

In [None]:
import nltk
import pandas as pd

In [None]:
# Load the tab-separated SMS file. It is read without a header row, so the two
# columns are named explicitly.
df = pd.read_csv(
    "./SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"],
)

In [None]:
# Preview the first rows to confirm both columns were parsed.
df.head()

In [None]:
# Character count of each message. The vectorized `.str.len()` accessor replaces
# `.apply(len)`: it gives the same values for strings, and a missing message
# yields NaN instead of raising TypeError.
df["length"] = df["message"].str.len()

In [None]:
# Preview again to check the new "length" column.
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Distribution of message lengths over the whole dataset (100 bins).
df["length"].plot.hist(bins=100, figsize=(10, 5))

In [None]:
# Message-length histograms grouped by the "label" column.
df.hist(
    column="length",
    by="label",
    bins=100,
    figsize=(10, 5),
)

In [None]:
# Text processing: walk through punctuation and stop-word removal on one sample
# sentence before the same steps are wrapped in a function.

import string

# Sample (Spanish) sentence with punctuation and digits, used by the demo cells that follow.
msj = "Mensajito: este es un mensaje de prueba, con puntuación y números 1234567890"

In [None]:
# Show the ASCII punctuation characters that the next cell filters out.
string.punctuation

In [None]:
# Keep every character of the sample that is not ASCII punctuation.
# NOTE(review): despite its name, `with_punctuation` holds the text WITHOUT punctuation.
with_punctuation = [ch for ch in msj if ch not in string.punctuation]

In [None]:
# Glue the surviving characters back into a single string.
with_punctuation = "".join(with_punctuation)

In [None]:
# Inspect the punctuation-free sentence.
with_punctuation

In [None]:
from nltk.corpus import stopwords

In [None]:
# Download the NLTK stop-word corpus that the following cells read from.
nltk.download("stopwords")

In [None]:
# Spanish stop words used by the demo cells and by `preprocess_text`.
# The list is read through `nltk.corpus.stopwords` instead of the bare imported
# name: this assignment rebinds `stopwords` to a plain list, so calling
# `stopwords.words(...)` again on a re-run raised AttributeError.
# NOTE(review): the SMS dataset is presumably English text; confirm that
# "spanish" is the intended language for the classifier.
stopwords = nltk.corpus.stopwords.words("spanish")

In [None]:
# Split the cleaned sentence on whitespace into individual words.
words = with_punctuation.split()

In [None]:
# Drop stop words (compared in lowercase) and print what is left.
cleaned_words = [word for word in words if word.lower() not in stopwords]
print(cleaned_words)

In [None]:
def preprocess_text(text, stop_words=None):
    """Tokenize a message, dropping ASCII punctuation and stop words.

    Args:
        text: A raw message string, or an iterable of tokens (for example the
            output of a previous call). Tokens are re-joined with single spaces
            before processing, so calling the function on its own output
            returns the same tokens instead of one concatenated word.
        stop_words: Optional collection of lowercase words to discard. When
            omitted, the notebook-level ``stopwords`` collection is used.

    Returns:
        list[str]: The remaining words, in their original order and casing.
    """
    if stop_words is None:
        # Looked up at call time so the function follows the notebook's current list.
        stop_words = stopwords
    # Hash-based membership instead of scanning a list for every word.
    if isinstance(stop_words, (set, frozenset)):
        blocked = stop_words
    else:
        blocked = set(stop_words)

    if not isinstance(text, str):
        text = " ".join(text)

    # Delete every punctuation character in one pass, then split on whitespace.
    stripped = text.translate(str.maketrans("", "", string.punctuation))
    return [word for word in stripped.split() if word.lower() not in blocked]

In [None]:
# Store the tokens in their own column and leave the raw text in "message".
# Overwriting "message" made this cell non-idempotent and handed token lists to
# the CountVectorizer cells further down, whose analyzer is written for raw
# strings and tokenizes each document itself.
df["tokens"] = df["message"].apply(preprocess_text)

In [None]:
# Check the frame after the preprocessing step.
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn a bag-of-words vocabulary, using `preprocess_text` as the analyzer.
# NOTE(review): this is fit on the full frame (before any train/test split) and
# `transformer` is not referenced by the cells visible after it; confirm it is needed.
transformer = (
    CountVectorizer(analyzer=preprocess_text)
    .fit(df["message"])
)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Hold out 20% of the messages for evaluation. Stratifying on the label keeps the
# class proportions equal in both splits, so the held-out metrics are not skewed
# by an unlucky draw of the (presumably rarer) spam class.
x_train, x_test, y_train, y_test = train_test_split(
    df["message"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

In [None]:
# Token counts -> TF-IDF weighting -> random forest, chained as a single estimator.
pipeline = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer(analyzer=preprocess_text)),
        ("tfidf", TfidfTransformer()),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

In [None]:
# Fit every pipeline step on the training split only.
pipeline.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out messages.
preds = pipeline.predict(x_test)

In [None]:
# Compare the predictions with the true held-out labels in a per-class report.
print(classification_report(y_test, preds))