## Pipeline + string manipulation + scikit-learn

In [1]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer , TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords

## Carga de datos

In [10]:
messages = [line.rstrip() for line in open('.\Datasets\SMSSpamCollection')]
print(len(messages))

#ham = no es spam
#spam = spam

5574


In [11]:
for message_no, message in enumerate(messages[:10]):
    print(message_no, message)
    #print('\n')

0 ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
1 ham	Ok lar... Joking wif u oni...
2 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3 ham	U dun say so early hor... U c already then say...
4 ham	Nah I don't think he goes to usf, he lives around here though
5 spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv
6 ham	Even my brother is not like to speak with me. They treat me like aids patent.
7 ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8 spam	WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 1

In [13]:
messages = pandas.read_csv('.\Datasets\SMSSpamCollection', sep='\t',names=["label", "message"])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
messages['length'] = messages['message'].apply(len) #agrega una columna con la longitud de cada mensaje 
messages.head()

Unnamed: 0,label,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


## Manipulación de strings

In [16]:
import string

mess = 'Sample message! Notice: it has punctuation.' #mensaje de ejemplo 

# verificar si cada caracter en mess es un signo de puntuación o no
nopunc = [char for char in mess if char not in string.punctuation]

# une los caracteres nuevamente para formar la cadena.
nopunc = ''.join(nopunc)

In [17]:
nopunc

'Sample message Notice it has punctuation'

In [18]:
nopunc.split() # dividir en palabras separadas por espacios en blanco (sin signos de puntuación) 

['Sample', 'message', 'Notice', 'it', 'has', 'punctuation']

In [19]:
clean_mess = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')] # remover las stopwords

In [20]:
def text_process(mess):
    """
    Tomar un string en bruto y hacer lo siguiente:
    1. remueve puntuacion
    2. remueve stopwords
    3. regresa una lista de las palabras limpias
    """
    # verifica los caracteres para ver si estan en puntuacion
    nopunc = [char for char in mess if char not in string.punctuation]

    # une los caracteres nuevamente para formar el string.
    nopunc = ''.join(nopunc)
    
    # elimina las stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [21]:
#  verificamos que la función funciona
messages['message'].head(5).apply(text_process) # tokenizar el texto

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [22]:
#importa train_test_split
from sklearn.model_selection import train_test_split

msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2) #separación de los datos en conjuntos de entrenamiento y prueba (80% y 20% respectivamente)

print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test)) #verifica el tamaño de los conjuntos de entrenamiento y prueba (80% y 20% respectivamente)

4457 1115 5572


In [23]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [24]:
pipeline.fit(msg_train,label_train) # entrena el modelo con los datos de entrenamiento y las etiquetas de entrenamiento 

In [25]:
predictions = pipeline.predict(msg_test) # predecir el conjunto de prueba (test) 

In [26]:
print(classification_report(predictions,label_test)) # imprimir reporte de clasificación 

              precision    recall  f1-score   support

         ham       1.00      0.95      0.98      1000
        spam       0.71      1.00      0.83       115

    accuracy                           0.96      1115
   macro avg       0.86      0.98      0.90      1115
weighted avg       0.97      0.96      0.96      1115

