In [1]:
import pandas as pd

In [3]:
spam = pd.read_csv('/content/drive/MyDrive/Codar Software/Estudos/Kaggle/dados/spam.csv')

In [4]:
spam.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
import spacy

In [6]:
# Create an empty model
nlp = spacy.blank('en')

In [7]:
# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
    "textcat",
    config={
        "exclusive_classes": True,
        "architecture": "bow"
    }
)

In [8]:
# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

In [9]:
# Add labels to text classifier
textcat.add_label("ham")
textcat.add_label("spam")

1

In [10]:
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham', 'spam': label == 'spam'}} for label in spam['label']]

In [11]:
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [12]:
from spacy.util import minibatch

In [13]:
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

In [14]:
# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)

In [15]:
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separete lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

In [16]:
import random 

In [17]:
random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

In [18]:
losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separete lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 0.43189741921099767}
{'textcat': 0.6474976215331196}
{'textcat': 0.7842154536487618}
{'textcat': 0.8716683716818165}
{'textcat': 0.9280939335008995}
{'textcat': 0.9655779922872296}
{'textcat': 0.9939651840090362}
{'textcat': 1.0127976631523663}
{'textcat': 1.0275637812859075}
{'textcat': 1.0378531470013608}


In [20]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]

In [21]:
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)


print(scores)

[[9.9994397e-01 5.6023764e-05]
 [1.1491306e-02 9.8850864e-01]]


In [22]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']
