# 12. NATURAL LANGUAGE PROCESSING

# 12.2. Text Classification

# 12.2.1. COURS

In [1]:
# Chemin des sources
import os
os.chdir('C:/Users/PC Maison/4-KAGGLE/KAGGLE_DEV/KAGGLE_COURS_12-NATURAL_LANGUAGE_PROCESSING')

In [3]:
import pandas as pd

In [5]:
# Loading the spam data
# ham is the label for non-spam messages
spam = pd.read_csv('input/spam.csv')
spam.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


# Building a Bag of Words model

In [6]:
import spacy

# Create an empty model
nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

In [7]:
# Add labels to text classifier
textcat.add_label("ham")
textcat.add_label("spam")

1

# Training a Text Categorizer Model

In [10]:
train_texts = spam['text'].values
print('Train test : ',train_texts)
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in spam['label']]
print('train_labels', train_labels)

Train test :  ['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name']
train_labels [{'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, 

In [11]:
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [12]:
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separate lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

In [13]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 0.43189746545721164}
{'textcat': 0.6474976880825238}
{'textcat': 0.7842155215195028}
{'textcat': 0.8716684506576318}
{'textcat': 0.9280940218226823}
{'textcat': 0.9655780800097745}
{'textcat': 0.9939652740363053}
{'textcat': 1.0127977507994492}
{'textcat': 1.0275638671829697}
{'textcat': 1.0378532280854098}


# Making Predictions

In [14]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[9.9994397e-01 5.6023655e-05]
 [1.1491281e-02 9.8850876e-01]]


In [15]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']


# 12.2.2. EXERCICES

In [16]:
import pandas as pd

# Step 2: Review Data and Create the model

In [24]:
def load_data(csv_file, split=0.9):
    data = pd.read_csv(csv_file)
    
    # Shuffle data
    train_data = data.sample(frac=1, random_state=7)
    
    texts = train_data.text.values
    labels = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)}
              for y in train_data.sentiment.values]
    split = int(len(train_data) * split)
    
    train_labels = [{"cats": labels} for labels in labels[:split]]
    val_labels = [{"cats": labels} for labels in labels[split:]]
    
    return texts[:split], train_labels, texts[split:], val_labels, data

In [25]:
train_texts, train_labels, val_texts, val_labels, data = load_data('input/yelp_ratings.csv')

In [26]:
data

Unnamed: 0,text,stars,sentiment
0,Total bill for this horrible service? Over $8G...,1.0,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,5.0,1
2,I have to say that this office really has it t...,5.0,1
3,Went in for a lunch. Steak sandwich was delici...,5.0,1
4,Today was my second out of three sessions I ha...,1.0,0
...,...,...,...
44525,"Petit café sympa, peu de place assise. Bonne b...",4.0,1
44526,Absolutely delicious! The food was full of ama...,5.0,1
44527,I love this place! They always have the cutes...,4.0,1
44528,i would understand a 2 hour wait for food if i...,2.0,0


In [20]:
train_texts

array(["Some of the best sushi I've ever had....and I come from the East Coast.  Unreal toro, have some of it's available.",
       "One of the best burgers I've ever had and very well priced. I got the tortilla burger and is was delicious especially with there tortilla soup!",
       'Review by a vegetarian family with two young kids. \n\nSeveral reviews have lamented the small number of vegetarian options on the menu and, while it is true that there are far more options for meat eaters and there is unfortunately no vegetarian noodle soup option, once you get over these 2 facts this is an excellent place for vegetarians.',
       ...,
       'Their chicken wings is the bomb. I live in Mississauga but drove all the way up there for the wings. We ordered honey garlic, sweet chilli and mango chipotle. \nHalf price wings on Mondays.',
       'The pizza is really good! Staff does a great job especially with how busy they seem to get they handle it like champs! Manager is amazing she does a

In [21]:
train_labels

[{'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': False, 'NEGATIVE': True}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': False, 'NEGATIVE': True}},
 {'cats': {'POSITIVE': False, 'NEGATIVE': True}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': False, 'NEGATIVE': True}},
 {'cats': {'POSITIVE': False, 'NEGATIVE': True}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}},


In [28]:
print('Texts from training data\n------')
print(train_texts[:2])
print('\nLabels from training data\n------')
print(train_labels[:2])

Texts from training data
------
["Some of the best sushi I've ever had....and I come from the East Coast.  Unreal toro, have some of it's available."
 "One of the best burgers I've ever had and very well priced. I got the tortilla burger and is was delicious especially with there tortilla soup!"]

Labels from training data
------
[{'cats': {'POSITIVE': True, 'NEGATIVE': False}}, {'cats': {'POSITIVE': True, 'NEGATIVE': False}}]


# Step 3: Train Function

In [29]:
from spacy.util import minibatch
import random

def train(model, train_data, optimizer):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    
    batches = minibatch(train_data, size=8)
    for batch in batches:
        # train_data is a list of tuples [(text0, label0), (text1, label1), ...]
        # Split batch into texts and labels
        texts, labels = zip(*batch)
        
        # Update model with texts and labels
        model.update(texts, labels, sgd=optimizer, losses=losses)
        
    return losses


In [30]:
# Fix seed for reproducibility
spacy.util.fix_random_seed(1)
random.seed(1)

# This may take a while to run!
optimizer = nlp.begin_training()
train_data = list(zip(train_texts, train_labels))
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])

0.0


In [31]:
text = "This tea cup was full of holes. Do not recommend."
doc = nlp(text)
print(doc.cats)

{'ham': 0.5000000596046448, 'spam': 0.4999999403953552}


# Step 4: Making Predictions

In [33]:
def predict(nlp, texts): 
    # Use the model's tokenizer to tokenize each input text
    docs = [nlp.tokenizer(text) for text in texts]
    
    # Use textcat to get the scores for each doc
    textcat = nlp.get_pipe('textcat')
    scores, _ = textcat.predict(docs)
    
    # From the scores, find the class with the highest score/probability
    predicted_class = scores.argmax(axis=1)
    
    return predicted_class

In [34]:
texts = val_texts[34:38]
predictions = predict(nlp, texts)

for p, t in zip(predictions, texts):
    print(f"{textcat.labels[p]}: {t} \n")

ham: Came over and had their "Pick 2" lunch combo and chose their best selling 1/2 chicken sandwich with quinoa.  Both were tasty, the chicken salad is a bit creamy but was perfect with quinoa on the side.  This is a good lunch joint, casual and clean! 

ham: Went here last night and got oysters, fried okra, fries, and onion rings. I cannot complain. The portions were great and tasty!!! I will definitely be back for more. I cannot wait to try the crawfish boudin and soft shell crab. 

ham: This restaurant was fantastic! 
The concept of eating without vision was intriguing. The dinner was filled with laughs and good conversation. 

We were lead in a line to our table and each person to their seat. This was not just dark but you could not see something right in front of your face. 

The waiters/waitresses were all blind and allowed us to see how aware you need to be without the vision. 

Taking away one sense is said to increase your other senses so as taste and hearing which I believed 

# Step 5: Evaluate The Model

In [35]:
def evaluate(model, texts, labels):
    """ Returns the accuracy of a TextCategorizer model. 
    
        Arguments
        ---------
        model: ScaPy model with a TextCategorizer
        texts: Text samples, from load_data function
        labels: True labels, from load_data function
    
    """
    # Get predictions from textcat model (using your predict method)
    predicted_class = predict(model, texts)
    
    # From labels, get the true class as a list of integers (POSITIVE -> 1, NEGATIVE -> 0)
    true_class = [int(each['cats']['POSITIVE']) for each in labels]
    
    # A boolean or int array indicating correct predictions
    correct_predictions = predicted_class == true_class
    
    # The accuracy, number of correct predictions divided by all predictions
    accuracy = correct_predictions.mean()
    
    return accuracy

In [36]:
accuracy = evaluate(nlp, val_texts, val_labels)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.2477


In [37]:
# This may take a while to run!
n_iters = 5
for i in range(n_iters):
    losses = train(nlp, train_data, optimizer)
    accuracy = evaluate(nlp, val_texts, val_labels)
    print(f"Loss: {losses['textcat']:.3f} \t Accuracy: {accuracy:.3f}")

Loss: 0.000 	 Accuracy: 0.248
Loss: 0.000 	 Accuracy: 0.248
Loss: 0.000 	 Accuracy: 0.248
Loss: 0.000 	 Accuracy: 0.248
Loss: 0.000 	 Accuracy: 0.248


# Step 6: Keep Improving