In [1]:
import pandas as pd
import numpy as np
import spacy
import torchtext
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.legacy.data import LabelField

In [178]:
data = pd.read_csv(r'C:\Users\Antoine\Coding Bootcamp\Open Food Facts\data_nlp_small.csv')
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

In [179]:
data.to_csv(r'C:\Users\Antoine\Coding Bootcamp\Open Food Facts\data_nlp_sample.csv', index=False, header=True)
df = pd.read_csv(r'C:\Users\Antoine\Coding Bootcamp\Open Food Facts\data_nlp_sample.csv')
df

Unnamed: 0,text,label
0,Skim milk; milk; vitamin a palmitate; vitamin ...,6
1,Boneless duck breast; water; sea salt Boneless...,4
2,HARINA DE TRIGO (GLUTEN); LEVADURA; AZUCAR; AC...,1
3,Sugar; enriched flour bleached (wheat flour; n...,9
4,glucose syrup (wheat sugar; peanuts; rice cris...,9
...,...,...
95,Peaches; water; and sugar. Yellow cling sliced...,5
96,Crème de coco 49.3% (crème de coco; protéines ...,6
97,Riz basmati semi-complet Riz basmati semi-complet,1
98,Peanuts*; tapioca syrup*; dark chocolate flavo...,9


In [180]:
path_data = r'C:\Users\Antoine\Coding Bootcamp\Open Food Facts'

In [181]:
le.classes_

array(['Beverages', 'Cereals and potatoes', 'Composite foods',
       'Fat and sauces', 'Fish Meat Eggs', 'Fruits and vegetables',
       'Milk and dairy products', 'Salty snacks', 'Sugary snacks',
       'unknown'], dtype=object)

In [182]:
spacy_en = spacy.load('en')
def tokenizer(text): return [tok.text for tok in spacy_en.tokenizer(text)]

In [200]:
TEXT = Field(sequential=True, lower=True, include_lengths=False, pad_token='<pad>', unk_token= '<unk>', batch_first=True, tokenize= tokenizer)
LABELS = Field(sequential=False, use_vocab=False, batch_first=True)
fields = [('text', TEXT), ('label', LABELS)]

In [201]:
train, valid, test = TabularDataset.splits(path=path_data, train='data_nlp_sample.csv', validation='data_nlp_sample.csv', test='data_nlp_sample.csv',
                                           format='csv', fields=fields, skip_header=True)

In [186]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

In [202]:
train_iter = BucketIterator(train, batch_size=32, sort_key=lambda x: len(x.text),
                            device=device, sort=True, sort_within_batch=True, shuffle=True, repeat=False)
valid_iter = BucketIterator(valid, batch_size=32, sort_key=lambda x: len(x.text),
                            device=device, sort=True, sort_within_batch=True, shuffle=True, repeat=False)
test_iter = BucketIterator(test, batch_size=64, sort_key=lambda x: len(x.text),
                            device=device, sort=True, sort_within_batch=True, shuffle=True, repeat=False)

In [203]:
TEXT.build_vocab(train, min_freq=2, vectors='glove.6B.300d')

In [204]:
len(TEXT.vocab)

504

In [191]:
TEXT.vocab.vectors.shape

torch.Size([504, 300])

In [207]:
batch = next(iter(train_iter))

In [192]:
class LSTMG1(nn.Module):
    def __init__(self, embedding_dim=300, n_layers=2, p=0.5):
        super(LSTMG1, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)
        self.lstm = nn.LSTM(
            input_size= embedding_dim, 
            hidden_size=embedding_dim, 
            num_layers=n_layers,
            bidirectional=True,
            batch_first=True)
        self.drop = nn.Dropout(p)
        self.fc = nn.Linear(embedding_dim, 10)
        self.h_o.weight = self.i_h.weight
    
    def forward(self, inputs):
        embeds = self.embedding(inputs)
        outputs, (h_n, c_n) = self.lstm(embeds)
        x = h_n[0]
        x = self.fc(x)
        return x

In [208]:
net = LSTMG1(embedding_dim=300).to(device)

In [209]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [210]:
net

LSTMG1(
  (embedding): Embedding(504, 300)
  (lstm): LSTM(300, 300, batch_first=True)
  (fc): Linear(in_features=300, out_features=10, bias=True)
)

In [218]:
%%time
n_epochs = 50
for epoch in range(n_epochs):
    batch = next(iter(train_iter))
    data = batch.text.to(device) #send x
    labels = batch.label.to(device) #send y

    outputs = net(data) #calc outputs
    loss = criterion(outputs, labels) #calc loss

    optimizer.zero_grad() #reinit gradient
    loss.backward() #backward loss
    optimizer.step() #update weights

Wall time: 450 ms


In [219]:
y_true = []
y_preds = []
with torch.no_grad():
    for batch in test_iter:
        data = batch.text.to(device)
        labels = batch.label.to(device)
        outputs = net(data)
        _, predicted = torch.max(outputs.data, 1)
        y_preds.append(predicted.cpu().numpy())
        y_true.append(labels.cpu().numpy())
        break
y_true = np.concatenate(y_true)
y_preds = np.concatenate(y_preds)

In [223]:
lab_y_true = le.inverse_transform(y_true)
lab_y_preds = le.inverse_transform(y_preds)

In [225]:
from sklearn.metrics import classification_report
print(classification_report(lab_y_true, lab_y_preds, zero_division=0))

                         precision    recall  f1-score   support

              Beverages       1.00      0.50      0.67        10
   Cereals and potatoes       1.00      0.80      0.89         5
        Composite foods       0.33      0.33      0.33         3
         Fat and sauces       0.00      0.00      0.00         5
         Fish Meat Eggs       0.80      0.50      0.62         8
  Fruits and vegetables       0.07      0.33      0.11         3
Milk and dairy products       0.00      0.00      0.00         3
           Salty snacks       0.00      0.00      0.00         2
          Sugary snacks       0.31      0.80      0.44         5
                unknown       0.62      0.50      0.56        20

               accuracy                           0.45        64
              macro avg       0.41      0.38      0.36        64
           weighted avg       0.57      0.45      0.48        64

