<a href="https://colab.research.google.com/github/mtermor/NTIC_DeepLearning/blob/main/NLP/03_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [7]:
df = pd.read_csv('https://raw.githubusercontent.com/eduardofc/data/main/amazon_sports.csv')
df['review_body'] = df['review_body'].str.replace("[^a-zA-ZñÑáéíóú .,:;]", "", regex=True)
df['review_body'] = df['review_body'].str.lower()
df.head()

Unnamed: 0,stars,review_body,review_title,product_category
0,1,nunca llego el pedido y el vendedor pasa de todo no contestan,No llego nunca,sports
1,1,"no sé como es, porque debería haber llegado ayer día de marzo, y hoy por la noche sigo esperando que llegue el frontal. y me habéis mandado el formulario de opinión. pues de momento mala no ha cumplido con la fecha de entrega.",Todavía no ha llegado,sports
2,1,"guantes cómodos, no lo niego, pero de mala calidad. yo creo que en caso de caída no valdrian para mucho, dos meses de uso y se están rajando.",Guantes de baja calidad,sports
3,1,hasta hoy no he visto el producto. el pedido hace ya casi mes. y notifico que he usado prime para está compra.,Muy Mala experiencia,sports
4,1,"no puedo valorarla porque, después de casi una semana, aún no he recibido mi pedido. pienso que amazon tendría que valorar las compañías de transporte con que trabaja, porque es indignante que pague mi cuota prime y nunca reciba mi pedido el día que toca",Paquete perdido?,sports


In [8]:
# positivos -> 5, 4 stars
# negativos -> 1, 2 stars

df = df[df.stars !=3]
df['bad_product'] = (df.stars > 3).astype(int)
df.groupby('bad_product').size()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bad_product'] = (df.stars > 3).astype(int)


bad_product
0    4989
1    5372
dtype: int64

# Preparación de las variables (X, y)

In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
X = df.review_body.values
y = df.bad_product

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state = 33)

print(len(X_train))
print(len(X_test))

8288
2073


## Tokenizer

In [15]:
vocab_size = 5000 # primer parámetro configurable

tokenizer = Tokenizer(
    num_words = vocab_size,
    oov_token = '<OOV>'
)
tokenizer.fit_on_texts(X_train)

In [18]:
tokenized_train = tokenizer.texts_to_sequences(X_train)
tokenized_test = tokenizer.texts_to_sequences(X_test) # para después validar

## Padding

In [19]:
max_length = 50

padd_train = pad_sequences(tokenized_train, maxlen=max_length, truncating='post')
padd_test = pad_sequences(tokenized_test, maxlen=max_length, truncating='post')

In [21]:
padd_test[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,   23,  608,   37,  255,    3,    4,  147,
         24,   34,  771,    7,    8,   12,  855,   18,   58,    4,    1,
          2,   16, 1069,   10,   39,   53], dtype=int32)

# Models

In [22]:
import tensorflow.keras as keras
import numpy as np
from time import time

from keras import Sequential
from keras.layers import *
from keras.optimizers import Adam

In [40]:
def model_compile(model):
  model.compile(
    loss = 'binary_crossentropy',
    optimizer='adam',
    metrics='accuracy'
  ) # optimizer could be Adam(lr), SGD...
  print(model.summary())

def model_fit(model):
  start = time()
  n_epochs = 20 # no vale la pena poner muy altos, luego se puede fittear 2 veces
  n_batch_size = 100 # puede afectar un poco más

  model.fit(
      padd_train,
      y_train,
      epochs = n_epochs,
      batch_size = n_batch_size,
      validation_data = (padd_test, y_test),
      verbose = True
  )

  end = time()
  print(f'>>>>> Elapsed time: {(end-start):.2f}s')
# Entrena durante una época con train y al final de cada época va dando un report
# en función a validation (sin mezclar los datos)

## Model 1: Dense

In [41]:
keras.utils.set_random_seed(812)

model = Sequential([
    Flatten(input_shape=(max_length,)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_compile(model)
model_fit(model)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_5 (Flatten)         (None, 50)                0         
                                                                 
 dense_8 (Dense)             (None, 128)               6528      
                                                                 
 dense_9 (Dense)             (None, 64)                8256      
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 14849 (58.00 KB)
Trainable params: 14849 (58.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/2

## Model 2: Dense + Embeddings

In [42]:
keras.utils.set_random_seed(812)

embed_dim = 20

model = Sequential([
    Embedding(
        input_length = max_length,
        output_dim = embed_dim,
        input_dim = vocab_size
    ),
    Flatten(),
    Dense(1, activation='sigmoid')
])

model_compile(model)
model_fit(model)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 20)            100000    
                                                                 
 flatten_6 (Flatten)         (None, 1000)              0         
                                                                 
 dense_11 (Dense)            (None, 1)                 1001      
                                                                 
Total params: 101001 (394.54 KB)
Trainable params: 101001 (394.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
>>>>> Elapsed time: 21.21s


## Model 3: CNN

In [44]:
keras.utils.set_random_seed(812)

embed_dim = 20

n_filters = 64
kernel_size = 5

model = Sequential([
    Embedding(
        input_length = max_length,
        output_dim = embed_dim,
        input_dim = vocab_size
    ),
    Conv1D(n_filters, kernel_size, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_compile(model)
model_fit(model)

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 20)            100000    
                                                                 
 conv1d_1 (Conv1D)           (None, 46, 64)            6464      
                                                                 
 global_average_pooling1d (  (None, 64)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense_14 (Dense)            (None, 6)                 390       
                                                                 
 dense_15 (Dense)            (None, 1)                 7         
                                                                 
Total params: 106861 (417.43 KB)
Trainable params: 106861 (417.43 KB)
Non-trainable params: 0 (0.00 Byte)
______________

## Model 4: (Bi-)LSTM + Embeddings

In [45]:
keras.utils.set_random_seed(812)

embed_dim = 20
lstm_dim = 32

model = Sequential([
    Embedding(
        input_length = max_length,
        output_dim = embed_dim,
        input_dim = vocab_size
    ),
    LSTM(lstm_dim),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_compile(model)
model_fit(model)

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 20)            100000    
                                                                 
 lstm (LSTM)                 (None, 32)                6784      
                                                                 
 dense_16 (Dense)            (None, 6)                 198       
                                                                 
 dense_17 (Dense)            (None, 1)                 7         
                                                                 
Total params: 106989 (417.93 KB)
Trainable params: 106989 (417.93 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 

In [46]:
keras.utils.set_random_seed(812)

embed_dim = 20
lstm_dim = 32

model = Sequential([
    Embedding(
        input_length = max_length,
        output_dim = embed_dim,
        input_dim = vocab_size
    ),
    Bidirectional(LSTM(lstm_dim)),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_compile(model)
model_fit(model)

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 50, 20)            100000    
                                                                 
 bidirectional (Bidirection  (None, 64)                13568     
 al)                                                             
                                                                 
 dense_18 (Dense)            (None, 6)                 390       
                                                                 
 dense_19 (Dense)            (None, 1)                 7         
                                                                 
Total params: 113965 (445.18 KB)
Trainable params: 113965 (445.18 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/

# Técnicas de mejora

## Reducción de overfitting

In [47]:
keras.utils.set_random_seed(812)

embed_dim = 20
lstm_dim = 32

model = Sequential([
    Embedding(
        input_length = max_length,
        output_dim = embed_dim,
        input_dim = vocab_size
    ),
    Dropout(.5),
    Bidirectional(LSTM(lstm_dim)),
    Dropout(0.5),
    BatchNormalization(),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_compile(model)
model_fit(model)

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 50, 20)            100000    
                                                                 
 dropout (Dropout)           (None, 50, 20)            0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                13568     
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 batch_normalization (Batch  (None, 64)                256       
 Normalization)                                                  
                                                                 
 dense_20 (Dense)            (None, 6)               

## Tamaño del dataset

In [50]:
df1 = pd.read_csv('https://raw.githubusercontent.com/eduardofc/data/main/amazon_sports.csv')
df2 = pd.read_csv('https://raw.githubusercontent.com/eduardofc/data/main/amazon_home.csv')
df3 = pd.read_csv('https://raw.githubusercontent.com/eduardofc/data/main/amazon_electronics.csv')
df = pd.concat([df1, df2, df3])

df['review_body'] = df['review_body'].str.replace("[^a-zA-ZñÑáéíóú .,:;]", "", regex=True)
df['review_body'] = df['review_body'].str.lower()
df.shape

(50536, 4)

In [51]:
# positivos -> 5, 4 stars
# negativos -> 1, 2 stars

df = df[df.stars !=3]
df['bad_product'] = (df.stars > 3).astype(int)
df.groupby('bad_product').size()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bad_product'] = (df.stars > 3).astype(int)


bad_product
0    20512
1    19817
dtype: int64

In [52]:
X = df.review_body.values
y = df.bad_product

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state = 33)

print(len(X_train))
print(len(X_test))

32263
8066


In [53]:
vocab_size = 5000 # primer parámetro configurable

tokenizer = Tokenizer(
    num_words = vocab_size,
    oov_token = '<OOV>'
)
tokenizer.fit_on_texts(X_train)

In [54]:
tokenized_train = tokenizer.texts_to_sequences(X_train)
tokenized_test = tokenizer.texts_to_sequences(X_test) # para después validar

In [55]:
max_length = 50

padd_train = pad_sequences(tokenized_train, maxlen=max_length, truncating='post')
padd_test = pad_sequences(tokenized_test, maxlen=max_length, truncating='post')

### Modelling

In [56]:
keras.utils.set_random_seed(812)

embed_dim = 20
lstm_dim = 32

model = Sequential([
    Embedding(
        input_length = max_length,
        output_dim = embed_dim,
        input_dim = vocab_size
    ),
    Bidirectional(LSTM(lstm_dim)),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_compile(model)
model_fit(model)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 50, 20)            100000    
                                                                 
 bidirectional_2 (Bidirecti  (None, 64)                13568     
 onal)                                                           
                                                                 
 dense_22 (Dense)            (None, 6)                 390       
                                                                 
 dense_23 (Dense)            (None, 1)                 7         
                                                                 
Total params: 113965 (445.18 KB)
Trainable params: 113965 (445.18 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/