In [None]:
import logging
logging.disable(logging.INFO) # disable INFO and DEBUG logging everywhere
# or 
logging.disable(logging.WARNING) # disable WARNING, INFO and DEBUG logging everywhere
!pip install transformers


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.model_selection import train_test_split

In [None]:
# Run configuration shared by the data generator, the model and training.
epochs = 2
batch_size = 32
max_length = 128  # Token length every encoded sentence pair is padded to.

# Class ids that can appear in the dataset's label column.
labels = [0, 1]

In [None]:
# Load the training CSV directly from the PARMEX_2022 GitHub repository.
train = pd.read_csv('https://raw.githubusercontent.com/GIL-UNAM/PARMEX_2022/main/parmex_train.csv')

print(f'Train dataframe contains {train.shape[0]} samples.')
print('Number of features in train data : ', train.shape[1])
print('Train Features : ', train.columns.values)

# The preview must be the final expression: a notebook cell only renders the
# value of its last statement, so a bare `train.head(3)` in the middle of the
# cell is evaluated and then thrown away.
print("Train Dataframe:")
train.head(3)

In [None]:
# Mount Google Drive at /content/drive; later cells save the tokenizer and the
# BERT weights under this mount point and reload them from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Hold out a quarter of the rows; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    train,
    test_size=0.25,
    random_state=10,
)

In [None]:
# NOTE(review): the validation frame is the same object as the test frame, so
# the validation metrics reported during training and the final test metrics
# are computed on the same rows. Consider carving out a separate validation
# split — confirm whether this is intentional.
valid_df=test_df

In [None]:
# Report how many rows each split holds.
print(f"Total train samples : {train_df.shape[0]}")
print(f"Total validation samples: {valid_df.shape[0]}")
# Count the test frame itself rather than valid_df, so this line stays correct
# even if the two frames stop being the same object.
print(f"Total test samples: {test_df.shape[0]}")

In [None]:
# Show one example pair. Positional access (`iloc`) is used because train_df
# still carries the original, shuffled row labels at this point; a label-based
# `.loc[1, ...]` raises KeyError whenever row 1 was assigned to the test split.
print(f"Sentence1: {train_df['Text1'].iloc[1]}")
print(f"Sentence2: {train_df['Text2'].iloc[1]}")
print(f"Similarity: {train_df['Label'].iloc[1]}")

In [None]:
def _drop_unlabeled_and_shuffle(frame):
    """Drop rows whose Label is "-", shuffle with a fixed seed, renumber the index."""
    has_label = frame.Label != "-"
    shuffled = frame[has_label].sample(frac=1.0, random_state=42)
    return shuffled.reset_index(drop=True)


train_df = _drop_unlabeled_and_shuffle(train_df)
valid_df = _drop_unlabeled_and_shuffle(valid_df)

In [None]:
# One-hot encode the Label column of each split into two columns, matching the
# two-unit softmax output and the categorical cross-entropy loss of the model.
y_train = tf.keras.utils.to_categorical(train_df.Label, num_classes=2)
y_val = tf.keras.utils.to_categorical(valid_df.Label, num_classes=2)
y_test = tf.keras.utils.to_categorical(test_df.Label, num_classes=2)

In [None]:
# Re-mount Google Drive, forcing a fresh mount, before writing to it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


# Download the `dccuchile/bert-base-spanish-wwm-cased` tokenizer and persist it
# to Drive; the data generator later loads the tokenizer from this folder.
tokenizer = transformers.BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
tokenizer.save_pretrained('/content/drive/My Drive/tokenizer2/')


In [None]:
# Download the `hiiamsid/sentence_similarity_spanish_es` checkpoint, loading it
# from PyTorch weights (from_pt=True), and store it in the same Drive folder as
# the tokenizer so both can be reloaded from a single path.
# NOTE(review): the tokenizer was fetched from a different hub repository than
# this model — confirm that their vocabularies match.
bert_model = transformers.TFBertModel.from_pretrained("hiiamsid/sentence_similarity_spanish_es",from_pt=True)
bert_model.save_pretrained('/content/drive/My Drive/tokenizer2/')

In [None]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that tokenizes sentence pairs into BERT input batches.

    Args:
        sentence_pairs: NumPy array of sentence pairs, one ``[first, second]``
            pair per row.
        labels: Array of labels aligned with ``sentence_pairs``; may be
            ``None`` when ``include_targets`` is False.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the sample order (at construction
            and after every epoch).
        include_targets: boolean, whether to include the labels.

    Returns:
        Each item is a tuple ``([input_ids, attention_mask, token_type_ids], labels)``
        (or just ``[input_ids, attention_mask, token_type_ids]``
        if ``include_targets=False``).

    Raises:
        ValueError: if ``include_targets`` is True but ``labels`` is None.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        # Fail early with a clear message instead of a TypeError on the first batch.
        if include_targets and labels is None:
            raise ValueError("labels must be provided when include_targets=True")

        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load the BERT tokenizer previously saved to Google Drive.
        # NOTE(review): do_lower_case=True is applied although the tokenizer was
        # saved from a checkpoint whose name ends in "-cased" — confirm that
        # lower-casing the input is really intended.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "/content/drive/My Drive/tokenizer2/", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Number of batches per epoch. Round up so that a final, smaller batch is
        # still served; flooring would silently drop up to batch_size - 1 samples,
        # which skews evaluation on the held-out data.
        return int(np.ceil(len(self.sentence_pairs) / self.batch_size))

    def __getitem__(self, idx):
        # Select the sample positions that belong to batch `idx`.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With the BERT tokenizer's batch_encode_plus both sentences of a pair are
        # encoded together and separated by the [SEP] token. Padding and truncation
        # are requested explicitly (the `pad_to_max_length` flag is deprecated) so
        # every row has exactly `max_length` tokens, as the model inputs require.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="tf",
        )

        # Convert the batch of encoded features to int32 numpy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Targets are attached when the generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Re-permute the index array after each epoch if shuffle is set to True.
        # A freshly seeded RandomState is used each time, so the sequence of
        # orderings is deterministic from run to run.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [None]:
# Create the model under a distribution strategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Load the pretrained BERT weights previously saved to Google Drive.
    bert_model = transformers.TFBertModel.from_pretrained("/content/drive/My Drive/tokenizer2/")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False

    bert_output = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    # Only the per-token hidden states feed the classification head below.
    sequence_output = bert_output.last_hidden_state
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    # Two-way softmax, matching the one-hot encoded binary labels.
    output = tf.keras.layers.Dense(2, activation="softmax")(dropout)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    # The metric is spelled "accuracy" (not "acc") so the logged metric names are
    # the same here and in the fine-tuning compile step later in the notebook.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )


print(f"Strategy: {strategy}")
model.summary()

In [None]:
# Wrap the training and validation splits in batch generators; only the
# training stream is shuffled.
train_data = BertSemanticDataGenerator(
    sentence_pairs=train_df[["Text1", "Text2"]].to_numpy().astype(str),
    labels=y_train,
    batch_size=batch_size,
    shuffle=True,
)
valid_data = BertSemanticDataGenerator(
    sentence_pairs=valid_df[["Text1", "Text2"]].to_numpy().astype(str),
    labels=y_val,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:

# Train only the layers on top of the frozen BERT encoder.
# NOTE(review): `use_multiprocessing=True, workers=-1` was dropped here. A
# negative worker count is not a documented "use all cores" value for Keras,
# and recent Keras releases no longer accept these arguments in `fit()` —
# confirm against the installed version.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
)

In [None]:
# Unfreeze the bert_model.
bert_model.trainable = True
# Recompile the model to make the change effective.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [None]:
# Fine-tune the whole network, BERT encoder included.
# NOTE(review): `use_multiprocessing=True, workers=-1` was dropped here. A
# negative worker count is not a documented "use all cores" value for Keras,
# and recent Keras releases no longer accept these arguments in `fit()` —
# confirm against the installed version.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
)



Epoch 1/2
Epoch 2/2


In [None]:
# Score the fine-tuned model on the held-out split, served in its stored order.
test_data = BertSemanticDataGenerator(
    sentence_pairs=test_df[["Text1", "Text2"]].to_numpy().astype(str),
    labels=y_test,
    batch_size=batch_size,
    shuffle=False,
)
model.evaluate(test_data, verbose=1)

In [None]:
def check_similarity(sentence1, sentence2):
    """Classify a single sentence pair with the trained model.

    Args:
        sentence1: First sentence of the pair (converted with ``str``).
        sentence2: Second sentence of the pair (converted with ``str``).

    Returns:
        Tuple ``(pred, proba)``: ``pred`` is the entry of the module-level
        ``labels`` list with the highest predicted probability, and ``proba``
        is that probability formatted as a percentage string, e.g. ``"98.12%"``.
    """
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    # A one-row generator without targets gives the same encoding path that is
    # used for training and evaluation.
    test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

    proba = model.predict(test_data[0])[0]
    idx = np.argmax(proba)
    # The softmax output is a fraction in [0, 1]; the "%" format spec scales it
    # by 100 so the percent sign is truthful (previously 0.98 was rendered as
    # " 0.98%").
    proba = f"{float(proba[idx]):.2%}"
    pred = labels[idx]
    return pred, proba

In [None]:
# Smoke-test the helper on one hand-written pair and show the predicted label.
# NOTE(review): these example sentences are English while the tokenizer and
# model are loaded from Spanish checkpoints — consider a Spanish example.
sentence1 = "A smiling costumed woman is holding an umbrella"
sentence2 = "A happy woman in a fairy costume holds an umbrella"
check_similarity(sentence1, sentence2)[0]

In [None]:
# Work on copies with a fresh 0..n-1 index so the rows can be addressed by
# position in the prediction loops that follow.
test_dff=test_df.reset_index(drop=True)
train_dff=train_df.reset_index(drop=True)

In [None]:
# Predicted label for every training pair, in row order.
ppp = [
    check_similarity(first, second)[0]
    for first, second in zip(train_dff['Text1'], train_dff['Text2'])
]




In [None]:
# Attach the predictions and write every column to disk, header included and
# without the row index.
train_dff['predictions'] = ppp
train_dff.to_csv('misclassifyTrain.csv', index=False)

In [None]:
# Show full cell text instead of truncated columns.
pd.set_option('display.max_colwidth', None)

# Training rows whose prediction disagrees with the gold label.
is_wrong = train_dff['Label'] != train_dff['predictions']
mdf = train_dff.loc[is_wrong]
mdf.head(10)

Unnamed: 0,Text1,Text2,Label,predictions
5,los colores y el ánimo mexicano son factores fundamentales en esta época del año que resaltan lo especial de esta fecha.,una de las tradiciones más populares y que es un referente internacional sin duda es el día de muertos.,1,0
29,"una de las referencias escritas más tempranas sobre el uso del sushi en japón data del año 718 como parte de un tratado de leyes denominado yororitsuryo, en el se hace referencia al uso del sushi como forma de pago de impuestos.",al mezclarse el vinagre con el arroz el proceso de fermentar el pescado para obtener los olores y el sabor del nare sushi devino obsoleto.,1,0
37,"durante el período del imperio mongol que se expandió por el territorio chino, esta forma de conservar el pescado quedó parcialmente olvidada, quizás por ser las costumbres mongolas más carnívoras.",no era necesario usar estos métodos de conservar el pescado en las poblaciones de la costa porque el pescado lo tenían todo el año.,0,1
152,"dentro, los frutos cocidos se mantienen calientes y, por consiguiente, listos para ser consumidos.",en su interior se guarda la temperatura adecuada para que estos platillos puedan ser comidos sin mayor preparación.,1,0
254,"existen 10 casas tequileras que se consideran las más importantes del país, tanto por su antigüedad e importancia en el mercado y dicha lista está encabezada por tequila josé cuervo.","debido a lo anterior, varios productores de diferentes zonas buscan aprovecharse del buen nombre que los productores originales han ganado con el tiempo.",1,0
328,"para su elaboración, primero se extraen azúcares contenidos en las piñas de agave y se separan de la fibra. después, se decide qué bebida se elaborará a partir de sus componentes.","aunque es diferente al mezcal y podría parecer que no tienen ninguna relación, gracias su elaboración, el tequila se considera como un tipo específico de mezcal.",1,0
331,"al ser una bebida distinta al mezcal, se cree que no tendría relación en aspectos de elaboración con el tequila, pero es considerado como un tipo específico de mezcal.","el destilado de agave que se utiliza para preparar el tequila se realiza solo en zonas específicas de méxico, por esta razón es un producto de origen registrado",0,1
346,"la carne tradicional de este platillo es el cordero, pero depende de los gustos locales y prohibiciones religiosas, en la actualidad se prepara con carne de cordero, cabra, pollo, cerdo o pescado.","en irán, hay muchas variaciones del mismo platillo, y últimamente se ve que al cocinarlo también se usa pescado o pavo.",1,0
366,"con esta tradición, se busca rendir tributo a los antepasados familiares, a los difuntos cercanos o a personajes importantes.","usualmente, los altares están compuestos por varios niveles que representan la cosmovisión de quienes lo ponen variando la región en la que se hace, y conecta al mundo material con el inmaterial, cada nivel tiene un significado diferente.",1,0
494,expresiones como cocina molecular o gastronomía molecular son más populares en los medios que en la hostelería.,"este método consiste en hervir el líquido con baja presión y a temperatura baja con el fin de conservar las moléculas del sabor, una técnica usada inicialmente en laboratorios químicos que después se empleó en la cocina, inclusive puede extraer el suave aroma de los pétalos de rosa.",1,0


In [None]:
# Per-class precision / recall / F1 of the predictions on the training rows
# (at this point `ppp` holds the training-set predictions).
from sklearn.metrics import classification_report
print(classification_report(train_dff.Label,ppp))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4602
           1       0.97      0.93      0.95      1098

    accuracy                           0.98      5700
   macro avg       0.98      0.96      0.97      5700
weighted avg       0.98      0.98      0.98      5700



In [None]:
# Predicted label for every held-out pair, in row order (rebinds `ppp`).
ppp = [
    check_similarity(first, second)[0]
    for first, second in zip(test_dff['Text1'], test_dff['Text2'])
]


In [None]:
# Per-class precision / recall / F1 of the predictions on the held-out rows
# (`ppp` now holds the test-set predictions).
from sklearn.metrics import classification_report
print(classification_report(test_dff.Label,ppp))

In [None]:
# Attach the predictions and write every column to disk, header included and
# without the row index.
test_dff['predictions'] = ppp
test_dff.to_csv('misclassify.csv', index=False)

In [None]:
# (rows, columns) of the held-out frame, now including the predictions column.
test_dff.shape

In [None]:
# Held-out rows whose prediction disagrees with the gold label
# (rebinds `mdf`, which previously held the training misclassifications).
mdf=test_dff[test_dff['Label'] != test_dff['predictions']]


In [None]:
# Number of misclassified held-out rows (first element of the shape).
mdf.shape

In [None]:
# Show full cell text instead of truncated columns.
pd.set_option('display.max_colwidth', None)


In [None]:
# Inspect the first ten misclassified held-out pairs.
mdf.head(10)

In [None]:
# Reload the prediction dumps through the same relative paths they were
# written to, instead of assuming the working directory is /content.
dt = pd.read_csv("misclassifyTrain.csv")
dt1 = pd.read_csv("misclassify.csv")

In [None]:
# Classification report for the reloaded training-set predictions.
print(classification_report(dt.Label,dt.predictions))

In [None]:
# Classification report for the reloaded held-out predictions.
print(classification_report(dt1.Label,dt1.predictions))

In [None]:
# Load prediction dumps with a "2" suffix.
# NOTE(review): no cell in this notebook writes misclassifyTrain2.csv or
# misclassify2.csv, so these reads fail on a fresh run unless the files were
# produced elsewhere — confirm where they come from.
dtx= pd.read_csv("misclassifyTrain2.csv")
dtx1= pd.read_csv("misclassify2.csv")

In [None]:
# Classification report for the rows loaded from misclassifyTrain2.csv.
print(classification_report(dtx.Label,dtx.predictions))

In [None]:
# Classification report for the rows loaded from misclassify2.csv.
print(classification_report(dtx1.Label,dtx1.predictions))

In [None]:
# (rows, columns) of the reloaded training-set predictions.
dt.shape

In [None]:
# (rows, columns) of the reloaded held-out predictions.
dt1.shape

In [None]:
# Rows where prediction and gold label differ (all errors, not only false
# positives), for the training and held-out dumps respectively.
fp = dt.loc[dt['Label'] != dt['predictions']]
fptest = dt1.loc[dt1['Label'] != dt1['predictions']]

In [None]:
# Number of misclassified training rows (first element of the shape).
fp.shape

In [None]:
# Number of misclassified held-out rows (first element of the shape).
fptest.shape

In [None]:
# First ten misclassified training rows.
fp.head(10)

In [None]:
# First ten misclassified held-out rows.
fptest.head(10)

In [None]:
# Count how many training rows share each distinct Text1 value.
dups_color = train_df.pivot_table(columns=['Text1'], aggfunc='size')

In [None]:
# Display the per-Text1 occurrence counts.
dups_color

In [None]:
# Occurrence count of every Text1 value, then the training rows whose Text1
# appears more than 100 times (displayed, not stored).
v = train_df.Text1.value_counts()
train_df[train_df.Text1.isin(v.index[v.gt(100)])]

In [None]:
# Preview the duplicate counts. The name `dups` is never defined in this
# notebook, so referencing it raises NameError; `dups_color` is the
# per-Text1 count object that does exist.
dups_color.head(100)

In [None]:
# Reload the saved prediction dumps.
# NOTE(review): this rebinds `train`, which earlier held the raw dataset read
# from GitHub; cells above that use `train` will see this frame instead if they
# are re-run afterwards.
import pandas as pd
test = pd.read_csv("misclassify.csv")
train = pd.read_csv("misclassifyTrain.csv")

In [None]:
# Rows where prediction and gold label differ, for the training and held-out
# dumps respectively.
fp = train.loc[train['Label'] != train['predictions']]
fptest = test.loc[test['Label'] != test['predictions']]

In [None]:
# Persist only the misclassified rows. No `index=False` is passed, so the row
# index is written as an extra first column.
fp.to_csv("misclassTrain.csv")
fptest.to_csv("misclassTest.csv")



In [None]:
# Show full cell text instead of truncated columns.
pd.set_option('display.max_colwidth', None)

# A cell only renders the value of its last expression, so the first frame is
# shown explicitly with IPython's `display`; a bare `fp.head(20)` here would be
# evaluated and discarded.
display(fp.head(20))
fptest.head(20)

sentence-transformers/bert-base-nli-mean-tokens:
  Train set: 0.95 M_F1
  Test set: 0.91