In [16]:
!pip install sentence_transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
import pandas as pd
import numpy as np

In [18]:
# Names of the cross-encoder checkpoints that form the ensemble; the order
# here is the order in which the models are loaded and reported.
model_names = [
    'deberta-v3-large', 'cross-encoder-stsb-deberta-v3-large', 'stsb-roberta-large',
]

In [31]:
from sentence_transformers import CrossEncoder

# Every checkpoint lives under <model_directory_path>/<model_name>/model.
model_directory_path = '/kaggle/input/models'

# Load one single-output cross-encoder per entry of `model_names`, keeping the
# same order so that models[i] corresponds to model_names[i].
models = [
    CrossEncoder(f'{model_directory_path}/{model_name}/model', num_labels=1)
    for model_name in model_names
]
print(f"Loaded {len(models)} models")

Loaded 3 models


In [21]:
# Preprocessing: tokenize and remove punctuation
from string import punctuation  
from nltk.tokenize import word_tokenize

def preprocess(sent):
    """
    Tokenize a sentence with NLTK's ``word_tokenize`` and drop punctuation
    tokens.

    :param sent: The sentence to clean
    :return: The remaining tokens joined by single spaces
    """
    kept_tokens = []
    for token in word_tokenize(sent):
        # `punctuation` is a string, so this is a substring test: a token is
        # dropped only when it appears verbatim inside `string.punctuation`.
        if token in punctuation:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

In [53]:
from numpy.typing import ArrayLike


def predict(sentences1: ArrayLike, sentences2: ArrayLike):
    """
    Score sentence pairs with every model in the ensemble and combine the
    per-model labels by majority vote.

    Reads the notebook-level ``models`` and ``model_names`` lists; entry ``i``
    of ``model_names`` is used as the key for the output of ``models[i]``.

    :param sentences1: A list of the first sentences
    :param sentences2: A list of the second sentences
    :return: A dict with two keys. ``'model_results'`` maps each model name to
        ``{'score': ..., 'prediction': ...}`` (that model's scores and those
        scores rounded to integer labels); ``'majority_vote'`` holds the most
        frequent label per pair across the models. For empty input every
        array in the result is empty.
    """
    sentences_pairs = [[sent1, sent2]
                       for sent1, sent2 in zip(sentences1, sentences2)]

    if not sentences_pairs:
        # Nothing to score: skip the models and return empty arrays. The vote
        # below cannot run over zero pairs (np.apply_along_axis raises
        # ValueError when the iteration dimension is empty).
        return {
            'model_results': {
                model_name: {
                    'score': np.empty(0, dtype=np.float32),
                    'prediction': np.empty(0, dtype=int),
                } for model_name in model_names
            },
            'majority_vote': np.empty(0, dtype=int),
        }

    # One row of scores per model, one column per sentence pair.
    similarity_scores = np.array(
        [model.predict(sentences_pairs) for model in models])

    # Round every score to the nearest integer label.
    predictions = np.round(similarity_scores).astype(int)

    # Vote across the models (axis 0) for each pair. np.bincount needs
    # non-negative labels; on a tie np.argmax returns the smallest label.
    majority_vote = np.apply_along_axis(
        lambda votes: np.argmax(np.bincount(votes)), axis=0, arr=predictions)

    return {
        'model_results': {
            model_name: {
                'score': similarity_scores[i], 'prediction': predictions[i]
            } for i, model_name in enumerate(model_names)
        },
        'majority_vote': majority_vote,
    }

In [55]:
test_df = pd.read_csv('/kaggle/input/paraphrase/test.csv')

# Clean both sides of every pair the same way before scoring.
test_sent_1, test_sent_2 = (
    np.array(test_df[column].apply(preprocess))
    for column in ('text_1', 'text_2')
)
test_labels = np.array(test_df['label'])

# Run the whole ensemble over the cleaned test pairs.
test_predictions = predict(test_sent_1, test_sent_2)

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

In [56]:
# Display the raw ensemble output: per-model scores and labels plus the
# majority vote.
test_predictions

{'model_results': {'deberta-v3-large': {'score': array([0.9997069 , 0.9995962 , 0.9997476 , ..., 0.00108177, 0.9997271 ,
          0.99963415], dtype=float32),
   'prediction': array([1, 1, 1, ..., 0, 1, 1])},
  'cross-encoder-stsb-deberta-v3-large': {'score': array([9.9962234e-01, 9.9958736e-01, 9.9968755e-01, ..., 8.7468809e-04,
          9.9968338e-01, 9.9961758e-01], dtype=float32),
   'prediction': array([1, 1, 1, ..., 0, 1, 1])},
  'stsb-roberta-large': {'score': array([0.99967515, 0.9995394 , 0.99969196, ..., 0.00193954, 0.9996835 ,
          0.00567359], dtype=float32),
   'prediction': array([1, 1, 1, ..., 0, 1, 0])}},
 'majority_vote': array([1, 1, 1, ..., 0, 1, 1])}

In [51]:
from sklearn.metrics import classification_report

In [60]:
# Report precision/recall/F1 for every individual model against the labels.
for model_name, model_result in test_predictions['model_results'].items():
    print(model_name)
    print(classification_report(
        test_df['label'].to_list(),
        model_result['prediction'],
        digits=4,
    ))

deberta-v3-large
              precision    recall  f1-score   support

           0     0.8809    0.8062    0.8419       578
           1     0.9064    0.9451    0.9253      1147

    accuracy                         0.8986      1725
   macro avg     0.8936    0.8757    0.8836      1725
weighted avg     0.8978    0.8986    0.8974      1725

cross-encoder-stsb-deberta-v3-large
              precision    recall  f1-score   support

           0     0.8696    0.8304    0.8496       578
           1     0.9165    0.9372    0.9267      1147

    accuracy                         0.9014      1725
   macro avg     0.8930    0.8838    0.8881      1725
weighted avg     0.9007    0.9014    0.9009      1725

stsb-roberta-large
              precision    recall  f1-score   support

           0     0.8405    0.8478    0.8441       578
           1     0.9229    0.9189    0.9209      1147

    accuracy                         0.8951      1725
   macro avg     0.8817    0.8833    0.8825      1725
we

In [63]:
# Attach each model's label and score as new columns, then the ensemble vote.
for model_name, model_result in test_predictions['model_results'].items():
    test_df[f'{model_name}_pred'] = model_result['prediction']
    test_df[f'{model_name}_score'] = model_result['score']
test_df['final_pred'] = test_predictions['majority_vote']
test_df

Unnamed: 0,label,id_1,id_2,text_1,text_2,deberta-v3-large_pred,deberta-v3-large_score,cross-encoder-stsb-deberta-v3-large_pred,cross-encoder-stsb-deberta-v3-large_score,stsb-roberta-large_pred,stsb-roberta-large_score,final_pred
0,1,1089874,1089925,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...,1,0.999707,1,0.999622,1,0.999675,1
1,1,3019446,3019327,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...,1,0.999596,1,0.999587,1,0.999539,1
2,1,1945605,1945824,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...,1,0.999748,1,0.999688,1,0.999692,1
3,0,1430402,1430329,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...,0,0.000936,1,0.982065,0,0.014677,0
4,0,3354381,3354396,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...,0,0.001443,0,0.000571,0,0.002013,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1720,0,2685984,2686122,"After Hughes refused to rehire Hernandez, he c...",Hernandez filed an Equal Employment Opportunit...,0,0.000725,0,0.000591,0,0.001903,0
1721,0,339215,339172,There are 103 Democrats in the Assembly and 47...,Democrats dominate the Assembly while Republic...,0,0.000829,0,0.000671,0,0.001786,0
1722,0,2996850,2996734,Bethany Hamilton remained in stable condition ...,"Bethany, who remained in stable condition afte...",0,0.001082,0,0.000875,0,0.001940,0
1723,1,2095781,2095812,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...",1,0.999727,1,0.999683,1,0.999683,1


In [64]:
# Save the test set with the per-model prediction/score columns and the
# ensemble's final prediction; the DataFrame index is not written.
test_df.to_csv('ensemble_infer.csv', index=False)

In [65]:
# Evaluate the ensemble's majority vote against the labels.
print(classification_report(
    y_true=test_df['label'].to_list(),
    y_pred=test_df['final_pred'].to_list(),
    digits=4,
))

              precision    recall  f1-score   support

           0     0.8917    0.8408    0.8655       578
           1     0.9220    0.9486    0.9351      1147

    accuracy                         0.9125      1725
   macro avg     0.9069    0.8947    0.9003      1725
weighted avg     0.9119    0.9125    0.9118      1725



1	1756397	1756332	Evidence suggests two of the victims were taken by surprise, while the other two might have tried to flee or to defend themselves or the others, police said.	Evidence suggests two victims were taken by surprise, while the others may have tried to flee or perhaps defend themselves or their friends, police said.


In [74]:
# get prediction for a single pair
text1 = 'Evidence suggests two of the victims were taken by surprise, while the other two might have tried to flee or to defend themselves or the others, police said.'
text2 = 'Evidence suggests two victims were taken by surprise, while the others may have tried to flee or perhaps defend themselves or their friends, police said.'

# Clean the texts with the same preprocessing applied to the test set, so the
# models see input in the form they were evaluated on.
predict([preprocess(text1)], [preprocess(text2)])['majority_vote']

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([1])