In [8]:
# import required libraries
import os
import pandas as pd
import numpy as np

import random
from scipy import stats
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import backend as K
#import tensorflow_addons as tfa

import transformers

import warnings
warnings.filterwarnings("ignore") 

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
AUTOTUNE = tf.data.experimental.AUTOTUNE
AUTOTUNE

-1

In [9]:
# read csv files in the us-patent-phrase-to-phrase-matching
# directory and store them in a list
path = 'us-patent-phrase-to-phrase-matching'
files = os.listdir(path)

In [10]:
# read the csv files
df_train = pd.read_csv(path + '/' + 'train.csv')
df_test = pd.read_csv(path + '/' + 'test.csv')
df_sample = pd.read_csv(path + '/' + 'sample_submission.csv')

## Get additional data

In [11]:
parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}
os.chdir(path)
for letter in 'ABCDEFGHY':
    file = f'cpc-section-{letter}_20220201.txt'
    with open(file) as f:
        for line in f:
            vals = line.strip().split('\t')
            if len(vals) == 2:
                parsed['code'].append(vals[0])
                parsed['title'].append(vals[1])
            elif len(vals) == 3:
                parsed['code'].append(vals[0])
                parsed['title'].append(vals[2])
for i in range(len(parsed['code'])):
    code = parsed['code'][i]
    main_group = code.split('/')[-1] if "/" in code else None
    group = code.split('/')[0][4:] if len(code) >= 5 else None
    subclass = code[3] if len(code) >= 4 else None
    class_ = code[1:3] if len(code) >= 3 else None
    section = code[0] if len(code) >= 1 else None
    
    parsed['main_group'].append(main_group)
    parsed['group'].append(group)
    parsed['subclass'].append(subclass)
    parsed['class'].append(class_)
    parsed['section'].append(section)

In [12]:
# merge both dataframes
df_codes = pd.DataFrame.from_dict(parsed)
codes = df_codes.rename(columns = {"code" : "context"})
train_data=pd.merge(df_train,codes[["context","title"]],on="context",how="left")
test_data=pd.merge(df_test,codes[["context","title"]],on="context",how="left")
train_data.head()

Unnamed: 0,id,anchor,target,context,score,title
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...


In [13]:
train_data.shape, test_data.shape

((36473, 6), (36, 5))

In [14]:
train_data.score.value_counts()

0.50    12300
0.25    11519
0.00     7471
0.75     4029
1.00     1154
Name: score, dtype: int64

In [15]:
#TODO - we will enable it after our intial training
# tf.config.optimizer.set_jit(True) 
class Config():
    seed = 42
    epochs = 10
    num_folds = 5
    max_length = 96 # 192
    batch_size = 16 #64
    learning_rate = 2e-5
    weight_decay = 0.01
    base_model = "anferico/bert-for-patents"
    
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)
    
#seed_everything(seed=42)
# strategy = tf.distribute.MirroredStrategy()

In [16]:
train_data['title'] = train_data['title'].str.lower()
train_data['anchor'] = train_data['anchor'].str.lower()
train_data['target'] = train_data['target'].str.lower()
# Tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(Config.base_model)
# Context tokens. 
train_data['context_token'] = '[' + train_data.context + ']'
train_data['sep_token'] = '[SEP]'
train_data['cls_token'] = '[CLS]'
context_tokens = list(train_data.context_token.unique())
tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

# Preparing input text for the model.
# We are adding context_token before the context title
# to let model learn the context of anchor and target.
train_data['text'] = train_data['cls_token'] + \
                    train_data['context_token'] + train_data['title'] + \
                    train_data['sep_token'] + train_data['anchor'] + \
                    train_data['sep_token'] + train_data['target'] + \
                train_data['sep_token']

In [17]:
def encode_text(text, 
                tokenizer,
                max_length):
    
    # With tokenizer's batch_encode_plus batch of both the sentences are
    # encoded together and separated by [SEP] token.
    encoded = tokenizer.batch_encode_plus(
        text,
        add_special_tokens=False,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )

    # Convert batch of encoded features to numpy array.
    input_ids = np.array(encoded["input_ids"], dtype="int32")
    attention_masks = np.array(encoded["attention_mask"], dtype="int32")
    token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

    return {
        "input_ids": input_ids,
        "attention_masks": attention_masks,
        "token_type_ids": token_type_ids
    }

In [18]:
class Pearsonr(tf.keras.callbacks.Callback):
    def __init__(self, val_data, y_val):
        self.val_data = val_data
        self.y_val = y_val
    def on_epoch_end(self, epoch, logs):
        val_preds = self.model.predict(self.val_data, verbose=0)
        
        val_pearsonr = stats.pearsonr(self.y_val, val_preds.ravel())[0]

        print(f"val_pearsonr: {val_pearsonr:.4f}\n")
        logs["val_pearsonr"] = val_pearsonr

In [19]:
def build_model(config, num_train_steps):
    # Create the model under a distribution strategy scope.
    #with strategy.scope():
        # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(config.max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(config.max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(config.max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    base_model = transformers.TFAutoModel.from_pretrained(config.base_model, from_pt=True)

    base_model_output = base_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )

    last_hidden_state = base_model_output.last_hidden_state
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
    dropout = tf.keras.layers.Dropout(0.3)(avg_pool)

    output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy()
    )

    return model

In [22]:
def train_folds(train, config):
    oof = np.zeros(len(train))
    
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    
    skf = StratifiedKFold(n_splits=config.num_folds, 
                      shuffle=True,
                      random_state=config.seed)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['score_map'])):
        print("*" * 50)
        print(f"Training fold: {fold+1}")

        train_df = train.loc[train_idx].reset_index(drop=True)
        val_df = train.loc[val_idx].reset_index(drop=True)
        
        # Clear keras session.
        K.clear_session()
        
        train_encoded =  encode_text(train_df["text"].tolist(),
                                     tokenizer=tokenizer,
                                     max_length=config.max_length)
        
        val_encoded =  encode_text(val_df["text"].tolist(),
                                     tokenizer=tokenizer,
                                     max_length=config.max_length)
        # Dataloader.
        train_data = tf.data.Dataset.from_tensor_slices((train_encoded, train_df['score'].tolist()))
        val_data = tf.data.Dataset.from_tensor_slices((val_encoded, val_df['score'].tolist()))

        train_data = (
                        train_data
                        .shuffle(1024)
                        .batch(config.batch_size)
                        .prefetch(AUTOTUNE)
                     )
        
        val_data = (
                        val_data
                        .batch(config.batch_size)
                        .prefetch(AUTOTUNE)
                    )

        # Callbacks.
        checkpoint = tf.keras.callbacks.ModelCheckpoint(f'model-{fold+1}.h5',
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        save_weights_only=True,
                                                        save_freq='epoch',
                                                        verbose=1)
        
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          mode='min',
                                                          patience=3,
                                                          verbose=1)
        
        pearsonr_callback = Pearsonr(val_data, val_df['score'].values)
        num_train_steps = int(len(train_df) / config.batch_size * config.epochs)
        
        # Build and Train model.
        model = build_model(config, num_train_steps)
        history = model.fit(
                        train_data,
                        validation_data=val_data,
                        epochs=config.epochs,
                        callbacks=[checkpoint, 
                                   early_stopping, 
                                   pearsonr_callback],
                        verbose=1
                    )
        
        print('\nLoading best model weights...')
        model.load_weights(f'model-{fold+1}.h5')
        
        print('Predicting OOF...')
        oof[val_idx] = model.predict(val_data,
                                     batch_size=config.batch_size,
                                     verbose=0).reshape(-1)
        
        
        score = stats.pearsonr(val_df['score'].values, oof[val_idx])[0]
        print(f'\nFold {fold + 1}: OOF pearson_r: {score:.4f}')        
        print("*" * 25)
        
    score = stats.pearsonr(train['score'].values, oof)[0]
    print(f'\nOverall OOF pearson_r: {score:.4f}')
    return oof

In [23]:
config = Config()
oof_preds = train_folds(train_data, config)

**************************************************
Training fold: 1


Downloading:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'bert.embeddings.position_ids', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

Epoch 1/10


InvalidArgumentError:  indices[5,1] = 39890 is not in [0, 39859)
	 [[node functional_1/tf_bert_model/bert/embeddings/Gather (defined at \anaconda3\envs\kaggle-nlp\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:190) ]] [Op:__inference_train_function_55905]

Errors may have originated from an input operation.
Input Source operations connected to node functional_1/tf_bert_model/bert/embeddings/Gather:
 IteratorGetNext (defined at \AppData\Local\Temp\ipykernel_16692\3990938244.py:70)

Function call stack:
train_function
