<a href="https://colab.research.google.com/github/mohameddhameem/kaggle-us-patent-phrase-to-phrase-matching/blob/master/colab-training-v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 4.2 MB 25.7 MB/s 
[K     |████████████████████████████████| 596 kB 52.5 MB/s 
[K     |████████████████████████████████| 84 kB 3.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 32.1 MB/s 
[?25h

In [3]:
import pandas as pd
import numpy as np
import math
import os
import random
from scipy import stats
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import backend as K
#import tensorflow_addons as tfa

import transformers

import warnings
warnings.filterwarnings("ignore") 

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

### Setup TPU or GPU for training

In [16]:
print("we are using Tensorflow version ", tf.__version__)
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("we are running on CPU. switch to GPU for full training")

try:
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  print("All devices: ", tf.config.list_logical_devices('TPU'))
  strategy = tf.distribute.experimental.TPUStrategy(resolver)
except ValueError:
    print("No TPU detected. Running on CPU")
    strategy = tf.distribute.get_strategy() 

print('Strategy:', strategy.scope())

we are using Tensorflow version  2.8.0
we are running on CPU. switch to GPU for full training
No TPU detected. Running on CPU
Strategy: <tensorflow.python.distribute.distribute_lib._DefaultDistributionContext object at 0x7fb7c223ff00>


In [6]:
# read csv files in the us-patent-phrase-to-phrase-matching
# directory and store them in a list
path = 'us-patent-phrase-to-phrase-matching'
files = os.listdir(path)
# read the csv files
df_train = pd.read_csv(path + '/' + 'train.csv')
df_test = pd.read_csv(path + '/' + 'test.csv')
df_sample = pd.read_csv(path + '/' + 'sample_submission.csv')
parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}
os.chdir(path)
for letter in 'ABCDEFGHY':
    file = f'cpc-section-{letter}_20220201.txt'
    with open(file) as f:
        for line in f:
            vals = line.strip().split('\t')
            if len(vals) == 2:
                parsed['code'].append(vals[0])
                parsed['title'].append(vals[1])
            elif len(vals) == 3:
                parsed['code'].append(vals[0])
                parsed['title'].append(vals[2])
for i in range(len(parsed['code'])):
    code = parsed['code'][i]
    main_group = code.split('/')[-1] if "/" in code else None
    group = code.split('/')[0][4:] if len(code) >= 5 else None
    subclass = code[3] if len(code) >= 4 else None
    class_ = code[1:3] if len(code) >= 3 else None
    section = code[0] if len(code) >= 1 else None
    
    parsed['main_group'].append(main_group)
    parsed['group'].append(group)
    parsed['subclass'].append(subclass)
    parsed['class'].append(class_)
    parsed['section'].append(section)


# merge both dataframes
df_codes = pd.DataFrame.from_dict(parsed)
codes = df_codes.rename(columns = {"code" : "context"})
train_data=pd.merge(df_train,codes[["context","title"]],on="context",how="left")
test_data=pd.merge(df_test,codes[["context","title"]],on="context",how="left")

In [7]:
## Our ownn dataset and model file. Lets declare it
TRAINED_MODEL_PATH = '/content/USPPM-Trained-Model.h5'
# BERT_FOR_PATENT = '/kaggle/input/usppm-pretrained-data/kaggle-dataset/bert-for-patents/' # only for inferencing we need this

In [22]:
#TODO - we will enable it after our intial training
tf.config.optimizer.set_jit(True) 
class Config():
    seed = 42
    epochs = 2 # Original 10
    num_folds = 2 # Original 5
    max_length = 192 # 192 #96 - working - old 192
    batch_size = 64 #64 # 16 working. old 64
    learning_rate = 2e-5
    weight_decay = 0.01
    base_model = "anferico/bert-for-patents"
    
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

### Build dataset

In [9]:
def dataset_split(dataset, split_val):
    lengths = int(len(dataset) * split_val)
    train_data = dataset[:lengths]
    valid_data = dataset[lengths:]
    return train_data, valid_data


def dataset_load(train_data, test_data):
    train_data['sep_token'] = '[SEP]'
    train_data['cls_token'] = '[CLS]'
    train_data['context_token'] = '[' + train_data.context + ']'
    context_tokens = list(train_data.context_token.unique())
    train_data = train_data.sample(frac=1).reset_index(drop=True)
    train_data, valid_data = dataset_split(dataset=train_data, split_val=0.9)
    test_data['sep_token'] = '[SEP]'
    test_data['cls_token'] = '[CLS]'
    test_data['context_token'] = '[' + test_data.context + ']'
    return train_data, valid_data, test_data, context_tokens

# create a learning rate scheduler

def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):

    def lr_scheduler(epoch):

        if epoch < warmup_epoch_count:
            res = (max_learn_rate/warmup_epoch_count) * (epoch + 1)
        else:
            res = max_learn_rate*math.exp(math.log(end_learn_rate/max_learn_rate)*(
                epoch-warmup_epoch_count+1)/(total_epoch_count-warmup_epoch_count+1))
        return float(res)
    learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(
        lr_scheduler, verbose=1)

    return learning_rate_scheduler


def encode_text(text,
                tokenizer,
                max_length):

    # With tokenizer's batch_encode_plus batch of both the sentences are
    # encoded together and separated by [SEP] token.
    encoded = tokenizer.batch_encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )

    # Convert batch of encoded features to numpy array.
    input_ids = np.array(encoded["input_ids"], dtype="int32")
    attention_masks = np.array(encoded["attention_mask"], dtype="int32")
    token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

    return {
        "input_ids": input_ids,
        "attention_masks": attention_masks,
        "token_type_ids": token_type_ids
    }

In [10]:
train_data, valid_data, test_data, context_tokens = dataset_load(train_data, test_data)
labels = list(set(train_data["score"].values))
labels.sort()

print(len(train_data), len(valid_data), len(test_data))
print(labels)
print(context_tokens)

32825 3648 36
[0.0, 0.25, 0.5, 0.75, 1.0]
['[A47]', '[A61]', '[A62]', '[C01]', '[F16]', '[F24]', '[F28]', '[H01]', '[H04]', '[B23]', '[B41]', '[D03]', '[E03]', '[C08]', '[D01]', '[D21]', '[C07]', '[A45]', '[B01]', '[B08]', '[G04]', '[G06]', '[B65]', '[G16]', '[G01]', '[A41]', '[C23]', '[F23]', '[B25]', '[A63]', '[B28]', '[B63]', '[F04]', '[B60]', '[B32]', '[C09]', '[C02]', '[G03]', '[C10]', '[B61]', '[C21]', '[F42]', '[A23]', '[C11]', '[B29]', '[F02]', '[B62]', '[B64]', '[E21]', '[B24]', '[B22]', '[H05]', '[B27]', '[E04]', '[B21]', '[D06]', '[C04]', '[B05]', '[G02]', '[H03]', '[C06]', '[G11]', '[C12]', '[E02]', '[F15]', '[A46]', '[B66]', '[G07]', '[G08]', '[C22]', '[B44]', '[A01]', '[F03]', '[C25]', '[F22]', '[G05]', '[G21]', '[B07]', '[F41]', '[E01]', '[H02]', '[C13]', '[F01]', '[F27]', '[C14]', '[A44]', '[B67]', '[A24]', '[B02]', '[E05]', '[D05]', '[F25]', '[A43]', '[A22]', '[A21]', '[E06]', '[F21]', '[G10]', '[C03]', '[B81]', '[F17]', '[B03]', '[G09]', '[D04]', '[F26]', '[B31]']


In [11]:
train_data['title'] = train_data['title'].str.lower()
train_data['anchor'] = train_data['anchor'].str.lower()
train_data['target'] = train_data['target'].str.lower()
# Tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(Config.base_model)
# Context tokens. 
train_data['context_token'] = '[' + train_data.context + ']'
train_data['sep_token'] = '[SEP]'
train_data['cls_token'] = '[CLS]'
context_tokens = list(train_data.context_token.unique())
tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

# Preparing input text for the model.
# We are adding context_token before the context title
# to let model learn the context of anchor and target.
train_data['text'] = train_data['cls_token'] + \
                    train_data['context_token'] + train_data['title'] + \
                    train_data['sep_token'] + train_data['anchor'] + \
                    train_data['sep_token'] + train_data['target'] + \
                train_data['sep_token']

test_data['title'] = test_data['title'].str.lower().str.replace(";","")
test_data['anchor'] = test_data['anchor'].str.lower()
test_data['target'] = test_data['target'].str.lower()

test_data['text'] = test_data['title'] + " " + test_data['anchor']

Downloading:   0%|          | 0.00/327 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/322k [00:00<?, ?B/s]

In [12]:
train_data.head()

Unnamed: 0,id,anchor,target,context,score,title,sep_token,cls_token,context_token,text
0,fe7f1e20598155a2,output center,output centres,D05,1.0,sewing; embroidering; tufting,[SEP],[CLS],[D05],[CLS][D05]sewing; embroidering; tufting[SEP]ou...
1,06a28c36febaa599,pick element,pick up element,A63,0.5,sports; games; amusements,[SEP],[CLS],[A63],[CLS][A63]sports; games; amusements[SEP]pick e...
2,ad7e0d56897281c6,slot open,slot opening,A61,1.0,medical or veterinary science; hygiene,[SEP],[CLS],[A61],[CLS][A61]medical or veterinary science; hygie...
3,efe5b896c27026d9,triethylammonium salt,triethylammonium,C12,0.25,biochemistry; beer; spirits; wine; vinegar; mi...,[SEP],[CLS],[C12],[CLS][C12]biochemistry; beer; spirits; wine; v...
4,175fa16cf457e871,show in chemical formula,within the chemical formula,C09,0.5,dyes; paints; polishes; natural resins; adhesi...,[SEP],[CLS],[C09],[CLS][C09]dyes; paints; polishes; natural resi...


In [13]:
test_data.head()

Unnamed: 0,id,anchor,target,context,title,sep_token,cls_token,context_token,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,optics,[SEP],[CLS],[G02],optics opc drum
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,combustion apparatus combustion processes,[SEP],[CLS],[F23],combustion apparatus combustion processes adju...
2,36baf228038e314b,lower trunnion,lower locating,B60,vehicles in general,[SEP],[CLS],[B60],vehicles in general lower trunnion
3,1f37ead645e7f0c8,cap component,upper portion,D06,treatment of textiles or the like laundering f...,[SEP],[CLS],[D06],treatment of textiles or the like laundering f...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,electric communication technique,[SEP],[CLS],[H04],electric communication technique neural stimul...


In [14]:
encoded_test_data = encode_text(test_data[["text", "target"]].values.tolist(), tokenizer, Config.max_length)

In [15]:
print(encoded_test_data["input_ids"][0])
print(encoded_test_data["attention_masks"][0])
print(encoded_test_data["token_type_ids"][0])

[    2 20691  6393  1943  6608     3 27921  5967  8328  8231 16426  6608
     3     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [None]:
test_x = [encoded_test_data["input_ids"], encoded_test_data["attention_masks"], encoded_test_data["token_type_ids"]]
print("test x shape : ", test_x[0].shape, test_x[1].shape, test_x[2].shape)

test x shape :  (36, 96) (36, 96) (36, 96)


In [17]:
def build_model(config):
    # Create the model under a distribution strategy scope.
    with strategy.scope():
        # Encoded token ids from BERT tokenizer.
        input_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="input_ids"
        )
        # Attention masks indicates to the model which tokens should be attended to.
        attention_masks = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="attention_masks"
        )
        # Token type ids are binary masks identifying different sequences in the model.
        token_type_ids = tf.keras.layers.Input(
            shape=(config.max_length,), dtype=tf.int32, name="token_type_ids"
        )
        # Loading pretrained BERT model.
        base_model = transformers.TFAutoModel.from_pretrained(config.base_model, from_pt=True)

        base_model_output = base_model(
            input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
        )

        last_hidden_state = base_model_output.last_hidden_state
        avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
        dropout = tf.keras.layers.Dropout(0.3)(avg_pool)

        output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks, token_type_ids], outputs=output
        )

        model.compile(
            optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate),
            loss=tf.keras.losses.BinaryCrossentropy()
        )

    return model

### Metrics for Competition

In [18]:
class Pearsonr(tf.keras.callbacks.Callback):
    def __init__(self, val_data, y_val):
        self.val_data = val_data
        self.y_val = y_val
    def on_epoch_end(self, epoch, logs):
        val_preds = self.model.predict(self.val_data, verbose=0)
        
        val_pearsonr = stats.pearsonr(self.y_val, val_preds.ravel())[0]

        print(f"val_pearsonr: {val_pearsonr:.4f}\n")
        logs["val_pearsonr"] = val_pearsonr

### Model K Fold Training
Use previously trained weight first

In [32]:


def train_folds(train, config):
    oof = np.zeros(len(train))
    pretrained_model_loaded = False
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    
    skf = StratifiedKFold(n_splits=config.num_folds, 
                      shuffle=True,
                      random_state=config.seed)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['score_map'])):
        print("*" * 50)
        print(f"Training fold: {fold+1}")

        train_df = train.loc[train_idx].reset_index(drop=True)
        # print(train_df.head(2))
        val_df = train.loc[val_idx].reset_index(drop=True)
        
        # Clear keras session.
        K.clear_session()
        
        train_encoded =  encode_text(train_df["text"].tolist(),
                                     tokenizer=tokenizer,
                                     max_length=config.max_length)
        
        val_encoded =  encode_text(val_df["text"].tolist(),
                                     tokenizer=tokenizer,
                                     max_length=config.max_length)
        # Dataloader.
        train_data = tf.data.Dataset.from_tensor_slices((train_encoded, train_df['score'].tolist()))
        val_data = tf.data.Dataset.from_tensor_slices((val_encoded, val_df['score'].tolist()))

        train_data = (
                        train_data
                        .shuffle(1024)
                        .batch(config.batch_size)
                        .prefetch(tf.data.AUTOTUNE)
                     )
        
        val_data = (
                        val_data
                        .batch(config.batch_size)
                        .prefetch(tf.data.AUTOTUNE)
                    )

        # Callbacks.
        checkpoint = tf.keras.callbacks.ModelCheckpoint(f'model-{fold+1}.h5',
                                                        monitor='val_loss',
                                                        mode='min',
                                                        save_best_only=True,
                                                        save_weights_only=True,
                                                        save_freq='epoch',
                                                        verbose=1)
        
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          mode='min',
                                                          patience=3,
                                                          verbose=1)
        
        pearsonr_callback = Pearsonr(val_data, val_df['score'].values)
        num_train_steps = int(len(train_df) / config.batch_size * config.epochs)
        
        # Build and Train model.
        model = build_model(config) #, num_train_steps
        # for the first time in Colab load the pretrained model
        if pretrained_model_loaded == False:
          model.load_weights(TRAINED_MODEL_PATH)
          pretrained_model_loaded = True
          print("Our own pretrained model loaded......")
        history = model.fit(
                        train_data,
                        validation_data=val_data,
                        epochs=config.epochs,
                        callbacks=[checkpoint, 
                                   early_stopping, 
                                   pearsonr_callback],
                        verbose=1
                    )
        
        print('\nLoading best model weights...')
        model.load_weights(f'model-{fold+1}.h5')
        
        print('Predicting OOF...')
        oof[val_idx] = model.predict(val_data,
                                     batch_size=config.batch_size,
                                     verbose=0).reshape(-1)
        
        
        score = stats.pearsonr(val_df['score'].values, oof[val_idx])[0]
        print(f'\nFold {fold + 1}: OOF pearson_r: {score:.4f}')        
        print("*" * 25)
        
    score = stats.pearsonr(train['score'].values, oof)[0]
    print(f'\nOverall OOF pearson_r: {score:.4f}')
    return oof

In [33]:
config = Config()
oof_preds = train_folds(train_data, config)

**************************************************
Training fold: 1


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

OSError: ignored

In [None]:
# load model
model.load_weights('model-2.h5')

In [None]:
# Lets predict the test data.
predictions = model.predict(test_x)

In [None]:
# Lets read sample submission file.
submission = pd.read_csv('sample_submission.csv')
submission['score'] = predictions
submission['score'] = submission.score.apply(lambda x: 0 if x < 0 else x)
submission['score'] = submission.score.apply(lambda x: 1 if x > 1 else x)

In [None]:
submission.to_csv("submission.csv",index=False)

Unnamed: 0,id,score
0,4112d61851461f60,0.363815
1,09e418c93a776564,0.363816
2,36baf228038e314b,0.363814
3,1f37ead645e7f0c8,0.363814
4,71a5b6ad068d531f,0.363816
5,474c874d0c07bd21,0.363815
6,442c114ed5c4e3c9,0.363815
7,b8ae62ea5e1d8bdb,0.363815
8,faaddaf8fcba8a3f,0.363815
9,ae0262c02566d2ce,0.363815
