In [None]:
!pip install -q transformers

In [1]:
import pandas as pd
import numpy as np
import math
import os
import random
from scipy import stats
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import backend as K
#import tensorflow_addons as tfa

import transformers

import warnings
warnings.filterwarnings("ignore") 

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

### Setup TPU or GPU for training

In [2]:
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("we are running on CPU. switch to GPU for full training")

try:
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  print("All devices: ", tf.config.list_logical_devices('TPU'))
  strategy = tf.distribute.experimental.TPUStrategy(resolver)
except ValueError:
    print("No TPU detected. Running on CPU")
    strategy = tf.distribute.get_strategy() 

print('Strategy:', strategy)

we are running on CPU. switch to GPU for full training
No TPU detected. Running on CPU
Strategy: <tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x0000017A90CFA6C8>


In [3]:
# read csv files in the us-patent-phrase-to-phrase-matching
# directory and store them in a list
path = 'us-patent-phrase-to-phrase-matching'
files = os.listdir(path)
# read the csv files
df_train = pd.read_csv(path + '/' + 'train.csv')
df_test = pd.read_csv(path + '/' + 'test.csv')
df_sample = pd.read_csv(path + '/' + 'sample_submission.csv')
parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}
os.chdir(path)
for letter in 'ABCDEFGHY':
    file = f'cpc-section-{letter}_20220201.txt'
    with open(file) as f:
        for line in f:
            vals = line.strip().split('\t')
            if len(vals) == 2:
                parsed['code'].append(vals[0])
                parsed['title'].append(vals[1])
            elif len(vals) == 3:
                parsed['code'].append(vals[0])
                parsed['title'].append(vals[2])
for i in range(len(parsed['code'])):
    code = parsed['code'][i]
    main_group = code.split('/')[-1] if "/" in code else None
    group = code.split('/')[0][4:] if len(code) >= 5 else None
    subclass = code[3] if len(code) >= 4 else None
    class_ = code[1:3] if len(code) >= 3 else None
    section = code[0] if len(code) >= 1 else None
    
    parsed['main_group'].append(main_group)
    parsed['group'].append(group)
    parsed['subclass'].append(subclass)
    parsed['class'].append(class_)
    parsed['section'].append(section)


# merge both dataframes
df_codes = pd.DataFrame.from_dict(parsed)
codes = df_codes.rename(columns = {"code" : "context"})
train_data=pd.merge(df_train,codes[["context","title"]],on="context",how="left")
test_data=pd.merge(df_test,codes[["context","title"]],on="context",how="left")

In [10]:
#TODO - we will enable it after our intial training
tf.config.optimizer.set_jit(True) 
class Config():
    seed = 42
    epochs = 10
    num_folds = 5
    max_length = 96 # 192
    batch_size = 16 #64
    learning_rate = 2e-5
    weight_decay = 0.01
    base_model = "anferico/bert-for-patents"
    
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

### Build dataset

In [5]:
def dataset_split(dataset, split_val):
    lengths = int(len(dataset) * split_val)
    train_data = dataset[:lengths]
    valid_data = dataset[lengths:]
    return train_data, valid_data


def dataset_load(train_data, test_data):
    train_data['sep_token'] = '[SEP]'
    train_data['cls_token'] = '[CLS]'
    train_data['context_token'] = '[' + train_data.context + ']'
    context_tokens = list(train_data.context_token.unique())
    train_data = train_data.sample(frac=1).reset_index(drop=True)
    train_data, valid_data = dataset_split(dataset=train_data, split_val=0.9)
    test_data['sep_token'] = '[SEP]'
    test_data['cls_token'] = '[CLS]'
    test_data['context_token'] = '[' + test_data.context + ']'
    return train_data, valid_data, test_data, context_tokens

# create a learning rate scheduler

def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):

    def lr_scheduler(epoch):

        if epoch < warmup_epoch_count:
            res = (max_learn_rate/warmup_epoch_count) * (epoch + 1)
        else:
            res = max_learn_rate*math.exp(math.log(end_learn_rate/max_learn_rate)*(
                epoch-warmup_epoch_count+1)/(total_epoch_count-warmup_epoch_count+1))
        return float(res)
    learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(
        lr_scheduler, verbose=1)

    return learning_rate_scheduler


def encode_text(text,
                tokenizer,
                max_length):

    # With tokenizer's batch_encode_plus batch of both the sentences are
    # encoded together and separated by [SEP] token.
    encoded = tokenizer.batch_encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )

    # Convert batch of encoded features to numpy array.
    input_ids = np.array(encoded["input_ids"], dtype="int32")
    attention_masks = np.array(encoded["attention_mask"], dtype="int32")
    token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

    return {
        "input_ids": input_ids,
        "attention_masks": attention_masks,
        "token_type_ids": token_type_ids
    }

In [8]:
train_data, valid_data, test_data, context_tokens = dataset_load(train_data, test_data)
labels = list(set(train_data["score"].values))
labels.sort()

print(len(train_data), len(valid_data), len(test_data))
print(labels)
print(context_tokens)

29542 3283 36
[0.0, 0.25, 0.5, 0.75, 1.0]
['[C09]', '[B63]', '[A47]', '[C03]', '[F04]', '[B23]', '[G01]', '[B60]', '[B05]', '[C07]', '[B41]', '[C04]', '[D04]', '[B81]', '[H04]', '[H01]', '[C11]', '[C25]', '[G02]', '[C10]', '[H02]', '[B22]', '[G06]', '[B62]', '[H05]', '[A46]', '[A01]', '[F01]', '[F02]', '[C13]', '[G07]', '[E21]', '[B29]', '[B01]', '[F41]', '[B21]', '[A21]', '[D05]', '[C02]', '[B44]', '[A61]', '[C12]', '[G03]', '[D03]', '[C06]', '[F16]', '[F24]', '[F15]', '[G21]', '[E02]', '[B61]', '[E01]', '[C22]', '[F25]', '[H03]', '[D06]', '[D01]', '[C08]', '[A63]', '[B03]', '[A41]', '[B65]', '[B02]', '[E06]', '[B66]', '[G10]', '[A24]', '[F23]', '[G04]', '[D21]', '[F42]', '[E04]', '[C01]', '[B08]', '[E03]', '[B27]', '[A23]', '[G11]', '[A44]', '[A22]', '[A43]', '[F21]', '[F17]', '[B25]', '[G08]', '[B28]', '[B32]', '[C23]', '[B07]', '[F28]', '[B24]', '[F27]', '[C14]', '[B31]', '[E05]', '[F22]', '[F03]', '[B67]', '[B64]', '[G09]', '[G16]', '[G05]', '[A45]', '[C21]', '[A62]', '[F26]']


In [13]:
train_data['title'] = train_data['title'].str.lower()
train_data['anchor'] = train_data['anchor'].str.lower()
train_data['target'] = train_data['target'].str.lower()
# Tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(Config.base_model)
# Context tokens. 
train_data['context_token'] = '[' + train_data.context + ']'
train_data['sep_token'] = '[SEP]'
train_data['cls_token'] = '[CLS]'
context_tokens = list(train_data.context_token.unique())
tokenizer.add_special_tokens({'additional_special_tokens': context_tokens})

# Preparing input text for the model.
# We are adding context_token before the context title
# to let model learn the context of anchor and target.
train_data['text'] = train_data['cls_token'] + \
                    train_data['context_token'] + train_data['title'] + \
                    train_data['sep_token'] + train_data['anchor'] + \
                    train_data['sep_token'] + train_data['target'] + \
                train_data['sep_token']

test_data['title'] = test_data['title'].str.lower().str.replace(";","")
test_data['anchor'] = test_data['anchor'].str.lower()
test_data['target'] = test_data['target'].str.lower()

test_data['text'] = test_data['title'] + " " + test_data['anchor']

In [14]:
train_data.head()

Unnamed: 0,id,anchor,target,context,score,title,sep_token,cls_token,context_token,text
0,9e2e6e9aa50cd8a1,define by memory,store by memory,G06,0.5,computing; calculating; counting,[SEP],[CLS],[G06],[CLS][G06]computing; calculating; counting[SEP...
1,1434a6f3ecb7d5b3,linear systems,rotating tool,B23,0.25,machine tools; metal-working not otherwise pro...,[SEP],[CLS],[B23],[CLS][B23]machine tools; metal-working not oth...
2,b709cd34cd3e5c43,metatarsal bones,forefoot bones,A61,0.75,medical or veterinary science; hygiene,[SEP],[CLS],[A61],[CLS][A61]medical or veterinary science; hygie...
3,e99070faf001ce93,generated electrical power,alternating signal,H02,0.5,generation; conversion or distribution of elec...,[SEP],[CLS],[H02],[CLS][H02]generation; conversion or distributi...
4,9f0b4938c61b5b89,high gradient magnetic separators,magnetic separator,C02,0.5,"treatment of water, waste water, sewage, or sl...",[SEP],[CLS],[C02],"[CLS][C02]treatment of water, waste water, sew..."


In [15]:
test_data.head()

Unnamed: 0,id,anchor,target,context,title,sep_token,cls_token,context_token,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,optics,[SEP],[CLS],[G02],optics opc drum
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,combustion apparatus combustion processes,[SEP],[CLS],[F23],combustion apparatus combustion processes adju...
2,36baf228038e314b,lower trunnion,lower locating,B60,vehicles in general,[SEP],[CLS],[B60],vehicles in general lower trunnion
3,1f37ead645e7f0c8,cap component,upper portion,D06,treatment of textiles or the like laundering f...,[SEP],[CLS],[D06],treatment of textiles or the like laundering f...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,electric communication technique,[SEP],[CLS],[H04],electric communication technique neural stimul...


In [16]:
encoded_test_data = encode_text(test_data[["text", "target"]].values.tolist(), tokenizer, Config.max_length)

In [19]:
print(encoded_test_data["input_ids"][0])
print(encoded_test_data["attention_masks"][0])
print(encoded_test_data["token_type_ids"][0])

[    2 20691  6393  1943  6608     3 27921  5967  8328  8231 16426  6608
     3     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [20]:
test_x = [encoded_test_data["input_ids"], encoded_test_data["attention_masks"], encoded_test_data["token_type_ids"]]
print("test x shape : ", test_x[0].shape, test_x[1].shape, test_x[2].shape)

test x shape :  (36, 96) (36, 96) (36, 96)


In [21]:
def build_model(config):
    # Create the model under a distribution strategy scope.
    #with strategy.scope():
        # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(config.max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(config.max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(config.max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    base_model = transformers.TFAutoModel.from_pretrained(config.base_model, from_pt=True)

    base_model_output = base_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )

    last_hidden_state = base_model_output.last_hidden_state
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(last_hidden_state)
    dropout = tf.keras.layers.Dropout(0.3)(avg_pool)

    output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy()
    )

    return model

In [22]:
model =build_model(Config)
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'bert.embeddings.position_ids', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 96)]         0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 96)]         0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 96)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 344702976   input_ids[0][0]                  
                                                                 attention_masks[0][0] 

In [25]:
# load model
model.load_weights('model-2.h5')

In [26]:
# Lets predict the test data.
predictions = model.predict(test_x)

In [29]:
# Lets read sample submission file.
submission = pd.read_csv('sample_submission.csv')
submission['score'] = predictions
submission['score'] = submission.score.apply(lambda x: 0 if x < 0 else x)
submission['score'] = submission.score.apply(lambda x: 1 if x > 1 else x)

In [30]:
submission.to_csv("submission.csv",index=False)

Unnamed: 0,id,score
0,4112d61851461f60,0.363815
1,09e418c93a776564,0.363816
2,36baf228038e314b,0.363814
3,1f37ead645e7f0c8,0.363814
4,71a5b6ad068d531f,0.363816
5,474c874d0c07bd21,0.363815
6,442c114ed5c4e3c9,0.363815
7,b8ae62ea5e1d8bdb,0.363815
8,faaddaf8fcba8a3f,0.363815
9,ae0262c02566d2ce,0.363815
