# Patent Kaggle challenge

Notebook based on lesson 4 of the fast.ai course.

## Importing kaggle dataset

In [None]:
!kaggle competitions download -c us-patent-phrase-to-phrase-matching 
!unzip us-patent-phrase-to-phrase-matching.zip -d us-patent-phrase-to-phrase-matching
!rm us-patent-phrase-to-phrase-matching.zip

In [None]:
from pathlib import Path

# Directory the competition archive was extracted into; list it to check the files.
path = Path('us-patent-phrase-to-phrase-matching')
!ls {path}

## Viewing data

In [None]:
import pandas as pd
# Load the training pairs and preview the first rows.
df = pd.read_csv(path.joinpath('train.csv'))
df.head()

In [None]:
# Summarise the object-dtype (string) columns of the training frame.
df.describe(include='object')

In [None]:
# How often each target phrase occurs.
df['target'].value_counts()

In [None]:
# The first character of the context code is kept as a coarse `section` column.
df['section'] = df['context'].str[0]
df['section'].value_counts()

In [None]:
# Distribution of the similarity scores (trailing `;` hides the axes repr).
df['score'].hist();

In [None]:
# Rows whose score is exactly 1.
df[df['score'] == 1]

In [None]:
# First input format: one string per row with labelled anchor / target / context fields.
df['input'] = 'TEXT1: ' + df['anchor'] + '; TEXT2: ' + df['target'] + '; CONTEXT: ' + df['context']
df['input'].head()

## Tokenisation

In [None]:
from datasets import Dataset,DatasetDict

# Wrap the frame in a Dataset and rename the `score` column to `labels`.
ds = Dataset.from_pandas(df).rename_columns({'score': 'labels'})
ds

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint name; the tokenizer is loaded from it here.
model_nm = 'google-bert/bert-base-uncased'
#model_nm = "distilbert/distilbert-base-uncased"
tokz = AutoTokenizer.from_pretrained(model_nm)

In [None]:
# See how the tokenizer splits an informal sentence.
tokz.tokenize("G'day folks, I'm Jeremy from fast.ai!")

In [None]:
# Same check on a sentence containing uncommon words.
tokz.tokenize("A platypus is an ornithorhynchus anatinus.")

In [None]:
# Number of entries in the tokenizer vocabulary, then the id stored for the token 'of'.
print(len(tokz.vocab))
print(tokz.vocab['of'])

In [None]:
# Code points of two look-alike characters: the first is '▁' (presumably the
# SentencePiece word-start marker — TODO confirm), the second is the ASCII underscore.
print(ord('▁'))
print(ord('_'))

In [None]:
# Special tokens defined by the loaded tokenizer.
tokz.all_special_tokens

In [None]:
# The tokenizer's own separator token; used below to join the text fields.
sep = tokz.sep_token
sep

In [None]:
# Second input format: anchor, target and context joined by the tokenizer's separator token.
df['input'] = df.anchor + sep + df.target + sep + df.context
df['input'].head()

In [None]:
# Rebuild the Dataset so it picks up the new `input` column; `score` becomes `labels`.
ds = Dataset.from_pandas(df).rename_columns({'score': 'labels'})


In [None]:
def tok_func(x):
    """Tokenise the `input` field of `x` with the module-level tokenizer `tokz`."""
    return tokz(x["input"])

# Tokenise in batches and drop the listed source columns from the result.
tok_ds = ds.map(
    tok_func,
    batched=True,
    remove_columns=('id', 'anchor', 'target', 'context', 'input', 'section'),
)
tok_ds

In [None]:
# Look at the first tokenised row.
tok_ds[0]

Test Set 

In [None]:
# Load the competition test file and summarise it.
eval_df = pd.read_csv(path.joinpath('test.csv'))
eval_df.describe()

In [None]:
# Build the test inputs in the same sep-joined format, then tokenise them.
# Note that `id` is removed as well, so `eval_ds` no longer carries the row ids.
eval_df['input'] = eval_df['anchor'] + sep + eval_df['target'] + sep + eval_df['context']
eval_ds = Dataset.from_pandas(eval_df).map(
    tok_func,
    batched=True,
    remove_columns=('id', 'anchor', 'target', 'context', 'input'),
)

### Validation and Test data

Validation dataset

In [None]:
# Method 1: hold out a random 25% of the rows, with a fixed seed.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)
dds

Method 2: randomly select anchor values and put every row that has one of those anchors into the validation set.

This way, anchor values do not overlap between the validation and training datasets.

In [None]:
# Distinct anchor phrases in the training data.
anchors = df['anchor'].unique()
anchors[:10]

In [None]:
import numpy as np
# Put the distinct anchors in a random but reproducible order. A seeded Generator
# is used instead of the unseeded, in-place np.random.shuffle, so the validation
# split is the same on every run. Re-deriving from df makes the cell idempotent.
anchors = np.random.default_rng(42).permutation(df.anchor.unique())
anchors[:10]

In [None]:
# Hold out a quarter of the distinct anchors (not of the rows) for validation.
val_prop = 0.25
val_sz = int(val_prop * len(anchors))

# The first `val_sz` entries of `anchors` become the validation anchors.
val_anchors = anchors[:val_sz]

# Row-wise mask: True where the row's anchor is one of the validation anchors.
is_val = df['anchor'].isin(val_anchors)

# Positional row indices on each side of the split.
idxs = np.arange(len(df))
val_idxs, trn_idxs = idxs[is_val], idxs[~is_val]



In [None]:
# (number of validation rows, number of training rows)
len(val_idxs),len(trn_idxs)

In [None]:
# Anchor-based split; the validation rows are stored under the "test" key.
dds = DatasetDict({
    "train": tok_ds.select(trn_idxs),
    "test": tok_ds.select(val_idxs),
})


In [None]:
# Mean score on each side of the split (train, validation), rounded to 2 decimals.
df['score'].iloc[trn_idxs].mean().round(2), df['score'].iloc[val_idxs].mean().round(2)

## Metric

The Transformers `Trainer` expects metrics to be returned as a dictionary.

In [None]:
import numpy as np
def corr(x, y):
    """Return the Pearson correlation between `x` (flattened first) and `y`."""
    coef_matrix = np.corrcoef(x.flatten(), y)
    # Off-diagonal entry of the 2x2 coefficient matrix.
    return coef_matrix[0][1]


def corr_d(eval_pred):
    """Unpack `eval_pred` into `corr` and return the result under the 'pearson' key."""
    return {'pearson': corr(*eval_pred)}

## Training

In [None]:
from transformers import TrainingArguments,Trainer

In [None]:
# Hyper-parameters for the training runs: batch size, epochs, learning rate, weight decay.
bs, epochs = 128, 3
lr, wd = 8e-5, 0.01

# The evaluation batch size is twice the training batch size; metrics are computed every epoch.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=wd,
    report_to='none',
)


In [None]:

# Load the checkpoint with num_labels=1, i.e. one output value per example.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

# Train on the "train" split, evaluate on "test", and report the Pearson metric.
trainer = Trainer(
    model,
    args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    processing_class=tokz,
    compute_metrics=corr_d,
)

In [None]:
# Run training; the trailing `;` suppresses the repr of the returned value.
trainer.train();

## Improving the model

In [None]:
def get_dds(df, tokenizer=None):
    """Tokenise `df` and split it into the train/validation `DatasetDict`.

    Args:
        df: DataFrame with the `id`, `anchor`, `target`, `context`, `input`,
            `section` and `score` columns. `score` is renamed to `labels`;
            the other listed columns are dropped after tokenisation.
        tokenizer: Optional tokenizer used to encode the `input` column. When
            omitted, the module-level `tok_func` (and so the global `tokz`) is
            used, which is the original behaviour. Pass the tokenizer that
            belongs to the checkpoint being trained when it differs from `tokz`.

    Returns:
        DatasetDict with a "train" split (rows `trn_idxs`) and a "test" split
        (rows `val_idxs`); both index arrays are read from module scope.
    """
    if tokenizer is None:
        encode = tok_func
    else:
        def encode(batch):
            return tokenizer(batch["input"])

    ds = Dataset.from_pandas(df).rename_columns({'score':'labels'})
    tok_ds = ds.map(encode, batched=True,
                    remove_columns=('id', 'anchor', 'target', 'context', 'input', 'section'))
    return DatasetDict({"train": tok_ds.select(trn_idxs),
                        "test": tok_ds.select(val_idxs)})

def get_model():
    """Load the classifier and the tokenizer for the current `model_nm`.

    Returns:
        A `(model, tokenizer)` tuple, both loaded from the module-level
        `model_nm` checkpoint; the model is created with `num_labels=1`.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
    tokenizer = AutoTokenizer.from_pretrained(model_nm)
    return classifier, tokenizer

def get_trainer(dds, model = None):
    """Build a `Trainer` for `dds` using the notebook-level hyper-parameters.

    Args:
        dds: Mapping with a 'train' and a 'test' dataset (as returned by `get_dds`).
        model: Optional model to train. When omitted, a fresh model and its
            tokenizer are loaded through `get_model()`.

    Returns:
        A `Trainer` configured with `lr`, `bs`, `epochs` and `wd` from module
        scope and reporting the `corr_d` metric.
    """
    if model is None:
        model, tokenizer = get_model()
    else:
        # Previously `tokz` was only assigned in the branch above, which made it a
        # local name and raised UnboundLocalError whenever a model was passed in.
        # A caller-supplied model falls back to the notebook-level tokenizer.
        tokenizer = tokz

    args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
        eval_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
        num_train_epochs=epochs, weight_decay=wd, report_to='none')

    # `processing_class` matches the Trainer call made earlier in this notebook
    # and replaces the deprecated `tokenizer=` keyword.
    return Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                   processing_class=tokenizer, compute_metrics=corr_d)

In [None]:
# Baseline: the current sep-joined inputs, trained with the helper functions.
# `model_nm` must be set before get_trainer() runs, because get_model() reads it.

#model_nm = 'google-bert/bert-base-uncased'
model_nm = "distilbert/distilbert-base-uncased"
dds = get_dds(df)
trainer = get_trainer(dds)

trainer.train()

In [None]:
# Experiment: replace the separator token with bracketed text markers.
# The unused `sep`, `SEP`, `sep1` and `sep2` assignments were removed. In particular,
# rebinding the global `sep` changed what the earlier input-building cells
# produce if they are re-run after this one.
df['input'] = "  [TEXT1 : ]  " + df.context  + "  [TEXT2 : ]  " +  df.anchor  + "  [target]  " + df.target
database = get_dds(df)
get_trainer(database).train()

In [None]:
# Experiment: lower-case the input strings, rebuild the splits and train again.
df['input'] = df.input.str.lower()
dds = get_dds(df)
get_trainer(dds).train()

In [None]:
# Experiment: a BERT checkpoint pre-trained on patents.
# The checkpoint id previously had a stray trailing "e" and could not be resolved.
model_nm = "anferico/bert-for-patents"
# tok_func reads the module-level `tokz`, so point it at this checkpoint's tokenizer
# and re-tokenise. The previous `dds` was encoded with a different checkpoint's vocabulary.
tokz = AutoTokenizer.from_pretrained(model_nm)
dds = get_dds(df)
get_trainer(dds).train()


In [None]:
# Inspect the input strings currently stored on df.
df['input'] 

## Predictions

In [None]:
# Predict on the tokenised test set, cast to float and clamp to [0, 1].
preds = np.clip(trainer.predict(eval_ds).predictions.astype(float), 0, 1)
print(preds)

In [None]:
import datasets

# `id` was removed from eval_ds when it was tokenised, so eval_ds['id'] raises
# KeyError. Read the ids from eval_df instead; eval_ds was built from it row for row.
# `preds` is expected to have shape (n_rows, 1) because the model uses num_labels=1,
# so flatten it to write one scalar score per row.
submission = datasets.Dataset.from_dict({
    'id': eval_df['id'].tolist(),
    'score': np.asarray(preds).reshape(-1).tolist(),
})

submission.to_csv('submission.csv', index=False)