## MSDS 631 - Final Project

### Chandan Nayak, Jaysen Shi

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [None]:
# Load the competition training set; later cells read its
# 'anchor', 'target', 'context' and 'score' columns.
train = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv')
train.head()

In [None]:
# Switch to /kaggle, then download and unpack the CPC title list archive
# (CPCTitleList202205.zip). The parsing code below opens the extracted
# cpc-section-*.txt files by relative name, so it depends on this directory
# change. Requires network access.
%cd /kaggle
!wget https://www.cooperativepatentclassification.org/sites/default/files/cpc/bulk/CPCTitleList202205.zip
!unzip CPCTitleList202205.zip

# Column-wise accumulator: one list per output column of titles.csv.
parsed = {x: [] for x in ['code', 'title', 'section', 'class', 'subclass', 'group', 'main_group']}

# Each section file is tab-separated. Lines with two fields are
# "code<TAB>title"; lines with three fields carry the title in the last field
# (the middle field is ignored). Lines with any other field count are skipped.
for letter in 'ABCDEFGHY':
    file = f'cpc-section-{letter}_20220501.txt'
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file, encoding='utf-8') as f:
        for line in f:
            vals = line.strip().split('\t')
            if len(vals) in (2, 3):
                parsed['code'].append(vals[0])
                parsed['title'].append(vals[-1])

# Split every code into positional components. A component is None when the
# code is too short to contain it:
#   section    -> first character
#   class      -> characters 2-3
#   subclass   -> character 4
#   group      -> text after the subclass letter, up to the '/'
#   main_group -> text after the '/' (None when the code has no '/')
for code in parsed['code']:
    parsed['section'].append(code[0] if len(code) >= 1 else None)
    parsed['class'].append(code[1:3] if len(code) >= 3 else None)
    parsed['subclass'].append(code[3] if len(code) >= 4 else None)
    parsed['group'].append(code.split('/')[0][4:] if len(code) >= 5 else None)
    parsed['main_group'].append(code.split('/')[-1] if "/" in code else None)

df = pd.DataFrame.from_dict(parsed)

# Persist the lookup table so later cells can reload it without re-parsing.
df.to_csv('/kaggle/working/titles.csv', index=False)

In [None]:
# Reload the CPC code -> title lookup table written to /kaggle/working/titles.csv.
code = pd.read_csv('/kaggle/working/titles.csv')
code.head()

In [None]:
# Spot-check: show the title stored for CPC code 'A47'.
code[code['code'] == 'A47']['title']

In [None]:
# Attach the CPC title and code components to every training row by matching
# train['context'] against code['code']. With how='inner', rows whose context
# has no matching code are dropped.
# NOTE(review): `train` is overwritten in place, so re-running this cell
# merges the lookup columns a second time.
train = train.merge(code, how='inner', left_on='context', right_on='code')

train.head()

In [None]:
import string
def clean_txt(row):
    """Return `row` lower-cased with every `string.punctuation` character removed.

    Args:
        row: the text to normalise (a `str`).

    Returns:
        The normalised string; whitespace is left untouched.
    """
    # Translation table that deletes each punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return row.lower().translate(strip_punctuation)
# Normalise the merged CPC titles (lower-case, punctuation stripped) in place.
train['title'] = train['title'].apply(clean_txt)

In [None]:
def change_values(row):
    """Translate a similarity score into its ordinal class index.

    The five score levels 0.00, 0.25, 0.50, 0.75 and 1.00 map to 0, 1, 2, 3
    and 4 respectively.

    Args:
        row: a single score value.

    Returns:
        The integer class index for that score.

    Raises:
        KeyError: if `row` is not one of the five score levels.
    """
    score_levels = (0.00, 0.25, 0.50, 0.75, 1.00)
    level_to_class = {level: index for index, level in enumerate(score_levels)}
    return level_to_class[row]
# Replace each raw score with its class index.
# NOTE(review): 'score' is overwritten in place, so this cell is not safe to
# re-run on values that have already been mapped.
train['score'] = train['score'].apply(change_values)

In [None]:
# Preview the frame after the merge, title cleaning and score mapping.
train.head()

## EDA

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Activate the seaborn dark-grid style sheet. Indexing `plt.style.library[...]`
# only looks the style up (and raises KeyError where that name is absent), so
# apply it with `plt.style.use`, trying the newer style name first.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

fig, ax = plt.subplots(figsize=(8,5))
# NOTE(review): the bar heights are hard-coded; presumably they are the
# per-score row counts of the training set -- confirm against the data.
ax.bar(['0.00','0.25','0.50','0.75','1.00'],[7471,11519,12300,4029,1154])
plt.title("Class imbalance - Histogram of context scores")
ax.spines.right.set_visible(False)
ax.spines.top.set_visible(False)
ax.set_xlabel("Context scores")
# Save before showing: after plt.show() the inline backend has finished with
# the figure, so a later plt.savefig() would write an empty image.
fig.savefig('classimb.png')
plt.show()



In [None]:
from wordcloud import WordCloud, STOPWORDS
# Flatten every anchor phrase into its space-separated tokens, then render a
# word cloud of the combined text.
biglist = [token for phrase in train['anchor'] for token in phrase.split(' ')]
anchor_text = ' '.join(biglist)
wordcloud = WordCloud().generate(anchor_text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Same view for the target phrases: collect their space-separated tokens and
# render a word cloud of the combined text.
biglist2 = [token for phrase in train['target'] for token in phrase.split(' ')]
target_text = ' '.join(biglist2)
wordcloud = WordCloud().generate(target_text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Dataset and dataloader preparation

We will use PyTorch's dataset class to construct a bespoke dataset class that can take either the augmented train dataset or the test dataset.

In [None]:
import datasets, transformers

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModelForMaskedLM
        
        
    

In [None]:

class CFG:
    """Central configuration for the fine-tuning run (paths and hyper-parameters)."""

    # Locations of the competition data and of the pretrained checkpoint.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    model_path = '../input/bert-for-patent/bert-for-patents'
    model = "bert-for-patents"

    # Training schedule.
    epochs = 5
    batch_size = 32

    # Optimiser settings.
    learning_rate = 2e-5
    weight_decay = 0.01
    



In [None]:

# Load the pretrained checkpoint from CFG.model_path with a single-output
# sequence-classification head (num_labels=1), plus its matching tokenizer.
# NOTE(review): a single output presumably means the head is trained as a
# regressor -- confirm against the transformers documentation.
model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)



In [None]:
# Persist the augmented training frame to the working directory. The frame's
# index is written as an extra column because index=False is not passed.
train.to_csv('train_augmented.csv')

In [None]:
# Reload the augmented CSV as a `datasets.Dataset` so it can be tokenized with `.map`.
train_ds = datasets.Dataset.from_csv('train_augmented.csv')

In [None]:
def preprocess(row):
    """Tokenize one example as a sentence pair and attach its training label.

    The first segment is the anchor phrase followed by a space and the CPC
    title; the second segment is the target phrase.

    Args:
        row: a mapping with 'anchor', 'title', 'target' and 'score' entries.

    Returns:
        A dict holding every field produced by `tokenizer` plus 'label',
        the score converted to float.
    """
    first_segment = row['anchor'] + ' ' + row['title']
    encoding = tokenizer(first_segment, row['target'])
    # The model is created with num_labels=1, i.e. a single regression output,
    # so the target must be a float rather than an integer class index.
    return {**encoding, 'label': float(row['score'])}

# Drop every original column so that only the tokenizer outputs and 'label'
# remain; using column_names avoids a hard-coded list that breaks as soon as
# the CSV schema changes.
encoded_ds = train_ds.map(preprocess, remove_columns=train_ds.column_names)


In [None]:
# Hold out 10% of the examples for evaluation. The seed is fixed so that the
# held-out set -- and therefore the metric used to pick the best checkpoint --
# is the same on every run.
encoded_ds = encoded_ds.train_test_split(test_size=0.1, seed=42)
encoded_ds

In [None]:


def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: a `(predictions, labels)` pair. `predictions` is reshaped
            to one value per example, so it must hold exactly
            `len(predictions)` elements.

    Returns:
        A dict with the single key 'pearson'.
    """
    raw_predictions, gold_labels = eval_pred

    # Collapse the predictions to a 1-D vector aligned with the labels.
    flat_predictions = raw_predictions.reshape(len(raw_predictions))

    # Off-diagonal entry of the 2x2 correlation matrix.
    correlation = np.corrcoef(flat_predictions, gold_labels)
    return {'pearson': correlation[0][1]}


# Training configuration: hyper-parameters come from CFG; evaluation and
# checkpointing both happen once per epoch, and the checkpoint with the best
# 'pearson' metric is reloaded when training ends.
args = TrainingArguments(
    output_dir="uspppm",
    num_train_epochs=CFG.epochs,
    learning_rate=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
)

# Wire the model, tokenizer, data splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["test"],
)


In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()