<a href="https://colab.research.google.com/github/malinphy/q_17/blob/main/patent_data_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# data files referenced in later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os 
import json
from typing import List
import collections
import numpy as np
import pandas as pd

In [3]:
# Input locations on the mounted Drive. Despite the `_dir` suffix, each value
# is the path of a single JSON file, not a directory.
# NOTE(review): these are relative paths while the mount point is the absolute
# '/content/drive' — presumably they rely on the working directory being
# '/content'; confirm.
biotech_patents_dir = 'drive/MyDrive/transfer/patent/vec_vectorized_biotech_patents_transformed_ols.json'
uspto_patents_dir = 'drive/MyDrive/transfer/patent/vec_vectorized_uspto_patents_transformed_ols.json'
nih_patents_dir = 'drive/MyDrive/transfer/patent/patent_raw/nih/nih_patents_ada_embeddings.json'

In [4]:
# with open(nih_patents_dir, 'r') as f:
#     data = json.load(f)

In [5]:
class patent_data:
    """Reader for one patent JSON file.

    The file is expected to contain a JSON array of objects, each providing
    at least the keys ``'title'``, ``'abstract'`` and ``'url'``.
    """

    def __init__(self, dir:str):
        """Remember the location of the JSON file.

        Args:
            dir: Path of the JSON file to read (a file path, despite the name).
        """
        self.dir = dir

    def ab_title_bringer(self):
        """Read the JSON file and split it into parallel lists.

        Returns:
            A tuple ``(title, abstract, urls)`` of three lists, each holding
            one entry per record, in file order.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If a record lacks one of the required keys.
        """
        # Decode explicitly as UTF-8 (the JSON interchange encoding) so the
        # result does not depend on the platform's default locale encoding.
        with open(self.dir, 'r', encoding='utf-8') as f:
            data = json.load(f)

        title = [record['title'] for record in data]
        abstract = [record['abstract'] for record in data]
        urls = [record['url'] for record in data]

        return title, abstract, urls

def unique_values(x: List[str]) -> List[int] :
    """Return the first position of every value that occurs more than once.

    Despite the name, values that appear exactly once are NOT reported; only
    duplicated values contribute, each with the index of its first occurrence.

    Args:
        x: Sequence of hashable items (strings in this notebook).

    Returns:
        First-occurrence indices of the duplicated values, ordered by where
        each value first appears in ``x``. Empty if nothing is repeated.
    """
    counts = collections.Counter()
    first_seen = {}
    # Single pass: count occurrences and remember where each value first
    # appeared, instead of rescanning the list with ``x.index`` per duplicate.
    for position, item in enumerate(x):
        counts[item] += 1
        first_seen.setdefault(item, position)

    # Counter keeps insertion order, so results follow first appearance.
    return [first_seen[item] for item, count in counts.items() if count > 1]
    
def normalizer(str_data:str) -> str:
    """Lower-case the text and remove leading and trailing whitespace."""
    lowered = str_data.lower()
    return lowered.strip()

In [6]:
# Load titles, abstracts and URLs for each patent source. Every source gets
# its own URL list so that `biotech_urls` is not overwritten by the USPTO
# and NIH loads.
biotech_data = patent_data(biotech_patents_dir)
biotech_title, biotech_abstract, biotech_urls = biotech_data.ab_title_bringer()

uspto_data = patent_data(uspto_patents_dir)
uspto_title, uspto_abstract, uspto_urls = uspto_data.ab_title_bringer()

nih_data = patent_data(nih_patents_dir)
nih_title, nih_abstract, nih_urls = nih_data.ab_title_bringer()

In [7]:
# Apply `normalizer` to every title and abstract of each source so that
# later duplicate checks ignore letter case and surrounding whitespace.
normalized_title_biotech = list(map(normalizer, biotech_title))
normalized_abstract_biotech = list(map(normalizer, biotech_abstract))

normalized_title_uspto = list(map(normalizer, uspto_title))
normalized_abstract_uspto = list(map(normalizer, uspto_abstract))

normalized_title_nih = list(map(normalizer, nih_title))
normalized_abstract_nih = list(map(normalizer, nih_abstract))

In [8]:
# For each source, report the total number of normalized titles and the
# number of distinct ones; a gap between the two indicates duplicate titles.
print('Title BIOTECH PATENTS :',len(normalized_title_biotech))
print('Unique titles BIOTECH PATENTS :',len(set(normalized_title_biotech)))
print('Title USPTO patents :',len(normalized_title_uspto))
print('Unique titles USPTO patents :',len(set(normalized_title_uspto)))
print('Title NIH PATENTS :',len(normalized_title_nih))
print('Unique titles NIH PATENTS :',len(set(normalized_title_nih)))

Title BIOTECH PATENTS : 484
Unique titles BIOTECH PATENTS : 228
Title USPTO patents : 4
Unique titles USPTO patents : 4
Title NIH PATENTS : 3243
Unique titles NIH PATENTS : 2842


In [9]:
# For each source, report the total number of normalized abstracts and the
# number of distinct ones. The labels name abstracts explicitly so this output
# cannot be confused with the title counts printed by the previous cell.
print('Abstract BIOTECH PATENTS :',len(normalized_abstract_biotech))
print('Unique abstracts BIOTECH PATENTS :',len(set(normalized_abstract_biotech)))
print('Abstract USPTO patents :',len(normalized_abstract_uspto))
print('Unique abstracts USPTO patents :',len(set(normalized_abstract_uspto)))
print('Abstract NIH PATENTS :',len(normalized_abstract_nih))
print('Unique abstracts NIH PATENTS :',len(set(normalized_abstract_nih)))

Title BIOTECH PATENTS : 484
Unique titles BIOTECH PATENTS : 231
Title USPTO patents : 4
Unique titles USPTO patents : 4
Title NIH PATENTS : 3243
Unique titles NIH PATENTS : 2865


In [10]:
# NOTE: despite the `unique_` naming, `unique_values` returns the
# first-occurrence index of each title that appears MORE than once, so these
# lists locate the duplicated titles in each source.
unique_pos_biotech = unique_values(normalized_title_biotech)
unique_pos_uspto = unique_values(normalized_title_uspto)
unique_pos_nih = unique_values(normalized_title_nih)
# np.where( np.array(normalized_title_nih) == 'selective recovery')

In [11]:
# Load the precomputed title and abstract vectors stored as .npy files and
# print their shapes as a sanity check (one row per patent is expected, so
# the row counts should match the title/abstract counts reported earlier).
# NOTE(review): the `ada` naming suggests these are ada-model embeddings —
# confirm against how the .npy files were produced.
biotech_ada_title = np.load('drive/MyDrive/transfer/patent/biotech_title_vectors.npy')
biotech_ada_abstract = np.load('drive/MyDrive/transfer/patent/biotech_abstract_vectors.npy')
print(biotech_ada_title.shape)
print(biotech_ada_abstract.shape)

uspto_ada_title = np.load('drive/MyDrive/transfer/patent/uspto_title_vectors.npy')
uspto_ada_abstract = np.load('drive/MyDrive/transfer/patent/uspto_abstract_vectors.npy')
print(uspto_ada_title.shape)
print(uspto_ada_abstract.shape)

(484, 1536)
(484, 1536)
(4, 1536)
(4, 1536)


In [12]:
def index_diluter(normalized_title_data):
    """Return the positions of the first occurrence of each distinct title.

    Args:
        normalized_title_data: Sequence of (already normalized) titles.

    Returns:
        List of integer positions, in ascending order, that keep exactly one
        entry (the first) for every distinct value in the input.
    """
    column = 'normalized_title_data'
    frame = pd.DataFrame({column: normalized_title_data})
    # True for every row that repeats an earlier value in the column.
    is_repeat = frame.duplicated(subset=column, keep='first').to_numpy()
    return frame.index[~is_repeat].tolist()

In [13]:
def vector_diluter(title_vectors:List[float], abstract_vectors:List[float], diluted_index:List[int]) -> (List[float],List[float]) :
    """Select the title and abstract vectors at the given positions.

    Args:
        title_vectors: Indexable collection of title vectors.
        abstract_vectors: Indexable collection of abstract vectors, aligned
            position-by-position with ``title_vectors``.
        diluted_index: Positions to keep, in the order they should appear.

    Returns:
        A pair ``(title_diluted, abstract_diluted)`` of plain Python lists
        holding the selected entries.
    """
    # Materialize the positions once so both selections see the same sequence.
    kept = list(diluted_index)
    title_diluted = [title_vectors[pos] for pos in kept]
    abstract_diluted = [abstract_vectors[pos] for pos in kept]
    return title_diluted, abstract_diluted


In [14]:
# Find the first-occurrence position of every distinct normalized title, then
# keep only the title/abstract vectors at those positions. Deduplication is
# driven by titles alone; abstracts are filtered with the same positions.
biotech_title_dilution_index = index_diluter(normalized_title_biotech)
uspto_title_dilution_index = index_diluter(normalized_title_uspto)

biotech_ada_title_diluted,biotech_ada_abstract_diluted = vector_diluter(biotech_ada_title, biotech_ada_abstract, biotech_title_dilution_index)
uspto_ada_title_diluted,uspto_ada_abstract_diluted = vector_diluter(uspto_ada_title, uspto_ada_abstract, uspto_title_dilution_index)

In [18]:
# `vector_diluter` returns Python lists of row vectors; convert them to
# NumPy arrays so they expose `.shape`. The names are rebound in place.
biotech_ada_title_diluted= np.array(biotech_ada_title_diluted)
biotech_ada_abstract_diluted= np.array(biotech_ada_abstract_diluted)

uspto_ada_title_diluted= np.array(uspto_ada_title_diluted)
uspto_ada_abstract_diluted= np.array(uspto_ada_abstract_diluted)

In [19]:
# Sanity check: after deduplication the row counts should equal the number
# of unique normalized titles reported earlier for each source.
print(biotech_ada_title_diluted.shape)
print(biotech_ada_abstract_diluted.shape)

print(uspto_ada_title_diluted.shape)
print(uspto_ada_abstract_diluted.shape)

(228, 1536)
(228, 1536)
(4, 1536)
(4, 1536)
