In [25]:
import os
import json
import sys
import pandas as pd
import requests
PATENT_BASE_URL = 'https://uspto-documents-storage.s3.amazonaws.com/docs/'
mocks_dir = os.path.join(os.path.abspath(''), 'mocks')
sys.path.insert(0, mocks_dir + '/../../classes/')
#print(mocks_dir + '/../../classes/')
from patent import Patent

In [34]:
def print_result(patent_number, key, expected, extracted):
    print(key + " is incorrect for patent id " + patent_number)
    print("Extracted: " + extracted)
    print("Expected: " + expected)

def get_patent_xml_path(patent_id):
    ret_path = None
    path_in_mock = os.path.join(mocks_dir, pat_id + '.xml')
    if os.path.exists(path_in_mock):
        ret_path = path_in_mock
    else:
        response = requests.get(PATENT_BASE_URL + patent_id + '.xml')
        if response.status_code == 200:
            with open(path_in_mock, 'wb') as f:
                f.write(response.content)
            ret_path = path_in_mock
    return ret_path

In [118]:
data_path = os.path.join(mocks_dir, 'validation_data.json')
data_file = open(data_path)
data = json.load(data_file)

In [119]:
epitope_results =  pd.DataFrame(columns=['patent_id', 'expected_is_subset', 'extracted_is_subset', 'expected_residues', 'extracted_residues'])

for datum in data:
    pat_id = datum['patentNumber']
    patent_xml_path = get_patent_xml_path(pat_id)
    try:
        patent = Patent(patent_xml_path, pat_id)
        
        for key in datum:
            if type(datum[key]) == str:
                if datum[key].lower() != patent.__dict__[key].lower(): 
                    print_result(datum['patentNumber'], key, datum[key], patent.__dict__[key])
            elif key == 'mentionedResidues':
                extracted_res = set()
                for seq_object in patent.__dict__[key]:
                    if seq_object['location'] == 'claim':
                        for res in seq_object['claimedResidues']:
                            extracted_res.add(res)
                #parsed_epitopes = list(res)
                expected_res = set(datum[key])
                exp_sub = expected_res.issubset(extracted_res)
                ext_sub = extracted_res.issubset(expected_res)
                epitope_results = epitope_results.append(
                    {'patent_id': pat_id, 'expected_is_subset': exp_sub, 'extracted_is_subset': ext_sub, 
                    'expected_residues': sorted(expected_res, key=lambda e: int(e)), 'extracted_residues': sorted(extracted_res, key=lambda e: int(e))}, ignore_index=True)
                
            else: 
                if datum[key] != patent.__dict__[key]:
                    print_result(datum['patentNumber'], key, str(datum[key]), str(patent.__dict__[key]))
    except Exception as e:
        print ("pat_id:", pat_id, "\n", e)


pat_id: US9334318B1 
 expected str, bytes or os.PathLike object, not NoneType
pat_id: US20160046675A1 
 expected str, bytes or os.PathLike object, not NoneType


In [120]:
pd.set_option('display.max_colwidth', None)
epitope_results

Unnamed: 0,patent_id,expected_is_subset,extracted_is_subset,expected_residues,extracted_residues
0,US8062640B2,True,True,[],[]
1,US8080243B2,True,True,[],[]
2,US8188234B2,True,True,[],[]
3,US8357371B2,True,True,[],[]
4,US8399646B2,True,True,[],[]
5,US8501184B2,True,True,[],[]
6,US8563698B2,False,True,"[123, 124, 125, 126, 127, 128, 129, 130, 131, 132]","[123, 124, 125, 126, 127, 128, 129, 130, 131]"
7,US8829165B2,True,True,"[153, 154, 155, 194, 238, 239, 369, 372, 374, 375, 377, 378, 379, 380, 381]","[153, 154, 155, 194, 238, 239, 369, 372, 374, 375, 377, 378, 379, 380, 381]"
8,US8889144B2,True,True,[],[]
9,US9175093B2,True,True,[],[]


In [121]:
exp_sub_ratio = len(epitope_results[epitope_results.expected_is_subset]) / len(epitope_results)
ext_sub_ratio = len(epitope_results[epitope_results.extracted_is_subset	]) / len(epitope_results)
exp_same_ext = len(epitope_results[epitope_results.expected_is_subset & epitope_results.extracted_is_subset]) / len(epitope_results)

In [122]:
print("Overall:")
print("Ratio of Expected is subset of Extracted:", exp_sub_ratio)
print("Ratio of Extracted is subset of Expected:", ext_sub_ratio)
print("Ratio of both are same:", exp_same_ext)

Overall:
exp_sub_ratio: 0.8695652173913043
ext_sub_ratio: 0.782608695652174
exp_same_ext: 0.6739130434782609


In [125]:
non_empty_exp = epitope_results[epitope_results.expected_residues.str.len() != 0]
non_empty_exp_sub_ratio = len(non_empty_exp[non_empty_exp.expected_is_subset]) / len(non_empty_exp)
non_empty_ext_sub_ratio = len(non_empty_exp[non_empty_exp.extracted_is_subset	]) / len(non_empty_exp)
non_empty_exp_same_ext = len(non_empty_exp[non_empty_exp.expected_is_subset & non_empty_exp.extracted_is_subset]) / len(non_empty_exp)

In [126]:
print("Non Empty Expected:")
print("Ratio of Expected is subset of Extracted:", non_empty_exp_sub_ratio)
print("Ratio of Extracted is subset of Expected:", non_empty_ext_sub_ratio)
print("Ratio of both are same:", non_empty_exp_same_ext)

Non Empty Expected:
non_empty_exp_sub_ratio: 0.25
non_empty_ext_sub_ratio: 0.875
non_empty_exp_same_ext: 0.25


In [127]:
non_empty_ext = epitope_results[epitope_results.extracted_residues.str.len() != 0]
non_empty_ext_exp_sub_ratio = len(non_empty_ext[non_empty_ext.expected_is_subset]) / len(non_empty_ext)
non_empty_ext_ext_sub_ratio = len(non_empty_ext[non_empty_ext.extracted_is_subset	]) / len(non_empty_ext)
non_empty_ext_exp_same_ext = len(non_empty_ext[non_empty_ext.expected_is_subset & non_empty_ext.extracted_is_subset]) / len(non_empty_ext)

In [128]:
print("Non Empty Extracted:")
print("Ratio of Expected is subset of Extracted:", non_empty_ext_exp_sub_ratio)
print("Ratio of Extracted is subset of Expected:", non_empty_ext_ext_sub_ratio)
print("Ratio of both are same:", non_empty_ext_exp_same_ext)

Non Empty Extracted:
non_empty_ext_exp_sub_ratio: 0.7857142857142857
non_empty_ext_ext_sub_ratio: 0.2857142857142857
non_empty_ext_exp_same_ext: 0.14285714285714285
