In [1]:
import pandas as pd
import numpy as np

In [29]:
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', None)

In [2]:
applications_file = r"./data/pregrant/application.tsv"
_applications_df = pd.read_csv(applications_file, sep='\t')
# 

inventor_file = r"./data/pregrant/inventor.tsv"
_inventor_df = pd.read_csv(inventor_file, sep='\t')
# 

assignee_file = r"./data/pregrant/assignee.tsv"
_assignee_df = pd.read_csv(assignee_file, sep='\t')
# 

location_file = r"./data/pregrant/location.tsv"
_location_df = pd.read_csv(location_file, sep='\t')
# 

publication_assignee_file = r"./data/pregrant/publication_assignee.tsv"
_publication_assignee_df = pd.read_csv(publication_assignee_file, sep='\t')
# 

publication_inventor_file = r"./data/pregrant/publication_inventor.tsv"
_publication_inventor_df = pd.read_csv(publication_inventor_file, sep='\t')
# 




## Application Data

In [3]:
# get year from date
_applications_df = _applications_df[~_applications_df.date.isnull()]
_applications_df['year'] = _applications_df.date.str[:4]
_applications_df = _applications_df.assign(year = _applications_df.year.astype('int'))


_applications_df.head()

Unnamed: 0,id,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year
0,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020
1,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018
3,00003dee-4904-11ec-b427-12de62d610b1,20210259523,utility,17316500,2021-05-10,US,17,MEDICAL IMAGING DEVICE WITH A TELESCOPIC SCOPE,The subject matter discloses a medical imaging...,False,,2021
4,00004fba-ccf9-11ea-ba95-121df0c29c1e,20100070355,utility,12623189,2009-11-20,US,12,Methods for Transmitting Multimedia Files and ...,The invention is directed to a method of trans...,False,ipa100318.xml,2009


In [4]:
# _applications_df.year.value_counts()

## publication_assignee crosswalk

In [5]:
_publication_assignee_df.head()

Unnamed: 0,document_number,assignee_id,sequence,location_id
0,20050000014,b27df54c-0a0d-4ae9-98c7-ccd172eb0c0e,1,f54d6149-cb8e-11eb-9615-121df0c29c1e
1,20050000023,1293d184-e3b2-467d-95f3-02fb4473cf6e,1,cfe77bc3-cb8e-11eb-9615-121df0c29c1e
2,20050000025,6f07167a-0983-4bb7-b7dc-0cd391e6c279,1,d6338035-cb8e-11eb-9615-121df0c29c1e
3,20050000029,417a65f9-a4c7-4061-964f-ffd992ce74bd,1,3cb80671-cb8e-11eb-9615-121df0c29c1e
4,20050000031,417a65f9-a4c7-4061-964f-ffd992ce74bd,1,3cb80671-cb8e-11eb-9615-121df0c29c1e


In [6]:
# Fixing one to many problem here.
publication_assignee_df = pd.DataFrame(_publication_assignee_df.groupby(['document_number', 'assignee_id', 'location_id']).size()).reset_index()
publication_assignee_df = publication_assignee_df.rename(columns={0: "count"})
publication_assignee_df.sort_values(by='count', ascending=False)

Unnamed: 0,document_number,assignee_id,location_id,count
1561380,20130067161,ae107234-b77a-4d73-97bc-ae1316ece835,e8e360fe-cb8e-11eb-9615-121df0c29c1e,5
1001823,20100254991,93c50a7b-1b47-4b8c-9474-7497967beaed,3e7b68e2-cb8e-11eb-9615-121df0c29c1e,4
406839,20070251083,ee88a895-514d-40bb-b9a6-db8fb20ad697,3cb80671-cb8e-11eb-9615-121df0c29c1e,4
1542091,20130041234,1219f1dd-7ce7-40fc-ae46-ac91aaf6a02c,e7275ca8-cb8f-11eb-9615-121df0c29c1e,4
2363595,20170119177,4e8fc194-af9a-41b8-931c-cd3ef3db9c11,fff50a17-cb8f-11eb-9615-121df0c29c1e,3
...,...,...,...,...
990773,20100238976,c08bdcaa-2331-4fa7-9e90-e5a4a659fc73,ec16f9be-cb90-11eb-9615-121df0c29c1e,1
990774,20100238977,80306c57-ba73-4e88-97c6-81b918f2dcf0,fe4bbe3c-cb8f-11eb-9615-121df0c29c1e,1
990775,20100238978,e77c94a4-d0b0-449e-be41-898f5f1bbc04,fdb12d00-cb8f-11eb-9615-121df0c29c1e,1
990776,20100238982,78b461d5-10ed-4ae2-b411-380977df8d07,fd46655c-09bc-11ec-893a-12de62d610b1,1


In [7]:
application_publication_assignee_df = pd.merge(_applications_df, publication_assignee_df, on=['document_number'], how='inner', indicator='matched', validate='one_to_many')
application_publication_assignee_df = application_publication_assignee_df.drop('matched', axis=1)

application_publication_assignee_df.head()

Unnamed: 0,id,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,1
1,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,b4401241-2dd1-46d2-af15-8d91b699e960,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1
3,00004fba-ccf9-11ea-ba95-121df0c29c1e,20100070355,utility,12623189,2009-11-20,US,12,Methods for Transmitting Multimedia Files and ...,The invention is directed to a method of trans...,False,ipa100318.xml,2009,c0129a05-4813-44df-871d-205e59aa0bf7,fd10d2b1-cb8e-11eb-9615-121df0c29c1e,1
4,0000680f-4904-11ec-b427-12de62d610b1,20210259524,utility,17319550,2021-05-13,US,17,"ENDOSCOPE HEAD, ENDOSCOPE AND ALBARRAN LEVER H...",The invention refers to an attachment for an e...,False,,2021,68671a0c-3274-4382-8c88-40b8cbb31765,cfca1ca8-cb90-11eb-9615-121df0c29c1e,1


## Assignees


In [8]:
_assignee_df.head()

Unnamed: 0,id,type,name_first,name_last,organization
0,000049db-1cf0-4735-8a71-5a44b04859a6,2.0,,,"Atlantech International, Inc."
1,000074b2-2134-459b-a481-26c72c2313ae,3.0,,,Shanghai Juge Electronics Technologies Co. Ltd.
2,00007cb5-f03a-4547-8b7a-dd97dd959ef8,2.0,,,UrVibe LLC
3,0000c3c1-5ce8-4f80-8e21-d263467ab1fd,4.0,Wiley L.,"Day, Jr.",
4,0000c853-052b-4c30-acea-0c64e371349c,2.0,,,"Ringertown Innovations, LLC"


In [9]:

print(_assignee_df.shape)
# get rid of ones that are not organizations
#assignee_df = _assignee_df[~_assignee_df.organization.isnull()]
#print(assignee_df.shape)
_assignee_df.head()

(540183, 5)


Unnamed: 0,id,type,name_first,name_last,organization
0,000049db-1cf0-4735-8a71-5a44b04859a6,2.0,,,"Atlantech International, Inc."
1,000074b2-2134-459b-a481-26c72c2313ae,3.0,,,Shanghai Juge Electronics Technologies Co. Ltd.
2,00007cb5-f03a-4547-8b7a-dd97dd959ef8,2.0,,,UrVibe LLC
3,0000c3c1-5ce8-4f80-8e21-d263467ab1fd,4.0,Wiley L.,"Day, Jr.",
4,0000c853-052b-4c30-acea-0c64e371349c,2.0,,,"Ringertown Innovations, LLC"


In [43]:
# Should just be a single id per assignee but this data....
# assignee_df = pd.DataFrame(_assignee_df.groupby(['id']).size()).reset_index()
_assignee_df.id.duplicated().sum()
#pd.DataFrame(_assignee_df.groupby(['id']).size()).value_counts()

0

In [11]:
# application_assignee_df = pd.merge(application_publication_assignee_df, _assignee_df, left_on=['assignee_id'], right_on=['id'], how='left', indicator='matched', validate='many_to_one')
application_assignee_df = pd.merge(application_publication_assignee_df, _assignee_df, left_on=['assignee_id'], right_on=['id'], how='left')
# application_assignee_df.query("matched!='both'")

# get rid of places that didn't have an assignee...
# application_assignee_df = application_assignee_df[~application_assignee_df.id.isnull()]
application_assignee_df

Unnamed: 0,id_x,document_number,type_x,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count,id_y,type_y,name_first,name_last,organization
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,1,10068d52-f4ee-47a2-9950-599af79a6484,2.0,,,"VISA USA, INC."
1,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,3.0,,,TOHOKU UNIVERSITY
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,b4401241-2dd1-46d2-af15-8d91b699e960,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,b4401241-2dd1-46d2-af15-8d91b699e960,3.0,,,"TOHOKU STEEL CO., LTD."
3,00004fba-ccf9-11ea-ba95-121df0c29c1e,20100070355,utility,12623189,2009-11-20,US,12,Methods for Transmitting Multimedia Files and ...,The invention is directed to a method of trans...,False,ipa100318.xml,2009,c0129a05-4813-44df-871d-205e59aa0bf7,fd10d2b1-cb8e-11eb-9615-121df0c29c1e,1,c0129a05-4813-44df-871d-205e59aa0bf7,3.0,,,Clarity Pharmaceuticals Ltd
4,0000680f-4904-11ec-b427-12de62d610b1,20210259524,utility,17319550,2021-05-13,US,17,"ENDOSCOPE HEAD, ENDOSCOPE AND ALBARRAN LEVER H...",The invention refers to an attachment for an e...,False,,2021,68671a0c-3274-4382-8c88-40b8cbb31765,cfca1ca8-cb90-11eb-9615-121df0c29c1e,1,68671a0c-3274-4382-8c88-40b8cbb31765,3.0,,,DIGITAL ENDOSCOPY GMBH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2969382,ffffe76f-ccf8-11ea-ba95-121df0c29c1e,20100070343,utility,12556076,2009-09-09,US,12,"SYSTEM AND METHOD FOR AGGREGATION, ANALYSIS, P...",Embodiments of systems and methods for the agg...,False,ipa100318.xml,2009,3341dd6e-43f6-4853-a436-b092a65c4a60,fe664e0e-cb8e-11eb-9615-121df0c29c1e,1,3341dd6e-43f6-4853-a436-b092a65c4a60,2.0,,,Truecar.com
2969383,ffffe9d9-ccf8-11ea-ba95-121df0c29c1e,20100070344,utility,12556109,2009-09-09,US,12,SYSTEM AND METHOD FOR CALCULATING AND DISPLAYI...,Embodiments of systems and methods for the agg...,False,ipa100318.xml,2009,3341dd6e-43f6-4853-a436-b092a65c4a60,fe664e0e-cb8e-11eb-9615-121df0c29c1e,1,3341dd6e-43f6-4853-a436-b092a65c4a60,2.0,,,Truecar.com
2969384,fffff581-ccf8-11ea-ba95-121df0c29c1e,20100070349,utility,12517260,2007-11-29,US,12,ROAD TOLL SYSTEM,A road toll system comprises a vehicle-mounted...,False,ipa100318.xml,2007,d25fa625-726a-4c37-a21e-7606b802cf50,ee4e6706-cb8f-11eb-9615-121df0c29c1e,1,d25fa625-726a-4c37-a21e-7606b802cf50,3.0,,,NXP B.V.
2969385,fffff7d2-ccf8-11ea-ba95-121df0c29c1e,20100070350,utility,12259937,2008-10-28,US,12,DYNAMIC PRICING FOR CONTENT PRESENTATIONS,A request for content is received. First conte...,False,ipa100318.xml,2008,2721c099-5a78-45fa-b3a7-11d119300596,ff4c2272-cb8e-11eb-9615-121df0c29c1e,1,2721c099-5a78-45fa-b3a7-11d119300596,2.0,,,Google LLC


In [12]:
#all have assignees
#application_assignee_df.assignee_id.isnull().sum()

In [13]:
application_assignee_df.head()

Unnamed: 0,id_x,document_number,type_x,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count,id_y,type_y,name_first,name_last,organization
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,1,10068d52-f4ee-47a2-9950-599af79a6484,2.0,,,"VISA USA, INC."
1,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,3.0,,,TOHOKU UNIVERSITY
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,b4401241-2dd1-46d2-af15-8d91b699e960,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,b4401241-2dd1-46d2-af15-8d91b699e960,3.0,,,"TOHOKU STEEL CO., LTD."
3,00004fba-ccf9-11ea-ba95-121df0c29c1e,20100070355,utility,12623189,2009-11-20,US,12,Methods for Transmitting Multimedia Files and ...,The invention is directed to a method of trans...,False,ipa100318.xml,2009,c0129a05-4813-44df-871d-205e59aa0bf7,fd10d2b1-cb8e-11eb-9615-121df0c29c1e,1,c0129a05-4813-44df-871d-205e59aa0bf7,3.0,,,Clarity Pharmaceuticals Ltd
4,0000680f-4904-11ec-b427-12de62d610b1,20210259524,utility,17319550,2021-05-13,US,17,"ENDOSCOPE HEAD, ENDOSCOPE AND ALBARRAN LEVER H...",The invention refers to an attachment for an e...,False,,2021,68671a0c-3274-4382-8c88-40b8cbb31765,cfca1ca8-cb90-11eb-9615-121df0c29c1e,1,68671a0c-3274-4382-8c88-40b8cbb31765,3.0,,,DIGITAL ENDOSCOPY GMBH


In [None]:
application_assignee_df.groupby(['document_number', 'location_id'


### Merge this again with location.

## Inventors

In [14]:
_publication_inventor_df.head()

Unnamed: 0,document_number,inventor_id,sequence,location_id
0,20050000001,fl:ti_ln:goldkind-1,1,fa3d02fd-09bd-11ec-893a-12de62d610b1
1,20050000002,fl:je_ln:levy-10,2,b778d60a-cb8e-11eb-9615-121df0c29c1e
2,20050000002,fl:ph_ln:levy-4,1,f8b3a9cd-cb90-11eb-9615-121df0c29c1e
3,20050000003,9958fa19-3b0f-11eb-a3cd-121df0c29c1e,1,4c4e3991-cb8e-11eb-9615-121df0c29c1e
4,20050000004,fl:da_ln:yun-29,1,efa16d0b-cb8f-11eb-9615-121df0c29c1e


In [15]:
# Fixing one to many problem here.
publication_inventor_df = pd.DataFrame(_publication_inventor_df.groupby(['document_number', 'inventor_id', 'location_id']).size()).reset_index()
publication_inventor_df = publication_inventor_df.rename(columns={0: "count"})
publication_inventor_df.sort_values(by='count')

Unnamed: 0,document_number,inventor_id,location_id,count
0,20050000001,fl:ti_ln:goldkind-1,fa3d02fd-09bd-11ec-893a-12de62d610b1,1
11235789,20170146912,fl:ha_ln:mann-2,ab02f7aa-cb8e-11eb-9615-121df0c29c1e,1
11235790,20170146913,fl:sh_ln:hirukawa-3,e85c02d5-cb8f-11eb-9615-121df0c29c1e,1
11235791,20170146913,fl:ta_ln:kudo-41,5c5ac67d-cb8e-11eb-9615-121df0c29c1e,1
11235792,20170146914,fl:no_ln:saito-49,fd46655c-09bc-11ec-893a-12de62d610b1,1
...,...,...,...,...
5461708,20110224034,fl:qi_ln:tu-7,3ce1d531-cb8e-11eb-9615-121df0c29c1e,3
7966937,20140135393,fl:si_ln:roy-10,3c736a08-cb8e-11eb-9615-121df0c29c1e,3
1427052,20060269535,fl:a._ln:naidu-4,f33e7145-cb8e-11eb-9615-121df0c29c1e,3
13400039,20190136670,fl:al_ln:gorbunov-1,e8ee00ec-cb90-11eb-9615-121df0c29c1e,3


## join to application

- many to many? on doc # and location id
    - one application may have multiple inventors 
    - one inventor could have multiple patent applications

In [16]:
# test_merge = pd.merge(application_assignee_df, _publication_inventor_df, 
#                       on=['document_number', 'location_id'], 
#                       how='left', 
#                       indicator='matched', 
#                       validate='many_to_many')

In [17]:
application_assignee_inventor_df = pd.merge(application_assignee_df, _publication_inventor_df, 
                      on=['document_number', 'location_id'], 
                      how='left')

application_assignee_inventor_df

Unnamed: 0,id_x,document_number,type_x,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,...,assignee_id,location_id,count,id_y,type_y,name_first,name_last,organization,inventor_id,sequence
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,...,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,1,10068d52-f4ee-47a2-9950-599af79a6484,2.0,,,"VISA USA, INC.",,
1,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,...,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,3.0,,,TOHOKU UNIVERSITY,fl:da_ln:chiba-1,4.0
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,...,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,3.0,,,TOHOKU UNIVERSITY,fl:fu_ln:narita-1,0.0
3,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,...,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,3.0,,,TOHOKU UNIVERSITY,fl:ma_ln:watanabe-199,3.0
4,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,...,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,3.0,,,TOHOKU UNIVERSITY,fl:ry_ln:onodera-1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4048134,ffffe9d9-ccf8-11ea-ba95-121df0c29c1e,20100070344,utility,12556109,2009-09-09,US,12,SYSTEM AND METHOD FOR CALCULATING AND DISPLAYI...,Embodiments of systems and methods for the agg...,False,...,3341dd6e-43f6-4853-a436-b092a65c4a60,fe664e0e-cb8e-11eb-9615-121df0c29c1e,1,3341dd6e-43f6-4853-a436-b092a65c4a60,2.0,,,Truecar.com,fl:mi_ln:swinson-5,5.0
4048135,ffffe9d9-ccf8-11ea-ba95-121df0c29c1e,20100070344,utility,12556109,2009-09-09,US,12,SYSTEM AND METHOD FOR CALCULATING AND DISPLAYI...,Embodiments of systems and methods for the agg...,False,...,3341dd6e-43f6-4853-a436-b092a65c4a60,fe664e0e-cb8e-11eb-9615-121df0c29c1e,1,3341dd6e-43f6-4853-a436-b092a65c4a60,2.0,,,Truecar.com,fl:to_ln:taira-2,1.0
4048136,fffff581-ccf8-11ea-ba95-121df0c29c1e,20100070349,utility,12517260,2007-11-29,US,12,ROAD TOLL SYSTEM,A road toll system comprises a vehicle-mounted...,False,...,d25fa625-726a-4c37-a21e-7606b802cf50,ee4e6706-cb8f-11eb-9615-121df0c29c1e,1,d25fa625-726a-4c37-a21e-7606b802cf50,3.0,,,NXP B.V.,,
4048137,fffff7d2-ccf8-11ea-ba95-121df0c29c1e,20100070350,utility,12259937,2008-10-28,US,12,DYNAMIC PRICING FOR CONTENT PRESENTATIONS,A request for content is received. First conte...,False,...,2721c099-5a78-45fa-b3a7-11d119300596,ff4c2272-cb8e-11eb-9615-121df0c29c1e,1,2721c099-5a78-45fa-b3a7-11d119300596,2.0,,,Google LLC,,


In [34]:
application_assignee_inventor_df.duplicated().sum()

0

In [35]:
application_assignee_inventor_df.query("application_number == 13483997")

Unnamed: 0,id_x,document_number,type_x,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count,id_y,type_y,name_first,name_last,organization,inventor_id,sequence
1605217,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:da_ln:ma-35,9.0
1605218,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:hu_ln:zhang-251,1.0
1605219,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ji_ln:guo-300,12.0
1605220,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ji_ln:zhou-361,7.0
1605221,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ju_ln:yang-201,8.0
1605222,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:qi_ln:sun-94,2.0
1605223,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:qi_ln:zhao-147,6.0
1605224,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ti_ln:zhang-109,4.0
1605225,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:xi_ln:liu-619,11.0
1605226,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ya_ln:zhao-106,10.0


## now need to join on inventor id for male_flag

In [18]:
_inventor_df.head()

Unnamed: 0,id,name_first,name_last,male_flag,attribution_status
0,00003d9f-9469-11ec-a0af-12de62d610b1,Zéline,HERVIER,,98
1,00003fc5-9469-11ec-a0af-12de62d610b1,Baptiste Romain,LARROUY,,98
2,00008e0f-bdce-11ea-8a73-121df0c29c1e,Paramjit S.,Tappia,1.0,1
3,0000ff22-9469-11ec-a0af-12de62d610b1,Juric,DRAGO DRAGUTIN,,98
4,0000n6xqianutadbzbgzwled7,Eva K.,Mudráné,0.0,1


In [19]:
# test_merge = pd.merge(application_assignee_inventor_df, _inventor_df, 
#                        left_on=['inventor_id'],
#                        right_on = ['id'], 
#                        how='left', 
#                        indicator='matched', 
#                        validate='many_to_many')

In [20]:
application_assignee_inventor_male_flag_df = pd.merge(application_assignee_inventor_df, _inventor_df, 
                        left_on=['inventor_id'],
                        right_on = ['id'], 
                        how='left')

application_assignee_inventor_male_flag_df

Unnamed: 0,id_x,document_number,type_x,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,...,name_first_x,name_last_x,organization,inventor_id,sequence,id,name_first_y,name_last_y,male_flag,attribution_status
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,...,,,"VISA USA, INC.",,,,,,,
1,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,...,,,TOHOKU UNIVERSITY,fl:da_ln:chiba-1,4.0,fl:da_ln:chiba-1,Daiki,Chiba,1.0,1.0
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,...,,,TOHOKU UNIVERSITY,fl:fu_ln:narita-1,0.0,fl:fu_ln:narita-1,Fumio,NARITA,1.0,1.0
3,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,...,,,TOHOKU UNIVERSITY,fl:ma_ln:watanabe-199,3.0,fl:ma_ln:watanabe-199,Masahito,Watanabe,1.0,1.0
4,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,...,,,TOHOKU UNIVERSITY,fl:ry_ln:onodera-1,1.0,fl:ry_ln:onodera-1,Ryuichi,ONODERA,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4048134,ffffe9d9-ccf8-11ea-ba95-121df0c29c1e,20100070344,utility,12556109,2009-09-09,US,12,SYSTEM AND METHOD FOR CALCULATING AND DISPLAYI...,Embodiments of systems and methods for the agg...,False,...,,,Truecar.com,fl:mi_ln:swinson-5,5.0,fl:mi_ln:swinson-5,Mike,Swinson,1.0,1.0
4048135,ffffe9d9-ccf8-11ea-ba95-121df0c29c1e,20100070344,utility,12556109,2009-09-09,US,12,SYSTEM AND METHOD FOR CALCULATING AND DISPLAYI...,Embodiments of systems and methods for the agg...,False,...,,,Truecar.com,fl:to_ln:taira-2,1.0,fl:to_ln:taira-2,Tom,Taira,1.0,1.0
4048136,fffff581-ccf8-11ea-ba95-121df0c29c1e,20100070349,utility,12517260,2007-11-29,US,12,ROAD TOLL SYSTEM,A road toll system comprises a vehicle-mounted...,False,...,,,NXP B.V.,,,,,,,
4048137,fffff7d2-ccf8-11ea-ba95-121df0c29c1e,20100070350,utility,12259937,2008-10-28,US,12,DYNAMIC PRICING FOR CONTENT PRESENTATIONS,A request for content is received. First conte...,False,...,,,Google LLC,,,,,,,


In [27]:
application_assignee_inventor_male_flag_df.groupby(['application_number']).size().sort_values(ascending = False).head(40)

application_number
13483997    133
16472034     73
14353962     69
11780735     59
14117405     56
16962698     54
16528597     48
12871539     48
16608095     48
17045049     47
17035702     45
15021175     43
17002074     42
13125778     42
17022213     42
14342111     41
11767193     40
13820065     39
13809115     39
11780834     38
11817621     38
13978653     37
16474415     36
16936720     36
16343372     36
16481215     36
15753455     36
10596297     36
10580870     36
13534583     35
16329423     35
15517597     34
13462628     34
10598089     34
15343214     33
11577214     33
14247317     33
16228796     33
11584793     33
12669653     33
dtype: int64

In [30]:
application_assignee_inventor_male_flag_df.query("application_number == 13483997")

Unnamed: 0,id_x,document_number,type_x,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count,id_y,type_y,name_first_x,name_last_x,organization,inventor_id,sequence,id,name_first_y,name_last_y,male_flag,attribution_status
1605217,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:da_ln:ma-35,9.0,fl:da_ln:ma-35,Dazhong,Ma,,98.0
1605218,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:hu_ln:zhang-251,1.0,fl:hu_ln:zhang-251,Huaguang,Zhang,1.0,1.0
1605219,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ji_ln:guo-300,12.0,fl:ji_ln:guo-300,Jing,Guo,0.0,1.0
1605220,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ji_ln:zhou-361,7.0,fl:ji_ln:zhou-361,Jianguo,Zhou,1.0,1.0
1605221,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ju_ln:yang-201,8.0,fl:ju_ln:yang-201,Jun,Yang,1.0,1.0
1605222,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:qi_ln:sun-94,2.0,fl:qi_ln:sun-94,Qiuye,Sun,,99.0
1605223,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:qi_ln:zhao-147,6.0,fl:qi_ln:zhao-147,Qingqi,Zhao,1.0,1.0
1605224,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ti_ln:zhang-109,4.0,fl:ti_ln:zhang-109,Tieyan,Zhang,,99.0
1605225,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:xi_ln:liu-619,11.0,fl:xi_ln:liu-619,Xinrui,Liu,0.0,1.0
1605226,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,0fc43b55-98a2-47b1-9527-3c34069dc58a,d9942a52-cb8e-11eb-9615-121df0c29c1e,1,0fc43b55-98a2-47b1-9527-3c34069dc58a,5.0,Xinrui,Liu,,fl:ya_ln:zhao-106,10.0,fl:ya_ln:zhao-106,Yan,Zhao,1.0,1.0


In [21]:
application_assignee_inventor_male_flag_df.columns

Index(['id_x', 'document_number', 'type_x', 'application_number', 'date',
       'country', 'series_code', 'invention_title', 'invention_abstract',
       'rule_47_flag', 'filename', 'year', 'assignee_id', 'location_id',
       'count', 'id_y', 'type_y', 'name_first_x', 'name_last_x',
       'organization', 'inventor_id', 'sequence', 'id', 'name_first_y',
       'name_last_y', 'male_flag', 'attribution_status'],
      dtype='object')

## Locations

- fix by adding geoID

In [22]:
_location_df.head()

Unnamed: 0,id,city,state,country,latitude,longitude,county,state_fips,county_fips
0,000016d0-cb8f-11eb-9615-121df0c29c1e,Saishi,,JP,,,,,
1,00006da3-cb90-11eb-9615-121df0c29c1e,Alder,MT,US,45.3247,-112.108,Madison,30.0,30057.0
2,00007faf-cb91-11eb-9615-121df0c29c1e,Kentifield,CA,US,,,,6.0,
3,0001b27a-09bc-11ec-893a-12de62d610b1,Bonigen,,CH,46.6774,7.91959,,,
4,00022720-cb8f-11eb-9615-121df0c29c1e,lwanuma,,JP,,,,,


In [23]:
# print(_location_df.query("country == 'US'").county_fips.isnull().sum())
print(_location_df.query("country == 'US' & city.isnull() & state.isnull() & county.isnull() ", engine="python").shape[0])



1


In [24]:
_location_df.county_fips

0             NaN
1         30057.0
2             NaN
3             NaN
4             NaN
           ...   
242921        NaN
242922        NaN
242923        NaN
242924        NaN
242925        NaN
Name: county_fips, Length: 242926, dtype: float64