In [1]:
import pandas as pd
import numpy as np

In [2]:
# Notebook-wide display settings: show up to 150 rows and never hide columns.
for option_name, option_value in (
    ('display.max_rows', 150),
    ('display.max_columns', None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the raw pre-grant tables from ./data/pregrant.
# Frames whose names start with an underscore are the untouched inputs; later
# cells derive new frames from them.

applications_file = r"./data/pregrant/application.tsv"
_applications_df = pd.read_csv(applications_file, sep='\t')

inventor_file = r"./data/pregrant/inventor.tsv"
_inventor_df = pd.read_csv(inventor_file, sep='\t')

assignee_file = r"./data/pregrant/assignee.tsv"
_assignee_df = pd.read_csv(assignee_file, sep='\t')

location_file = r"./data/pregrant/location.tsv"
_location_df = pd.read_csv(location_file, sep='\t')

publication_assignee_file = r"./data/pregrant/publication_assignee.tsv"
_publication_assignee_df = pd.read_csv(publication_assignee_file, sep='\t')

publication_inventor_file = r"./data/pregrant/publication_inventor.tsv"
_publication_inventor_df = pd.read_csv(publication_inventor_file, sep='\t')

# Comma-separated, unlike the .tsv inputs. low_memory=False makes pandas infer
# each column's dtype from the whole file rather than chunk by chunk.
pre_grant_locations_file = "./data/pregrant/location_crosswalk.csv"
pre_grant_locations_df = pd.read_csv(pre_grant_locations_file, low_memory=False)

# location.tsv used to be parsed a second time under these names. Both names
# are kept for the cells that reference them, but they now share the single
# frame loaded above (same object: an in-place change to one is visible
# through the other).
locations_file = location_file
_locations_df = _location_df




## Application Data

In [4]:
# Keep only applications that have a `date`, then derive an integer `year`
# from the first four characters of the date string.
# Done as one chain that returns a new frame, so `year` is never assigned onto
# a boolean-filtered slice (the original pattern raises SettingWithCopyWarning).
_applications_df = (
    _applications_df
    .dropna(subset=['date'])
    .assign(year=lambda frame: frame['date'].str[:4].astype('int'))
)

_applications_df.head()

Unnamed: 0,id,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year
0,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020
1,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018
3,00003dee-4904-11ec-b427-12de62d610b1,20210259523,utility,17316500,2021-05-10,US,17,MEDICAL IMAGING DEVICE WITH A TELESCOPIC SCOPE,The subject matter discloses a medical imaging...,False,,2021
4,00004fba-ccf9-11ea-ba95-121df0c29c1e,20100070355,utility,12623189,2009-11-20,US,12,Methods for Transmitting Multimedia Files and ...,The invention is directed to a method of trans...,False,ipa100318.xml,2009


In [5]:
# _applications_df.year.value_counts()

## publication_assignee crosswalk

In [6]:
# Preview the first rows of the raw publication_assignee table.
_publication_assignee_df.head()

Unnamed: 0,document_number,assignee_id,sequence,location_id
0,20050000014,b27df54c-0a0d-4ae9-98c7-ccd172eb0c0e,1,f54d6149-cb8e-11eb-9615-121df0c29c1e
1,20050000023,1293d184-e3b2-467d-95f3-02fb4473cf6e,1,cfe77bc3-cb8e-11eb-9615-121df0c29c1e
2,20050000025,6f07167a-0983-4bb7-b7dc-0cd391e6c279,1,d6338035-cb8e-11eb-9615-121df0c29c1e
3,20050000029,417a65f9-a4c7-4061-964f-ffd992ce74bd,1,3cb80671-cb8e-11eb-9615-121df0c29c1e
4,20050000031,417a65f9-a4c7-4061-964f-ffd992ce74bd,1,3cb80671-cb8e-11eb-9615-121df0c29c1e


In [7]:
# Fix the one-to-many problem: collapse repeated
# (document_number, assignee_id, location_id) rows into a single row each.
# `count` is the number of raw rows that shared that key.
# NOTE(review): groupby drops rows whose key columns are NaN by default, so
# links with a missing location_id would not survive this step - confirm
# that is intended.
publication_assignee_df = (
    _publication_assignee_df
    .groupby(['document_number', 'assignee_id', 'location_id'])
    .size()
    .reset_index(name="count")
)
# Show the most-repeated keys first.
publication_assignee_df.sort_values(by='count', ascending=False)

Unnamed: 0,document_number,assignee_id,location_id,count
1561380,20130067161,ae107234-b77a-4d73-97bc-ae1316ece835,e8e360fe-cb8e-11eb-9615-121df0c29c1e,5
1001823,20100254991,93c50a7b-1b47-4b8c-9474-7497967beaed,3e7b68e2-cb8e-11eb-9615-121df0c29c1e,4
406839,20070251083,ee88a895-514d-40bb-b9a6-db8fb20ad697,3cb80671-cb8e-11eb-9615-121df0c29c1e,4
1542091,20130041234,1219f1dd-7ce7-40fc-ae46-ac91aaf6a02c,e7275ca8-cb8f-11eb-9615-121df0c29c1e,4
2363595,20170119177,4e8fc194-af9a-41b8-931c-cd3ef3db9c11,fff50a17-cb8f-11eb-9615-121df0c29c1e,3
...,...,...,...,...
990773,20100238976,c08bdcaa-2331-4fa7-9e90-e5a4a659fc73,ec16f9be-cb90-11eb-9615-121df0c29c1e,1
990774,20100238977,80306c57-ba73-4e88-97c6-81b918f2dcf0,fe4bbe3c-cb8f-11eb-9615-121df0c29c1e,1
990775,20100238978,e77c94a4-d0b0-449e-be41-898f5f1bbc04,fdb12d00-cb8f-11eb-9615-121df0c29c1e,1
990776,20100238982,78b461d5-10ed-4ae2-b411-380977df8d07,fd46655c-09bc-11ec-893a-12de62d610b1,1


In [8]:
# Attach the de-duplicated assignee links to each application.
# The inner join keeps only applications that have at least one assignee link;
# validate='one_to_many' makes the merge fail if document_number is not unique
# on the applications side.
# The merge used to request an indicator column and drop it on the next line;
# on an inner join it is always "both", so it is no longer built.
application_publication_assignee_df = pd.merge(
    _applications_df,
    publication_assignee_df,
    on=['document_number'],
    how='inner',
    validate='one_to_many',
)

application_publication_assignee_df.head()

Unnamed: 0,id,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,1
1,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,b4401241-2dd1-46d2-af15-8d91b699e960,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1
3,00004fba-ccf9-11ea-ba95-121df0c29c1e,20100070355,utility,12623189,2009-11-20,US,12,Methods for Transmitting Multimedia Files and ...,The invention is directed to a method of trans...,False,ipa100318.xml,2009,c0129a05-4813-44df-871d-205e59aa0bf7,fd10d2b1-cb8e-11eb-9615-121df0c29c1e,1
4,0000680f-4904-11ec-b427-12de62d610b1,20210259524,utility,17319550,2021-05-13,US,17,"ENDOSCOPE HEAD, ENDOSCOPE AND ALBARRAN LEVER H...",The invention refers to an attachment for an e...,False,,2021,68671a0c-3274-4382-8c88-40b8cbb31765,cfca1ca8-cb90-11eb-9615-121df0c29c1e,1


## Assignees


In [9]:
# Preview the first rows of the raw assignee table.
_assignee_df.head()

Unnamed: 0,id,type,name_first,name_last,organization
0,000049db-1cf0-4735-8a71-5a44b04859a6,2.0,,,"Atlantech International, Inc."
1,000074b2-2134-459b-a481-26c72c2313ae,3.0,,,Shanghai Juge Electronics Technologies Co. Ltd.
2,00007cb5-f03a-4547-8b7a-dd97dd959ef8,2.0,,,UrVibe LLC
3,0000c3c1-5ce8-4f80-8e21-d263467ab1fd,4.0,Wiley L.,"Day, Jr.",
4,0000c853-052b-4c30-acea-0c64e371349c,2.0,,,"Ringertown Innovations, LLC"


In [10]:

# Print the (rows, columns) size of the raw assignee table.
print(_assignee_df.shape)
# Disabled: drop assignees that are not organizations (null `organization`).
#assignee_df = _assignee_df[~_assignee_df.organization.isnull()]
#print(assignee_df.shape)
_assignee_df.head()

(540183, 5)


Unnamed: 0,id,type,name_first,name_last,organization
0,000049db-1cf0-4735-8a71-5a44b04859a6,2.0,,,"Atlantech International, Inc."
1,000074b2-2134-459b-a481-26c72c2313ae,3.0,,,Shanghai Juge Electronics Technologies Co. Ltd.
2,00007cb5-f03a-4547-8b7a-dd97dd959ef8,2.0,,,UrVibe LLC
3,0000c3c1-5ce8-4f80-8e21-d263467ab1fd,4.0,Wiley L.,"Day, Jr.",
4,0000c853-052b-4c30-acea-0c64e371349c,2.0,,,"Ringertown Innovations, LLC"


In [11]:
# There should be a single row per assignee id, but this data is not always
# clean, so check it.
# assignee_df = pd.DataFrame(_assignee_df.groupby(['id']).size()).reset_index()
# Number of rows whose `id` repeats an earlier row; 0 means every id is unique.
_assignee_df.id.duplicated().sum()
#pd.DataFrame(_assignee_df.groupby(['id']).size()).value_counts()

0

In [12]:
# Attach assignee attributes (type, names, organization) to every
# application/assignee row.
# Both inputs have `id` and `type` columns, so the result carries pandas'
# default suffixes: id_x/type_x come from the application, id_y/type_y from
# the assignee. Later cells select `type_x`, so the suffixes are kept.
# validate='many_to_one' makes the merge fail if an assignee id ever appears
# more than once, instead of silently duplicating application rows.
application_assignee_df = pd.merge(
    application_publication_assignee_df,
    _assignee_df,
    left_on=['assignee_id'],
    right_on=['id'],
    how='left',
    validate='many_to_one',
)

# Disabled: drop rows whose assignee_id had no match in the assignee table.
# application_assignee_df = application_assignee_df[~application_assignee_df.id_y.isnull()]
application_assignee_df

Unnamed: 0,id_x,document_number,type_x,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count,id_y,type_y,name_first,name_last,organization
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,1,10068d52-f4ee-47a2-9950-599af79a6484,2.0,,,"VISA USA, INC."
1,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,3.0,,,TOHOKU UNIVERSITY
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,b4401241-2dd1-46d2-af15-8d91b699e960,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,b4401241-2dd1-46d2-af15-8d91b699e960,3.0,,,"TOHOKU STEEL CO., LTD."
3,00004fba-ccf9-11ea-ba95-121df0c29c1e,20100070355,utility,12623189,2009-11-20,US,12,Methods for Transmitting Multimedia Files and ...,The invention is directed to a method of trans...,False,ipa100318.xml,2009,c0129a05-4813-44df-871d-205e59aa0bf7,fd10d2b1-cb8e-11eb-9615-121df0c29c1e,1,c0129a05-4813-44df-871d-205e59aa0bf7,3.0,,,Clarity Pharmaceuticals Ltd
4,0000680f-4904-11ec-b427-12de62d610b1,20210259524,utility,17319550,2021-05-13,US,17,"ENDOSCOPE HEAD, ENDOSCOPE AND ALBARRAN LEVER H...",The invention refers to an attachment for an e...,False,,2021,68671a0c-3274-4382-8c88-40b8cbb31765,cfca1ca8-cb90-11eb-9615-121df0c29c1e,1,68671a0c-3274-4382-8c88-40b8cbb31765,3.0,,,DIGITAL ENDOSCOPY GMBH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2969382,ffffe76f-ccf8-11ea-ba95-121df0c29c1e,20100070343,utility,12556076,2009-09-09,US,12,"SYSTEM AND METHOD FOR AGGREGATION, ANALYSIS, P...",Embodiments of systems and methods for the agg...,False,ipa100318.xml,2009,3341dd6e-43f6-4853-a436-b092a65c4a60,fe664e0e-cb8e-11eb-9615-121df0c29c1e,1,3341dd6e-43f6-4853-a436-b092a65c4a60,2.0,,,Truecar.com
2969383,ffffe9d9-ccf8-11ea-ba95-121df0c29c1e,20100070344,utility,12556109,2009-09-09,US,12,SYSTEM AND METHOD FOR CALCULATING AND DISPLAYI...,Embodiments of systems and methods for the agg...,False,ipa100318.xml,2009,3341dd6e-43f6-4853-a436-b092a65c4a60,fe664e0e-cb8e-11eb-9615-121df0c29c1e,1,3341dd6e-43f6-4853-a436-b092a65c4a60,2.0,,,Truecar.com
2969384,fffff581-ccf8-11ea-ba95-121df0c29c1e,20100070349,utility,12517260,2007-11-29,US,12,ROAD TOLL SYSTEM,A road toll system comprises a vehicle-mounted...,False,ipa100318.xml,2007,d25fa625-726a-4c37-a21e-7606b802cf50,ee4e6706-cb8f-11eb-9615-121df0c29c1e,1,d25fa625-726a-4c37-a21e-7606b802cf50,3.0,,,NXP B.V.
2969385,fffff7d2-ccf8-11ea-ba95-121df0c29c1e,20100070350,utility,12259937,2008-10-28,US,12,DYNAMIC PRICING FOR CONTENT PRESENTATIONS,A request for content is received. First conte...,False,ipa100318.xml,2008,2721c099-5a78-45fa-b3a7-11d119300596,ff4c2272-cb8e-11eb-9615-121df0c29c1e,1,2721c099-5a78-45fa-b3a7-11d119300596,2.0,,,Google LLC


In [13]:
#all have assignees
#application_assignee_df.assignee_id.isnull().sum()

In [14]:
# Preview the first rows of the application/assignee join.
application_assignee_df.head()

Unnamed: 0,id_x,document_number,type_x,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count,id_y,type_y,name_first,name_last,organization
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,1,10068d52-f4ee-47a2-9950-599af79a6484,2.0,,,"VISA USA, INC."
1,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,7b2f1943-78fd-4a9f-9c3e-d32ca19cc371,3.0,,,TOHOKU UNIVERSITY
2,00001b62-f3c4-11eb-b0cf-121df0c29c1e,20210172812,utility,16623118,2018-04-20,US,16,"ENERGY CONVERTER, VIBRATION POWER GENERATOR, F...",An energy converter is formed by bonding a sol...,False,,2018,b4401241-2dd1-46d2-af15-8d91b699e960,baa6fcdc-cb8e-11eb-9615-121df0c29c1e,1,b4401241-2dd1-46d2-af15-8d91b699e960,3.0,,,"TOHOKU STEEL CO., LTD."
3,00004fba-ccf9-11ea-ba95-121df0c29c1e,20100070355,utility,12623189,2009-11-20,US,12,Methods for Transmitting Multimedia Files and ...,The invention is directed to a method of trans...,False,ipa100318.xml,2009,c0129a05-4813-44df-871d-205e59aa0bf7,fd10d2b1-cb8e-11eb-9615-121df0c29c1e,1,c0129a05-4813-44df-871d-205e59aa0bf7,3.0,,,Clarity Pharmaceuticals Ltd
4,0000680f-4904-11ec-b427-12de62d610b1,20210259524,utility,17319550,2021-05-13,US,17,"ENDOSCOPE HEAD, ENDOSCOPE AND ALBARRAN LEVER H...",The invention refers to an attachment for an e...,False,,2021,68671a0c-3274-4382-8c88-40b8cbb31765,cfca1ca8-cb90-11eb-9615-121df0c29c1e,1,68671a0c-3274-4382-8c88-40b8cbb31765,3.0,,,DIGITAL ENDOSCOPY GMBH


In [15]:
# Distinct location_ids referenced by the application/assignee rows, as a
# one-column frame. MHK will use this list for geocoding.
assignee_location_ids = pd.DataFrame(
    {"location_id": application_assignee_df['location_id'].unique()}
)
# assignee_location_ids.shape




### Merge this again with location.

In [16]:
# Preview the first rows of the location crosswalk (location id -> GEOID).
pre_grant_locations_df.head()

Unnamed: 0,id,GEOID,pv_city,gl_city,pv_state,gl_state,country,pv_lat,gl_lat,pv_long,gl_long,pv_county,gl_county,pv_state_fips,pv_county_fips,gl_county_fips
0,f9139cb2-cb8f-11eb-9615-121df0c29c1e,6075,San Francisco,,CA,,US,37.7292,,-123.047,,San Francisco,,6.0,6075.0,
1,ffc9f8bd-cb8e-11eb-9615-121df0c29c1e,32031,Reno,,NV,,US,39.5504,,-119.803,,Washoe,,32.0,32031.0,
2,9596257a-cb90-11eb-9615-121df0c29c1e,50021,Rutland,,VT,,US,43.6106,,-72.9726,,Rutland,,50.0,50021.0,
3,499f8deb-cb8e-11eb-9615-121df0c29c1e,34003,Parkridge,Park Ridge,NJ,NJ,US,,-74.03939,,41.03379,,Bergen,34.0,,34003.0
4,ea59a211-09be-11ec-893a-12de62d610b1,6085,Sunnyvale,,CA,,US,37.3688,,-122.036,,Santa Clara,,6.0,6085.0,


## One thing to note is that I dropped all locations that were not in the United States. However, there are a lot of US patents with assignees outside of the US.


In [17]:
# test_merge = pd.merge(application_assignee_df, pre_grant_locations_df, 
#                       left_on=['location_id'], 
#                       right_on=['id'], 
#                       how='left', 
#                       indicator='matched', 
#                       validate='many_to_one')

# print(test_merge.query("matched=='both'").shape)
# shape1 = test_merge.query("matched=='both'").shape

In [18]:
# test_merge = pd.merge(application_assignee_df, _locations_df, 
#                       left_on=['location_id'], 
#                       right_on=['id'], 
#                       how='left', 
#                       indicator='matched', 
#                       validate='many_to_one')

# print(test_merge.query("matched=='both'").shape)
# shape2 = test_merge.query("matched=='both'").query("country_y=='US'").shape

In [19]:
#test_merge.head()

In [20]:
# print(shape1)
# print(shape2)
# print(shape2[0]-shape1[0])

In [21]:
# Attach crosswalk attributes (GEOID, city/state/county fields) to each
# application/assignee row via its location_id. The left join keeps rows whose
# location is missing from the crosswalk; their GEOID is NaN.
# validate='many_to_one' makes the merge fail if a crosswalk id is repeated,
# which would otherwise silently duplicate application rows.
application_assignee_location_df = pd.merge(
    application_assignee_df,
    pre_grant_locations_df,
    left_on=['location_id'],
    right_on=['id'],
    how='left',
    validate='many_to_one',
)

In [22]:
#application_assignee_location_df['GEOID'].head()

In [23]:
# Size of the full join, then of the subset that resolved to a GEOID.
print(application_assignee_location_df.shape)
# Disabled variant: additionally require a non-null organization.
# application_assignee_location_GEOIDS_df = application_assignee_location_df.query("GEOID.notnull() & organization.notnull()", engine="python").copy()
# Keep only rows with a GEOID; .copy() detaches the result from the source frame.
application_assignee_location_GEOIDS_df = (
    application_assignee_location_df
    .loc[lambda frame: frame['GEOID'].notnull()]
    .copy()
)
print(application_assignee_location_GEOIDS_df.shape)
application_assignee_location_GEOIDS_df.head()

(2969387, 36)
(1034622, 36)


Unnamed: 0,id_x,document_number,type_x,application_number,date,country_x,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,assignee_id,location_id,count,id_y,type_y,name_first,name_last,organization,id,GEOID,pv_city,gl_city,pv_state,gl_state,country_y,pv_lat,gl_lat,pv_long,gl_long,pv_county,gl_county,pv_state_fips,pv_county_fips,gl_county_fips
0,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,1,10068d52-f4ee-47a2-9950-599af79a6484,2.0,,,"VISA USA, INC.",f9139cb2-cb8f-11eb-9615-121df0c29c1e,6075,San Francisco,,CA,,US,37.7292,,-123.047,,San Francisco,,6.0,6075.0,
7,00007a2d-ccf9-11ea-ba95-121df0c29c1e,20100070357,utility,12334277,2008-12-12,US,12,INCENTIVE BASED MARKETING THROUGH SOCIAL NETWORKS,A method and system for providing an incentive...,False,ipa100318.xml,2008,c912d9d3-f86c-4c2c-8bc9-c91a9b4511ab,ffc9f8bd-cb8e-11eb-9615-121df0c29c1e,1,c912d9d3-f86c-4c2c-8bc9-c91a9b4511ab,2.0,,,"AT&T Intellectual Property I, L.P.",ffc9f8bd-cb8e-11eb-9615-121df0c29c1e,32031,Reno,,NV,,US,39.5504,,-119.803,,Washoe,,32.0,32031.0,
8,00007d04-ccf9-11ea-ba95-121df0c29c1e,20100070358,utility,12561091,2009-09-16,US,12,REC CREDIT DISTRIBUTION SYSTEM AND METHOD,A method for promoting recycling from a fund e...,False,ipa100318.xml,2009,c1bd445e-3bcd-4b27-855a-651207bb56e3,9596257a-cb90-11eb-9615-121df0c29c1e,1,c1bd445e-3bcd-4b27-855a-651207bb56e3,2.0,,,"Casella Waste Systems, Inc.",9596257a-cb90-11eb-9615-121df0c29c1e,50021,Rutland,,VT,,US,43.6106,,-72.9726,,Rutland,,50.0,50021.0,
10,0000827d-ccf9-11ea-ba95-121df0c29c1e,20100070360,utility,12339981,2008-12-19,US,12,SYSTEM AND METHOD FOR CREATING A SPEECH SEARCH...,"Disclosed herein are systems, methods, and com...",False,ipa100318.xml,2008,c912d9d3-f86c-4c2c-8bc9-c91a9b4511ab,ffc9f8bd-cb8e-11eb-9615-121df0c29c1e,1,c912d9d3-f86c-4c2c-8bc9-c91a9b4511ab,2.0,,,"AT&T Intellectual Property I, L.P.",ffc9f8bd-cb8e-11eb-9615-121df0c29c1e,32031,Reno,,NV,,US,39.5504,,-119.803,,Washoe,,32.0,32031.0,
11,00008c11-ccf9-11ea-ba95-121df0c29c1e,20100070363,utility,12623282,2009-11-20,US,12,INTERNET STRAWMAN AND USER INTERFACE THEREFOR,A computer implemented method for facilitating...,False,ipa100318.xml,2009,89e5d47d-a6c8-4dbc-b190-2a0bb9fb5970,499f8deb-cb8e-11eb-9615-121df0c29c1e,1,89e5d47d-a6c8-4dbc-b190-2a0bb9fb5970,2.0,,,SONY ELECTRONICS INC.,499f8deb-cb8e-11eb-9615-121df0c29c1e,34003,Parkridge,Park Ridge,NJ,NJ,US,,-74.03939,,41.03379,,Bergen,34.0,,34003.0


In [24]:
# Narrow the geocoded join to the columns needed downstream.
# `type_x` is the application's type (the suffix comes from the assignee merge).
# NOTE(review): GEOID prints as 6075 in the filter cell's output but as 06075
# when this frame is displayed later - confirm where the zero-padding happens.
denormalized_application_assignee = application_assignee_location_GEOIDS_df[
    [
        'document_number',
        'type_x',
        'application_number',
        'year',
        'assignee_id',
        'location_id',
        'GEOID',
        'organization',
    ]
]

In [73]:
#denormalized_application_assignee.query("application_number == 16919476")

Unnamed: 0,document_number,type_x,application_number,year,assignee_id,location_id,GEOID,organization


## Inventors

## To Do:

* Finish inventor crosswalks.
* Combine assignee and inventor crosswalks.
* Aggregate assignee and inventor crosswalks.
* Combine with granted data

What do we need:
* Assignee information
    * 
* Inventor information
    * Gender counts
    * Number of inventors
    * Team size
    * 



In [26]:
# Preview the first rows of the raw publication_inventor table.
_publication_inventor_df.head()

Unnamed: 0,document_number,inventor_id,sequence,location_id
0,20050000001,fl:ti_ln:goldkind-1,1,fa3d02fd-09bd-11ec-893a-12de62d610b1
1,20050000002,fl:je_ln:levy-10,2,b778d60a-cb8e-11eb-9615-121df0c29c1e
2,20050000002,fl:ph_ln:levy-4,1,f8b3a9cd-cb90-11eb-9615-121df0c29c1e
3,20050000003,9958fa19-3b0f-11eb-a3cd-121df0c29c1e,1,4c4e3991-cb8e-11eb-9615-121df0c29c1e
4,20050000004,fl:da_ln:yun-29,1,efa16d0b-cb8f-11eb-9615-121df0c29c1e


In [27]:
# Fix the one-to-many problem: one row per (document_number, inventor_id).
# location_id is deliberately left out of the key, so an inventor listed at
# several locations on the same document collapses to a single row; `count`
# is the number of raw rows that were merged.
# Earlier variant that also grouped by location:
#publication_inventor_df = pd.DataFrame(_publication_inventor_df.groupby(['document_number', 'inventor_id', 'location_id']).size()).reset_index()
publication_inventor_df = (
    _publication_inventor_df
    .groupby(['document_number', 'inventor_id'])
    .size()
    .reset_index(name="count")
)
# Ascending sort: the most-repeated pairs appear at the bottom.
publication_inventor_df.sort_values(by='count')

# df.groupby(['name','month'])['text'].apply(lambda x: ','.join(x)).reset_index()   


Unnamed: 0,document_number,inventor_id,count
0,20050000001,fl:ti_ln:goldkind-1,1
11297698,20170168522,fl:jo_ln:morris-47,1
11297699,20170168522,fl:ro_ln:kapinos-9,1
11297700,20170168522,fl:sc_ln:li-11,1
11297701,20170168523,fl:ar_ln:ardakani-1,1
...,...,...,...
5461135,20110224034,fl:qi_ln:tu-7,3
8864468,20150076727,fl:ge_ln:rocha-2,3
9054114,20150143865,fl:ji_ln:griffin-1,3
2618238,20080137033,fl:wi_ln:padula-1,3


## join to application

- many to many? on doc # and location id
    - one application may have multiple inventors 
    - one inventor could have multiple patent applications

In [28]:
# test_merge = pd.merge(application_assignee_df, _publication_inventor_df, 
#                       on=['document_number', 'location_id'], 
#                       how='left', 
#                       indicator='matched', 
#                       validate='many_to_many')

In [29]:
# Disabled: join inventors onto the application/assignee frame by document and location.
# application_assignee_inventor_df = pd.merge(application_assignee_df, _publication_inventor_df,
#                       on=['document_number', 'location_id'],
#                       how='left')

# application_assignee_inventor_df

# Merge inventors onto the application data only; no assignees yet.
# This uses the grouped publication_inventor_df rather than the raw
# _publication_inventor_df, so an inventor with multiple locations on one
# document contributes a single row. The left join keeps applications that
# have no inventor rows.
application_inventor_df = _applications_df.merge(
    publication_inventor_df,
    how='left',
    on=['document_number'],
)

application_inventor_df

Unnamed: 0,id,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,inventor_id,count
0,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:br_ln:nielsen-22,1
1,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:ch_ln:hahnemann-1,1
2,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:he_ln:frengler-1,1
3,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:li_ln:ubbesen-1,1
4,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,fl:ed_ln:fordyce-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16676951,fffff7d2-ccf8-11ea-ba95-121df0c29c1e,20100070350,utility,12259937,2008-10-28,US,12,DYNAMIC PRICING FOR CONTENT PRESENTATIONS,A request for content is received. First conte...,False,ipa100318.xml,2008,fl:mi_ln:hochberg-2,1
16676952,fffffadc-ccf8-11ea-ba95-121df0c29c1e,20100070351,utility,12448114,2008-04-14,US,12,ELECTRONIC ADVERTISEMENT METHOD AND SYSTEM USI...,The present invention is related to an electro...,False,ipa100318.xml,2008,fl:do_ln:kang-212,1
16676953,fffffd13-4903-11ec-b427-12de62d610b1,20210259521,utility,17165645,2021-02-02,US,17,CONTROLLER FOR SELECTIVELY CONTROLLING MANUAL ...,A system configured to control an endoscope pr...,False,,2021,fl:ch_ln:hwang-196,1
16676954,fffffd62-ccf8-11ea-ba95-121df0c29c1e,20100070352,utility,12560644,2009-09-16,US,12,Consumer incentive system and method,A method and system to enhance the relationshi...,False,ipa100318.xml,2009,fl:wi_ln:flanders-2,1


In [30]:
# Count rows that are exact duplicates of an earlier row across all columns;
# 0 means the join produced no fully repeated rows.
application_inventor_df.duplicated().sum()

0

In [31]:
# Sanity check: list every inventor row for one known application.
application_inventor_df[application_inventor_df['application_number'] == 13483997]

Unnamed: 0,id,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,inventor_id,count
6777001,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:da_ln:ma-35,1
6777002,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:hu_ln:zhang-251,1
6777003,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:ji_ln:guo-300,1
6777004,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:ji_ln:zhou-361,1
6777005,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:ju_ln:yang-201,1
6777006,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:qi_ln:sun-94,1
6777007,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:qi_ln:zhao-147,1
6777008,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:ti_ln:zhang-109,1
6777009,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:xi_ln:liu-619,1
6777010,6627fd1a-bca7-11ea-8a73-121df0c29c1e,20130054204,utility,13483997,2012-05-30,US,13,AUTOMATIC THREE-PHASE UNBALANCED LOAD COMPENSA...,Disclosed are an automatic three-phase unbalan...,False,ipa130228.xml,2012,fl:xi_ln:meng-161,1


## now need to join on inventor id for male_flag

In [32]:
# Preview the first rows of the raw inventor table (names, male_flag, attribution_status).
_inventor_df.head()

Unnamed: 0,id,name_first,name_last,male_flag,attribution_status
0,00003d9f-9469-11ec-a0af-12de62d610b1,Zéline,HERVIER,,98
1,00003fc5-9469-11ec-a0af-12de62d610b1,Baptiste Romain,LARROUY,,98
2,00008e0f-bdce-11ea-8a73-121df0c29c1e,Paramjit S.,Tappia,1.0,1
3,0000ff22-9469-11ec-a0af-12de62d610b1,Juric,DRAGO DRAGUTIN,,98
4,0000n6xqianutadbzbgzwled7,Eva K.,Mudráné,0.0,1


In [67]:
# (rows, columns) of the raw inventor table.
_inventor_df.shape

(5363671, 5)

In [66]:
# Frequency of each attribution_status code.
# NOTE(review): in the rows previewed in this notebook, status 98 coincides
# with a missing male_flag - confirm what codes 1/98/99 mean against the data
# dictionary.
_inventor_df.attribution_status.value_counts()

1     4444558
98     552431
99     366682
Name: attribution_status, dtype: int64

In [33]:
# test_merge = pd.merge(application_assignee_inventor_df, _inventor_df, 
#                        left_on=['inventor_id'],
#                        right_on = ['id'], 
#                        how='left', 
#                        indicator='matched', 
#                        validate='many_to_many')

In [34]:
# Attach inventor attributes (name_first, name_last, male_flag,
# attribution_status) to each application/inventor row.
# Both inputs have an `id` column, so the result holds id_x (application id)
# and id_y (inventor id). The left join keeps rows with no matching inventor.
application_inventor_male_flag_df = application_inventor_df.merge(
    _inventor_df,
    how='left',
    left_on=['inventor_id'],
    right_on=['id'],
)

application_inventor_male_flag_df

Unnamed: 0,id_x,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,inventor_id,count,id_y,name_first,name_last,male_flag,attribution_status
0,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:br_ln:nielsen-22,1,fl:br_ln:nielsen-22,Brian,NIELSEN,1.0,1
1,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:ch_ln:hahnemann-1,1,fl:ch_ln:hahnemann-1,Christina,Hahnemann,,98
2,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:he_ln:frengler-1,1,fl:he_ln:frengler-1,Henrik,FRENGLER,1.0,1
3,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:li_ln:ubbesen-1,1,fl:li_ln:ubbesen-1,Line Sandahl,UBBESEN,1.0,1
4,0000021c-ccf9-11ea-ba95-121df0c29c1e,20100070354,utility,12412361,2009-03-27,US,12,SYSTEM AND METHOD FOR A MERCHANT DEBIT CARD PR...,A merchant debit card program is described tha...,False,ipa100318.xml,2009,fl:ed_ln:fordyce-1,1,fl:ed_ln:fordyce-1,Edward W.,"Fordyce, III",1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16676951,fffff7d2-ccf8-11ea-ba95-121df0c29c1e,20100070350,utility,12259937,2008-10-28,US,12,DYNAMIC PRICING FOR CONTENT PRESENTATIONS,A request for content is received. First conte...,False,ipa100318.xml,2008,fl:mi_ln:hochberg-2,1,fl:mi_ln:hochberg-2,Michael,Hochberg,1.0,1
16676952,fffffadc-ccf8-11ea-ba95-121df0c29c1e,20100070351,utility,12448114,2008-04-14,US,12,ELECTRONIC ADVERTISEMENT METHOD AND SYSTEM USI...,The present invention is related to an electro...,False,ipa100318.xml,2008,fl:do_ln:kang-212,1,fl:do_ln:kang-212,Dong-Kyun,Kang,1.0,1
16676953,fffffd13-4903-11ec-b427-12de62d610b1,20210259521,utility,17165645,2021-02-02,US,17,CONTROLLER FOR SELECTIVELY CONTROLLING MANUAL ...,A system configured to control an endoscope pr...,False,,2021,fl:ch_ln:hwang-196,1,fl:ch_ln:hwang-196,Charles,Hwang,1.0,1
16676954,fffffd62-ccf8-11ea-ba95-121df0c29c1e,20100070352,utility,12560644,2009-09-16,US,12,Consumer incentive system and method,A method and system to enhance the relationshi...,False,ipa100318.xml,2009,fl:wi_ln:flanders-2,1,fl:wi_ln:flanders-2,William Henry,Flanders,1.0,1


In [35]:
# Display the geocoded application/assignee table for reference.
denormalized_application_assignee

Unnamed: 0,document_number,type_x,application_number,year,assignee_id,location_id,GEOID,organization
0,20100070354,utility,12412361,2009,10068d52-f4ee-47a2-9950-599af79a6484,f9139cb2-cb8f-11eb-9615-121df0c29c1e,06075,"VISA USA, INC."
7,20100070357,utility,12334277,2008,c912d9d3-f86c-4c2c-8bc9-c91a9b4511ab,ffc9f8bd-cb8e-11eb-9615-121df0c29c1e,32031,"AT&T Intellectual Property I, L.P."
8,20100070358,utility,12561091,2009,c1bd445e-3bcd-4b27-855a-651207bb56e3,9596257a-cb90-11eb-9615-121df0c29c1e,50021,"Casella Waste Systems, Inc."
10,20100070360,utility,12339981,2008,c912d9d3-f86c-4c2c-8bc9-c91a9b4511ab,ffc9f8bd-cb8e-11eb-9615-121df0c29c1e,32031,"AT&T Intellectual Property I, L.P."
11,20100070363,utility,12623282,2009,89e5d47d-a6c8-4dbc-b190-2a0bb9fb5970,499f8deb-cb8e-11eb-9615-121df0c29c1e,34003,SONY ELECTRONICS INC.
...,...,...,...,...,...,...,...,...
2969379,20100070340,utility,12404357,2009,36b3dbd4-1290-4529-8e25-8f731a3ef988,e8f20d31-09be-11ec-893a-12de62d610b1,06059,"Relevant Play, LLC"
2969382,20100070343,utility,12556076,2009,3341dd6e-43f6-4853-a436-b092a65c4a60,fe664e0e-cb8e-11eb-9615-121df0c29c1e,06037,Truecar.com
2969383,20100070344,utility,12556109,2009,3341dd6e-43f6-4853-a436-b092a65c4a60,fe664e0e-cb8e-11eb-9615-121df0c29c1e,06037,Truecar.com
2969385,20100070350,utility,12259937,2008,2721c099-5a78-45fa-b3a7-11d119300596,ff4c2272-cb8e-11eb-9615-121df0c29c1e,06085,Google LLC


In [36]:
# Number of inventor rows per application_number, largest 20 first.
# NOTE(review): this counts rows, so an application_number that appears on
# more than one document would be counted once per document - confirm before
# reading the value as team size.
inventor_rows_per_application = (
    application_inventor_male_flag_df
    .groupby(['application_number'])
    .size()
)
inventor_rows_per_application.sort_values(ascending=False).head(20)

application_number
16414490    133
15154403    133
16714350    130
15154212    130
17139409    130
11803178    100
16189157     94
16827116     68
16472830     68
12233530     65
16274800     65
13284655     65
11166065     65
11155368     63
13004007     61
11003307     60
14701371     60
14630778     58
12233489     55
15932378     54
dtype: int64

In [37]:
# Spot-check one application to see how an inventor with a missing (NaN) male_flag appears.
application_inventor_male_flag_df.query("application_number == 16919476")

Unnamed: 0,id_x,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,inventor_id,count,id_y,name_first,name_last,male_flag,attribution_status
0,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:br_ln:nielsen-22,1,fl:br_ln:nielsen-22,Brian,NIELSEN,1.0,1
1,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:ch_ln:hahnemann-1,1,fl:ch_ln:hahnemann-1,Christina,Hahnemann,,98
2,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:he_ln:frengler-1,1,fl:he_ln:frengler-1,Henrik,FRENGLER,1.0,1
3,00000184-4904-11ec-b427-12de62d610b1,20210259522,utility,16919476,2020-07-02,US,16,MEDICAL VISUALIZATION SYSTEM,A medical visualisation system including a fir...,False,,2020,fl:li_ln:ubbesen-1,1,fl:li_ln:ubbesen-1,Line Sandahl,UBBESEN,1.0,1


In [38]:
# This application tops the per-application row counts with 133 rows;
# list its rows to check whether that many inventors is plausible.
application_inventor_male_flag_df.query("application_number == 16414490")

Unnamed: 0,id_x,document_number,type,application_number,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year,inventor_id,count,id_y,name_first,name_last,male_flag,attribution_status
1890284,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:al_ln:elessaili-1,1,fl:al_ln:elessaili-1,Ali,El Essaili,1.0,1
1890285,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:al_ln:khayrallah-1,1,fl:al_ln:khayrallah-1,Ali S.,Khayrallah,1.0,1
1890286,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:al_ln:zaidi-1,1,fl:al_ln:zaidi-1,Ali,Zaidi,1.0,1
1890287,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:an_ln:bergstrom-2,1,fl:an_ln:bergstrom-2,Andreas,Bergström,1.0,1
1890288,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:an_ln:carlsson-5,1,fl:an_ln:carlsson-5,Anders,Carlsson,1.0,1
1890289,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:an_ln:cedergren-2,1,fl:an_ln:cedergren-2,Andreas,Cedergren,1.0,1
1890290,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:an_ln:furuskar-1,1,fl:an_ln:furuskar-1,Anders,Furuskär,1.0,1
1890291,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:an_ln:reial-5,1,fl:an_ln:reial-5,Andres,Reial,1.0,1
1890292,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:an_ln:stjernman-1,1,fl:an_ln:stjernman-1,Anders,Stjernman,1.0,1
1890293,1ba5003c-bc7e-11ea-8a73-121df0c29c1e,20200028745,utility,16414490,2019-05-16,US,16,"Network Architecture, Methods, and Devices for...",Methods and apparatus in a fifth-generation wi...,False,ipa200123.xml,2019,fl:an_ln:wallen-2,1,fl:an_ln:wallen-2,Anders,Wallén,1.0,1


In [72]:
# Look up the application from the NaN male_flag spot-check on the assignee side.
# An empty result means the application has no row in the assignee table.
# (The equivalent inventor-side lookup is kept commented out for quick switching.)
#application_inventor_male_flag_df.query("application_number == 16919476")
denormalized_application_assignee.query("application_number == 16919476")

Unnamed: 0,document_number,type_x,application_number,year,assignee_id,location_id,GEOID,organization


In [39]:
# Count how often each inventor_id occurs for this application (expecting one row per inventor).
application_inventor_male_flag_df.query("application_number == 13483997").inventor_id.value_counts()

fl:da_ln:ma-35        1
fl:hu_ln:zhang-251    1
fl:ji_ln:guo-300      1
fl:ji_ln:zhou-361     1
fl:ju_ln:yang-201     1
fl:qi_ln:sun-94       1
fl:qi_ln:zhao-147     1
fl:ti_ln:zhang-109    1
fl:xi_ln:liu-619      1
fl:xi_ln:meng-161     1
fl:ya_ln:zhao-106     1
fl:zh_ln:liu-737      1
Name: inventor_id, dtype: int64

In [40]:
# List the columns available on the inventor-side table.
application_inventor_male_flag_df.columns

Index(['id_x', 'document_number', 'type', 'application_number', 'date',
       'country', 'series_code', 'invention_title', 'invention_abstract',
       'rule_47_flag', 'filename', 'year', 'inventor_id', 'count', 'id_y',
       'name_first', 'name_last', 'male_flag', 'attribution_status'],
      dtype='object')

**TODO:** Decide whether the commented-out cell below is still needed; remove it if not.

In [41]:
# assignee_location_ids = pd.DataFrame(application_assignee_df.location_id.unique())
# assignee_location_ids = assignee_location_ids.rename(columns={0: "location_id"})
# assignee_location_ids.head()

#application_inventor_male_flag_location_ids = pd.DataFrame(application_inventor_male_flag_df.location_id.unique())
#application_inventor_male_flag_location_ids = application_inventor_male_flag_location_ids.rename(columns={0: "location_id"})
#application_inventor_male_flag_location_ids.shape

In [42]:
# Attach the inventor rows to every assignee row that shares a document number.
# how='left' keeps all assignee rows; inventor rows whose document number is not in
# the assignee table are dropped. Column names present in both tables come back with
# pandas' default _x (assignee side) / _y (inventor side) suffixes.
application_assignee_inventor_df = denormalized_application_assignee.merge(
    application_inventor_male_flag_df,
    how='left',
    on=['document_number'],
)

In [71]:
# Look up the application from the NaN male_flag spot-check in the merged frame.
# An empty result means it has no assignee row, so the left merge did not carry it over.
application_assignee_inventor_df.query("application_number_x == 16919476")

Unnamed: 0,document_number,type_x,application_number_x,year_x,assignee_id,location_id,GEOID,organization,id_x,type,application_number_y,date,country,invention_title,invention_abstract,rule_47_flag,filename,year_y,inventor_id,count,id_y,name_first,name_last,male_flag,attribution_status


In [43]:
# Inspect one application after the merge to see how inventor rows repeat across assignee rows.
application_assignee_inventor_df.query("application_number_x == 11724624")

Unnamed: 0,document_number,type_x,application_number_x,year_x,assignee_id,location_id,GEOID,organization,id_x,type,application_number_y,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year_y,inventor_id,count,id_y,name_first,name_last,male_flag,attribution_status
2864389,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,70fc9ca7-09bd-11ec-893a-12de62d610b1,9011,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:br_ln:fergen-1,1,fl:br_ln:fergen-1,Brian James,FERGEN,1.0,1
2864390,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,70fc9ca7-09bd-11ec-893a-12de62d610b1,9011,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:ca_ln:tucker-1,1,fl:ca_ln:tucker-1,Cassius M.,Tucker,1.0,1
2864391,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,70fc9ca7-09bd-11ec-893a-12de62d610b1,9011,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:ma_ln:ficken-1,1,fl:ma_ln:ficken-1,Martin D.,Ficken,1.0,1
2864392,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,70fc9ca7-09bd-11ec-893a-12de62d610b1,9011,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:mi_ln:ellsworth-1,1,fl:mi_ln:ellsworth-1,Michael A.,Ellsworth,1.0,1
2864393,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,36061,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:br_ln:fergen-1,1,fl:br_ln:fergen-1,Brian James,FERGEN,1.0,1
2864394,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,36061,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:ca_ln:tucker-1,1,fl:ca_ln:tucker-1,Cassius M.,Tucker,1.0,1
2864395,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,36061,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:ma_ln:ficken-1,1,fl:ma_ln:ficken-1,Martin D.,Ficken,1.0,1
2864396,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,36061,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:mi_ln:ellsworth-1,1,fl:mi_ln:ellsworth-1,Michael A.,Ellsworth,1.0,1


In [44]:
# The document number is repeated below because the same organization is listed with two GEOIDs.
application_assignee_inventor_df.query("document_number == 20070154943")

Unnamed: 0,document_number,type_x,application_number_x,year_x,assignee_id,location_id,GEOID,organization,id_x,type,application_number_y,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year_y,inventor_id,count,id_y,name_first,name_last,male_flag,attribution_status
2864389,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,70fc9ca7-09bd-11ec-893a-12de62d610b1,9011,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:br_ln:fergen-1,1,fl:br_ln:fergen-1,Brian James,FERGEN,1.0,1
2864390,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,70fc9ca7-09bd-11ec-893a-12de62d610b1,9011,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:ca_ln:tucker-1,1,fl:ca_ln:tucker-1,Cassius M.,Tucker,1.0,1
2864391,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,70fc9ca7-09bd-11ec-893a-12de62d610b1,9011,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:ma_ln:ficken-1,1,fl:ma_ln:ficken-1,Martin D.,Ficken,1.0,1
2864392,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,70fc9ca7-09bd-11ec-893a-12de62d610b1,9011,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:mi_ln:ellsworth-1,1,fl:mi_ln:ellsworth-1,Michael A.,Ellsworth,1.0,1
2864393,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,36061,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:br_ln:fergen-1,1,fl:br_ln:fergen-1,Brian James,FERGEN,1.0,1
2864394,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,36061,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:ca_ln:tucker-1,1,fl:ca_ln:tucker-1,Cassius M.,Tucker,1.0,1
2864395,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,36061,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:ma_ln:ficken-1,1,fl:ma_ln:ficken-1,Martin D.,Ficken,1.0,1
2864396,20070154943,utility,11724624,2007,93cc62ac-5391-47aa-8399-2944f43a4429,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,36061,Pfizer Inc.,eaac59d1-cc76-11ea-ba95-121df0c29c1e,utility,11724624,2007-03-15,US,11,Methods for preventing cattle reproductive dis...,The present invention relates to methods for t...,False,ipa070705.xml,2007,fl:mi_ln:ellsworth-1,1,fl:mi_ln:ellsworth-1,Michael A.,Ellsworth,1.0,1


In [45]:
# Repeat the lookup for application 16919476 on the merged frame (empty when it has no assignee row).
application_assignee_inventor_df.query("application_number_x == 16919476")

Unnamed: 0,document_number,type_x,application_number_x,year_x,assignee_id,location_id,GEOID,organization,id_x,type,application_number_y,date,country,series_code,invention_title,invention_abstract,rule_47_flag,filename,year_y,inventor_id,count,id_y,name_first,name_last,male_flag,attribution_status


In [46]:
# Remove the series_code column from the merged frame.
# errors='ignore' keeps this cell re-runnable: the frame is reassigned in place, so
# without it a second execution raises KeyError because the column is already gone.
application_assignee_inventor_df = application_assignee_inventor_df.drop(columns=['series_code'], errors='ignore')

## Inventors are listed per document number
- A single application number can have multiple document numbers.
- Grouping by 'application_number_x', 'GEOID', 'organization', 'id_y' so that each inventor appears only once per application, GEOID and organization.

In [47]:
# Collapse the merged frame to one row per (application, GEOID, organization, inventor).
# type_x, year_x, male_flag and attribution_status are included as keys only so that
# they are carried through the grouping; the unnamed size column `0` records how many
# merged rows fell into each group.
# NOTE(review): groupby omits rows with NaN in any key column by default, so inventors
# with a missing male_flag (and rows with a missing GEOID) do not appear in pre_grant —
# confirm that this is intended.
pre_grant = (
    application_assignee_inventor_df
    .groupby([
        'application_number_x', 'GEOID', 'organization', 'id_y',
        'type_x', 'year_x', 'male_flag', 'attribution_status',
    ])
    .size()
    .reset_index()
)
pre_grant.sort_values(by=0, ascending=False)

Unnamed: 0,application_number_x,GEOID,organization,id_y,type_x,year_x,male_flag,attribution_status,0
260767,11530310,36103,SPD Control Systems Corporation,fl:ja_ln:moskowitz-2,utility,2006,1.0,1,4
372071,11770369,25021,"ANALOG DEVICES, INC.",fl:ne_ln:kuan-1,utility,2007,1.0,1,3
32,9677535,36061,Reserve Management Corporation,fl:br_ln:bent-2,utility,2000,1.0,1,3
1676509,13765233,48113,TEXAS INSTRUMENTS INCORPORATED,fl:ba_ln:varadarajan-6,utility,2013,1.0,1,3
2063246,14218123,11001,GEORGETOWN UNIVERSITY,fl:pe_ln:li-185,utility,2014,1.0,1,3
...,...,...,...,...,...,...,...,...,...
986487,12853717,47163,Eastman Chemical Company,fl:th_ln:pecorini-1,utility,2010,1.0,1,1
986488,12853718,21117,TOYOTA MOTOR ENGINEERING & MANUFACTURING NORTH...,fl:be_ln:grayson-1,utility,2010,1.0,1,1
986489,12853718,21117,TOYOTA MOTOR ENGINEERING & MANUFACTURING NORTH...,fl:de_ln:banerjee-16,utility,2010,1.0,1,1
986490,12853718,21117,TOYOTA MOTOR ENGINEERING & MANUFACTURING NORTH...,fl:ma_ln:ishii-62,utility,2010,0.0,1,1


In [48]:
# Count rows in pre_grant with a missing GEOID.
# NOTE(review): GEOID is one of the groupby keys used to build pre_grant, and groupby
# omits NaN keys by default, so this count is expected to be 0 regardless — confirm
# the check is being run on the intended frame.
pre_grant.GEOID.isnull().sum()

0

In [49]:
# After grouping, each inventor is listed only once for this application.
pre_grant.query("application_number_x == 13436531")

Unnamed: 0,application_number_x,GEOID,organization,id_y,type_x,year_x,male_flag,attribution_status,0
1391843,13436531,51059,Pegasus Global Strategic Solutions LLC,fl:ch_ln:warner-10,utility,2012,1.0,1,1
1391844,13436531,51059,Pegasus Global Strategic Solutions LLC,fl:fl_ln:brumley-1,utility,2012,1.0,1,1
1391845,13436531,51059,Pegasus Global Strategic Solutions LLC,fl:mi_ln:reedy-3,utility,2012,1.0,1,1
1391846,13436531,51059,Pegasus Global Strategic Solutions LLC,fl:ro_ln:brumley-3,utility,2012,1.0,1,1


In [50]:
#pre_grant.query("male_flag != 1.0")

In [51]:
# Distribution of male_flag across the grouped inventor rows (value_counts omits NaN by default).
pre_grant.male_flag.value_counts()

1.0    2618966
0.0     338156
Name: male_flag, dtype: int64

In [52]:
# Check which attribution statuses remain in pre_grant (the saved output shows only status 1).
# NOTE(review): male_flag is a groupby key in pre_grant and NaN keys are dropped by default,
# so inventors with a missing male_flag (seen earlier with attribution_status 98) were
# likely excluded at that step rather than attributed — confirm.
pre_grant.attribution_status.value_counts()

1    2957122
Name: attribution_status, dtype: int64

In [53]:
# Re-display the grouped inventor rows for this application as a reference for the
# per-team counts computed from pre_grant further down.
pre_grant.query("application_number_x == 13436531")

Unnamed: 0,application_number_x,GEOID,organization,id_y,type_x,year_x,male_flag,attribution_status,0
1391843,13436531,51059,Pegasus Global Strategic Solutions LLC,fl:ch_ln:warner-10,utility,2012,1.0,1,1
1391844,13436531,51059,Pegasus Global Strategic Solutions LLC,fl:fl_ln:brumley-1,utility,2012,1.0,1,1
1391845,13436531,51059,Pegasus Global Strategic Solutions LLC,fl:mi_ln:reedy-3,utility,2012,1.0,1,1
1391846,13436531,51059,Pegasus Global Strategic Solutions LLC,fl:ro_ln:brumley-3,utility,2012,1.0,1,1


# Not done here...

In [74]:
# `play` is a second name for the same DataFrame object as pre_grant (no copy is made),
# so any in-place change made through `play` would also change pre_grant.
play = pre_grant
play

Unnamed: 0,application_number_x,GEOID,organization,id_y,type_x,year_x,male_flag,attribution_status,0
0,8479995,36061,"Enzo Biochem, Inc.",fl:de_ln:engelhardt-3,utility,1995,1.0,1,1
1,8479995,36061,"Enzo Biochem, Inc.",fl:el_ln:rabbani-1,utility,1995,1.0,1,1
2,8479995,36061,"Enzo Biochem, Inc.",fl:ja_ln:stavrianopoulos-1,utility,1995,1.0,1,1
3,8479995,36061,"Enzo Biochem, Inc.",fl:pa_ln:olsiewski-1,utility,1995,0.0,1,1
4,8479995,36061,"Enzo Biochem, Inc.",fl:ro_ln:pergolizzi-1,utility,1995,1.0,1,1
...,...,...,...,...,...,...,...,...,...
2957117,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,fl:ro_ln:garry-2,utility,2021,1.0,1,1
2957118,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,fl:ru_ln:wilson-1,utility,2021,1.0,1,1
2957119,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,fl:sa_ln:hopkins-4,utility,2021,1.0,1,1
2957120,17478015,29510,Monsanto Technology LLC,fl:br_ln:eads-2,utility,2021,1.0,1,1


In [55]:
# Team size: number of rows in `play` for each (application, GEOID, organization) group.
# The counts are built as a separate table because the grouping spans several columns;
# they are merged back onto the per-inventor rows later.
total_inventors = (
    play
    .groupby(['application_number_x', 'GEOID', 'organization'])
    .size()
    .reset_index(name='team_size')
)
total_inventors

Unnamed: 0,application_number_x,GEOID,organization,team_size
0,8479995,36061,"Enzo Biochem, Inc.",6
1,9136483,06085,NanoGram Corporation,3
2,9301989,08013,Condensate Energy LLC,1
3,9328626,42029,"Vanguard Products Group, Inc.",3
4,9474435,29510,Monsanto Technology LLC,7
...,...,...,...,...
1014517,17477135,25017,PRESIDENT AND FELLOWS OF HARVARD COLLEGE,2
1014518,17477135,25017,"The Broad Institute, Inc.",2
1014519,17477725,22071,"AUTOIMMUNE TECHNOLOGIES, LLC",5
1014520,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,5


In [56]:
# Spot-check an application that is listed under more than one organization.
total_inventors.query("application_number_x == 17477725")

Unnamed: 0,application_number_x,GEOID,organization,team_size
1014519,17477725,22071,"AUTOIMMUNE TECHNOLOGIES, LLC",5
1014520,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,5


In [57]:
# Spot-check team_size for the application whose grouped inventor rows were listed earlier.
total_inventors.query("application_number_x == 13436531")

Unnamed: 0,application_number_x,GEOID,organization,team_size
486150,13436531,51059,Pegasus Global Strategic Solutions LLC,4


In [75]:
# Male inventors per (application, GEOID, organization): male_flag holds 1.0 / 0.0 values
# in `play`, so summing it counts the rows flagged 1.0 in each group.
total_males = (
    play
    .groupby(['application_number_x', 'GEOID', 'organization'])['male_flag']
    .sum()
    .reset_index(name='male_inventors')
)
total_males

Unnamed: 0,application_number_x,GEOID,organization,male_inventors
0,8479995,36061,"Enzo Biochem, Inc.",5.0
1,9136483,06085,NanoGram Corporation,2.0
2,9301989,08013,Condensate Energy LLC,1.0
3,9328626,42029,"Vanguard Products Group, Inc.",2.0
4,9474435,29510,Monsanto Technology LLC,7.0
...,...,...,...,...
1014517,17477135,25017,PRESIDENT AND FELLOWS OF HARVARD COLLEGE,1.0
1014518,17477135,25017,"The Broad Institute, Inc.",1.0
1014519,17477725,22071,"AUTOIMMUNE TECHNOLOGIES, LLC",5.0
1014520,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,5.0


In [76]:
# Spot-check the male inventor count for the same application used in the team_size check.
total_males.query("application_number_x == 13436531")

Unnamed: 0,application_number_x,GEOID,organization,male_inventors
486150,13436531,51059,Pegasus Global Strategic Solutions LLC,4.0


In [80]:
# Sanity check before combining the two per-group tables (team sizes and male counts):
# the outer merge with indicator='matched' marks any (application, GEOID, organization)
# key found in only one table, and validate='one_to_one' raises MergeError if a key
# is repeated on either side.
test_merge = pd.merge(total_inventors, total_males, on= ['application_number_x', 'GEOID', 'organization'],
                      how = 'outer', indicator = 'matched', validate = 'one_to_one')

# Expect an empty frame: every key should be present in both tables.
# (This query had been commented out, leaving the merge above computed but never checked.)
test_merge.query("matched != 'both'")

Unnamed: 0,application_number_x,GEOID,organization,team_size,male_inventors,matched


In [81]:
# Put team_size and male_inventors side by side, one row per
# (application, GEOID, organization) group.
inventor_stats = total_inventors.merge(
    total_males,
    how='left',
    on=['application_number_x', 'GEOID', 'organization'],
)
inventor_stats


Unnamed: 0,application_number_x,GEOID,organization,team_size,male_inventors
0,8479995,36061,"Enzo Biochem, Inc.",6,5.0
1,9136483,06085,NanoGram Corporation,3,2.0
2,9301989,08013,Condensate Energy LLC,1,1.0
3,9328626,42029,"Vanguard Products Group, Inc.",3,2.0
4,9474435,29510,Monsanto Technology LLC,7,7.0
...,...,...,...,...,...
1014517,17477135,25017,PRESIDENT AND FELLOWS OF HARVARD COLLEGE,2,1.0
1014518,17477135,25017,"The Broad Institute, Inc.",2,1.0
1014519,17477725,22071,"AUTOIMMUNE TECHNOLOGIES, LLC",5,5.0
1014520,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,5,5.0


In [84]:
# Disabled sanity check, kept for reference: an outer merge with an indicator column would
# flag any pre_grant row without a matching inventor_stats row (and vice versa), and
# validate='many_to_one' would raise if inventor_stats had duplicate keys.
#test_merge = pd.merge(pre_grant, inventor_stats, on =['application_number_x', 'GEOID', 'organization'], how = 'outer',
#                      indicator = 'matched', validate = 'many_to_one')

# The saved output (an empty frame) shows that every row matched when this was last run.
#test_merge.query("matched != 'both'")

Unnamed: 0,application_number_x,GEOID,organization,id_y,type_x,year_x,male_flag,attribution_status,0,team_size,male_inventors,matched


In [85]:
# Copy the per-group team_size and male_inventors onto every per-inventor row of pre_grant.
pre_grant_stats = pre_grant.merge(
    inventor_stats,
    how='left',
    on=['application_number_x', 'GEOID', 'organization'],
)
pre_grant_stats


Unnamed: 0,application_number_x,GEOID,organization,id_y,type_x,year_x,male_flag,attribution_status,0,team_size,male_inventors
0,8479995,36061,"Enzo Biochem, Inc.",fl:de_ln:engelhardt-3,utility,1995,1.0,1,1,6,5.0
1,8479995,36061,"Enzo Biochem, Inc.",fl:el_ln:rabbani-1,utility,1995,1.0,1,1,6,5.0
2,8479995,36061,"Enzo Biochem, Inc.",fl:ja_ln:stavrianopoulos-1,utility,1995,1.0,1,1,6,5.0
3,8479995,36061,"Enzo Biochem, Inc.",fl:pa_ln:olsiewski-1,utility,1995,0.0,1,1,6,5.0
4,8479995,36061,"Enzo Biochem, Inc.",fl:ro_ln:pergolizzi-1,utility,1995,1.0,1,1,6,5.0
...,...,...,...,...,...,...,...,...,...,...,...
2957117,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,fl:ro_ln:garry-2,utility,2021,1.0,1,1,5,5.0
2957118,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,fl:ru_ln:wilson-1,utility,2021,1.0,1,1,5,5.0
2957119,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,fl:sa_ln:hopkins-4,utility,2021,1.0,1,1,5,5.0
2957120,17478015,29510,Monsanto Technology LLC,fl:br_ln:eads-2,utility,2021,1.0,1,1,2,2.0


In [86]:
# Keep only the application-level columns; the per-inventor columns
# (id_y, male_flag, attribution_status and the size column) are left out.
pre_grant_stats = pre_grant_stats.loc[
    :,
    ['application_number_x', 'GEOID', 'organization', 'type_x', 'year_x', 'team_size', 'male_inventors'],
]
pre_grant_stats

Unnamed: 0,application_number_x,GEOID,organization,type_x,year_x,team_size,male_inventors
0,8479995,36061,"Enzo Biochem, Inc.",utility,1995,6,5.0
1,8479995,36061,"Enzo Biochem, Inc.",utility,1995,6,5.0
2,8479995,36061,"Enzo Biochem, Inc.",utility,1995,6,5.0
3,8479995,36061,"Enzo Biochem, Inc.",utility,1995,6,5.0
4,8479995,36061,"Enzo Biochem, Inc.",utility,1995,6,5.0
...,...,...,...,...,...,...,...
2957117,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,utility,2021,5,5.0
2957118,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,utility,2021,5,5.0
2957119,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,utility,2021,5,5.0
2957120,17478015,29510,Monsanto Technology LLC,utility,2021,2,2.0


In [88]:
# Keep the first occurrence of each distinct row, so that rows which differ only in
# columns no longer present in the frame collapse into one.
pre_grant_stats = pre_grant_stats.drop_duplicates()
pre_grant_stats

Unnamed: 0,application_number_x,GEOID,organization,type_x,year_x,team_size,male_inventors
0,8479995,36061,"Enzo Biochem, Inc.",utility,1995,6,5.0
6,9136483,06085,NanoGram Corporation,utility,1998,3,2.0
9,9301989,08013,Condensate Energy LLC,utility,1999,1,1.0
10,9328626,42029,"Vanguard Products Group, Inc.",utility,1999,3,2.0
13,9474435,29510,Monsanto Technology LLC,utility,1999,7,7.0
...,...,...,...,...,...,...,...
2957106,17477135,25017,PRESIDENT AND FELLOWS OF HARVARD COLLEGE,utility,2021,2,1.0
2957108,17477135,25017,"The Broad Institute, Inc.",utility,2021,2,1.0
2957110,17477725,22071,"AUTOIMMUNE TECHNOLOGIES, LLC",utility,2021,5,5.0
2957115,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,utility,2021,5,5.0


In [90]:
# Everyone on a team who is not counted as male is counted as female.
# NOTE(review): this relies on the rows behind team_size having male_flag of only
# 1.0 or 0.0 (as the earlier value_counts output shows) — confirm if that changes.
pre_grant_stats = pre_grant_stats.assign(
    female_inventors=lambda stats: stats['team_size'] - stats['male_inventors']
)
pre_grant_stats

Unnamed: 0,application_number_x,GEOID,organization,type_x,year_x,team_size,male_inventors,female_inventors
0,8479995,36061,"Enzo Biochem, Inc.",utility,1995,6,5.0,1.0
6,9136483,06085,NanoGram Corporation,utility,1998,3,2.0,1.0
9,9301989,08013,Condensate Energy LLC,utility,1999,1,1.0,0.0
10,9328626,42029,"Vanguard Products Group, Inc.",utility,1999,3,2.0,1.0
13,9474435,29510,Monsanto Technology LLC,utility,1999,7,7.0,0.0
...,...,...,...,...,...,...,...,...
2957106,17477135,25017,PRESIDENT AND FELLOWS OF HARVARD COLLEGE,utility,2021,2,1.0,1.0
2957108,17477135,25017,"The Broad Institute, Inc.",utility,2021,2,1.0,1.0
2957110,17477725,22071,"AUTOIMMUNE TECHNOLOGIES, LLC",utility,2021,5,5.0,0.0
2957115,17477725,22071,THE ADMINISTRATORS OF THE TULANE EDUCATIONAL FUND,utility,2021,5,5.0,0.0


In [91]:
#save it 
#pre_grant_stats.to_csv('pre_grant_stats.csv')

In [60]:
#play['team_size'] = application_assignee_inventor_df.groupby(['document_number', 'organization', 'id_y' ]).\
#                       agg({'male_flag':'size'})

#patents_fun['all_assignees'] = patents.groupby(['patent_number'])['assignee'].transform(lambda x : '|'.join(x))

## Locations

- fix by adding geoID

In [61]:
# _all_locations = pd.concat([assignee_location_ids, application_inventor_male_flag_location_ids], ignore_index=True, axis=0)
# print(_all_locations.shape)
# all_locations = pd.DataFrame(_all_locations.location_id.unique())
# all_locations = all_locations.rename(columns={0: "location_id"})
# print(all_locations.shape)
# all_locations.to_csv(r"./data/pregrant/pregrant_locations.csv")

# all_locations.query("location_id=='baa6fcdc-cb8e-11eb-9615-121df0c29c1e'")

In [62]:
#_location_df.head()

In [63]:
# print(_location_df.query("country == 'US'").county_fips.isnull().sum())
#print(_location_df.query("country == 'US' & city.isnull() & state.isnull() & county.isnull() ", engine="python").shape[0])



In [64]:
#_location_df.county_fips