In [21]:
# import required python modules
import pandas as pd
import os.path as os_path
import csv
import os
import re

In [22]:
# set global variables for start and end year -- these must be data type str
START_YEAR = '2015'
END_YEAR = '2022'

In [23]:
import numpy
print(numpy.__version__)
print(pd.__version__)
import sys
print(sys.version)

1.24.3
1.5.1
3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]


In [24]:
# Get the current working directory
cwd = os.getcwd()

# Print the current working directory
print("Current working directory:", cwd)


Current working directory: C:\Users\user\Downloads


In [25]:
DATA_DIR = '..\data'
OUTPUT_DIR = '..\output'
PATEX_DIR = 'C:\\Users\\user\\Downloads\\BulkData2023'
PATENTSVIEW_DIR = "C:\\Users\\user\\Downloads\\data_patentsview_20231231"
OUTPUT_FNAME = 'nj_univ_appl_inv_gender.csv'

In [26]:
def id_rutgers_stevens_njit(df, text_col, output_col_suffix=''):
    '''Create columns to identify if text in 'text_col' is associated with Rutgers, Stevens, or NJIT

    This function is tailored for Rutgers University, Stevens Institute of Technology, and
    New Jersey Institute of Technology. Use a set of keywords (whole words or simplified) to
    identify if the text in 'text_col' contains the keywords assocated with each target
    university. A binary output column, one for each target university, identifies the result
    as True or False; to increase flexibility, add a suffix to the output columns.

    Rutgers = rutgers AND (univer OR jersey)
    Stevens = stevens AND institute
    NJIT = jersey AND institute AND technol

    :param df (dataframe): dataframe to analyze
    :param text_col (str): column containing text w/names
    :param output_col_suffix (str): suffix to add to end of output columns (default='')
    :return df (dataframe): dataframe with binary output columns indicating university
    
    # lowercase text in text_col
    df[text_col] = df[text_col].str.lower()
    # analyze Rutgers
    df['rutgers_'+output_col_suffix] = df[text_col].apply(lambda x: \
                      ('rutgers' in x) and (('univer' in x) or ('jersey' in x)))
    # analyze Stevens
    df['stevens_'+output_col_suffix] = df[text_col].apply(lambda x: \
                      ('stevens' in x) and ('institute' in x) and ('tech' in x))
    # analyze NJIT
    df['njit_'+output_col_suffix] = df[text_col].apply(lambda x: \
                      ('jersey' in x) and ('institute' in x) and ('technol' in x))

    #analyse princeton
    df['princeton_'+output_col_suffix] = df[text_col].apply(lambda x: ('princeton' in x) and ('univer' in x))

    #analyse rowan
    df['rowan_'+output_col_suffix] = df[text_col].apply(lambda x: ('rowan' in x) and ('univer' in x))

    #analyse NJCU
    df['NJCU_'+output_col_suffix] = df[text_col].apply(lambda x: ('jersey' in x) and ('city' in x) and ('univer' in x))

    #analyse seton
    df['seton_'+output_col_suffix] = df[text_col].apply(lambda x: ('seton' in x) and ('hall' in x))

    #analyse saint peter's
    df['saintPeters_'+output_col_suffix] = df[text_col].apply(lambda x: ('saint') and ('peter' in x) and ('univer' in x))
    
    # return updated dataframe
    return df
    '''
   
    patterns = {
        'rutgers': r'\b(rutgers(?:,\s*the\s+state\s+university(?:\s+of\s+new\s+jersey)?)?)\b',
        'stevens': r'\b(stevens\s+insti(tute)?)\b',
        'njit': r'\b(jersey\s+insti(tute)?\s+of\s+tech(nology)?)\b',
        'princeton': r'\b(princeton\s+univer(sity)?)\b',
        'rowan': r'\b(rowan\s+univer(sity)?)\b',
        'njcu': r'\b(new\s+jersey\s+city\s+univer(sity)?)\b',
        'setonHall': r'\b(seton\s+hall\s+univer(sity)?)\b',
        'saintPeters': r'\b(sa(in)t\s*peter\s+univer(sity)?)\b'
    }

    # Apply each regex pattern to the text column and create a new binary column
    for university, pattern in patterns.items():
        column_name = university + '_' + output_col_suffix
        #df[column_name] = df[text_col].apply(lambda x: bool(re.search(pattern, str(x), re.IGNORECASE)))
        df[column_name] = df[text_col].apply(lambda x: bool(re.findall(pattern, str(x), re.IGNORECASE)))
        print(f'Created column: {column_name}')
        
    return df


In [27]:
def add_appl_owner_to_dict(appl_dict, appl_id, owner):
    '''Add the owner of an application to a dictionary.
    
    The dictionary key is an application number, and values are a set of owners for that application. 
    The use of sets precludes duplicates. Use data type to ignore blank values (e.g., NaN for an owner). 
    Note: we do not use a defaultdict to simplify the later conversion to a pandas dataframe.
    :param appl_dict (dict of sets): initial application owner dictionary 
    :param appl_id (str): application number
    :param owner (str): either an individual applicant or assignee of an application
    :return appl_dict (dict of sets): updated applicaton owner dictionary    
    '''
    # check if input application id is a string
    if type(appl_id)==str:
        # if application id is not in dictionary keys then add it to the dict w/empty set
        if appl_id not in appl_dict.keys(): appl_dict[appl_id] = set()
        # if input owner is a string, add to set for that application 
        if type(owner)==str: appl_dict[appl_id].add(owner)
    return appl_dict

In [28]:
# set filepath to PatEx applicant data csv file
filepath = os_path.join(PATEX_DIR, 'all_applicants.csv')
print(filepath)

C:\Users\user\Downloads\BulkData2023\all_applicants.csv


In [29]:
with open(filepath, 'r') as file:
    for _ in range(2):
        print(file.readline(), end='')

application_number,applicant_organization,applicant_name_first,applicant_name_middle,applicant_name_last,applicant_city_name,applicant_region_code,applicant_country_code
10470614,,Societe Franco-Belge de Fabrication de,,Combustile - FBFC,Courbevoie,,FR


In [30]:
applicant_df = pd.read_csv(filepath,
                           header=0,
                           dtype=str,
                           keep_default_na=False)

In [31]:
# filter data to keep only US and NJ applicants
print('Number of rows before US NJ filter: {:,}'.format(len(applicant_df)))

applicant_df = applicant_df.loc[(applicant_df['applicant_country_code']=='US') & \
                                (applicant_df['applicant_region_code']=='NJ')]

print('Number of rows after US NJ filter: {:,}'.format(len(applicant_df)))

Number of rows before US NJ filter: 6,752,288
Number of rows after US NJ filter: 122,653


In [32]:
# create column for application series code
applicant_df['series'] = applicant_df['application_number'].apply(lambda x: x[0:2])

In [33]:
# filter data to remove PCT applications -- begins with "PCT" and hence the series from above = "PC"
print('Number of rows before PCT filter: {:,}'.format(len(applicant_df)))

applicant_df = applicant_df.loc[applicant_df['series']!='PC']

print('Number of rows after PCT filter: {:,}'.format(len(applicant_df)))

Number of rows before PCT filter: 122,653
Number of rows after PCT filter: 34,269


In [34]:
# create combined applicant string - each column separated by a space
# (note: using the python .join() function would be more complicated due to our using dataframes;
#        hence, use a simple concatenation)
applicant_df['applicant_combined'] = applicant_df['applicant_organization'] + ' ' + \
                                     applicant_df['applicant_name_first'] + ' ' + \
                                     applicant_df['applicant_name_middle'] + ' ' + \
                                     applicant_df['applicant_name_last']

In [35]:
applicant_df['count'] = applicant_df['applicant_organization'].str.split(',').apply(lambda x: len([item for item in x if 'INC' not in item]))

In [36]:
applicant_df[['applicant_combined']].to_excel('output_column.xlsx', index=False)

In [37]:
# identify applications having Rutgers, Stevens, or NJIT
applicant_df = id_rutgers_stevens_njit(applicant_df, 'applicant_combined', output_col_suffix='app')

Created column: rutgers_app
Created column: stevens_app
Created column: njit_app
Created column: princeton_app
Created column: rowan_app
Created column: njcu_app
Created column: setonHall_app
Created column: saintPeters_app


In [38]:
# summarize results
print('Number of Rutgers applications: {:,}'.format(len(applicant_df.loc[applicant_df['rutgers_app']==True])))
print('Number of Stevens applications: {:,}'.format(len(applicant_df.loc[applicant_df['stevens_app']==True])))
print('Number of NJIT applications: {:,}'.format(len(applicant_df.loc[applicant_df['njit_app']==True])))
print('Number of princeton applications: {:,}'.format(len(applicant_df.loc[applicant_df['princeton_app']==True])))
print('Number of rowan applications: {:,}'.format(len(applicant_df.loc[applicant_df['rowan_app']==True])))
print('Number of NJCU applications: {:,}'.format(len(applicant_df.loc[applicant_df['njcu_app']==True])))
print('Number of seton applications: {:,}'.format(len(applicant_df.loc[applicant_df['setonHall_app']==True])))
print('Number of saintPeters applications: {:,}'.format(len(applicant_df.loc[applicant_df['saintPeters_app']==True])))

Number of Rutgers applications: 612
Number of Stevens applications: 95
Number of NJIT applications: 103
Number of princeton applications: 420
Number of rowan applications: 69
Number of NJCU applications: 0
Number of seton applications: 7
Number of saintPeters applications: 0


In [39]:
# summarize the applicant names associated with the target universities
for univ in ['rutgers_app', 'stevens_app', 'njit_app','princeton_app','rowan_app','njcu_app','setonHall_app','saintPeters_app']:
    print('Applicant names for {}'.format(univ))
    print('----------------------')
    df = applicant_df.loc[applicant_df[univ]==True]
    print(df['applicant_combined'].value_counts())
    print()

Applicant names for rutgers_app
----------------------
Rutgers, The State University of New Jersey              407
RUTGERS, THE STATE UNIVERSITY OF NEW JERSEY              153
Rutgers, the State University of New Jersey               23
   Rutgers, The State University of New Jersey             4
Rutgers University                                         3
Rutgers,The State University of New Jersey                 3
 RUTGERS, THE STATE UNIVERSITY OF  NEW JERSEY              2
RUTGERS THE STATE UNIVERSITY OF NEW JERSEY                 2
RUTGERS, THE STATE UNIVERSITY                              2
 RUTGERS, THE STATE  UNIVERSITY OF NEW JERSEY              1
  RUTGERS, THE STATE UNIVERSITY OF NEW JERSEY              1
RUTGERS, THE STATE OF UNIVERSITY OF NEW JERSEY             1
Rutgers, The State University                              1
Rutgers, The State Univeristy Of New Jersey                1
Rutgers, The State of New Jersey                           1
RUTGERS , THE STATE UNIVERSITY

In [40]:
## keep only observations w/target universities
print('Number of rows before removing non-target universities: {:,}'.format(len(applicant_df)))

applicant_df = applicant_df.loc[(applicant_df['rutgers_app']==True) | \
                                (applicant_df['stevens_app']==True) | \
                                (applicant_df['njit_app']==True) | \
                                (applicant_df['princeton_app']==True) | \
                                (applicant_df['rowan_app']==True) | \
                                (applicant_df['njcu_app']==True) | \
                                (applicant_df['setonHall_app']==True) | \
                                (applicant_df['saintPeters_app']==True)]

print('Number of rows after removing non-target universities: {:,}'.format(len(applicant_df)))

Number of rows before removing non-target universities: 34,269
Number of rows after removing non-target universities: 1,306


In [41]:
# keep only the columns we need
applicant_df = applicant_df[['application_number','rutgers_app', 'stevens_app', \
                             'njit_app','princeton_app','rowan_app','njcu_app','setonHall_app','saintPeters_app']]

In [42]:
# create a consoldiated university variable
applicant_df.loc[applicant_df['rutgers_app']==True,'univ'] = 'Rutgers'
applicant_df.loc[applicant_df['stevens_app']==True,'univ'] = 'Stevens'
applicant_df.loc[applicant_df['njit_app']==True,'univ'] = 'NJIT'
applicant_df.loc[applicant_df['princeton_app']==True,'univ'] = 'Princeton'
applicant_df.loc[applicant_df['rowan_app']==True,'univ'] = 'Rowan'
applicant_df.loc[applicant_df['njcu_app']==True,'univ'] = 'NJCU'
applicant_df.loc[applicant_df['setonHall_app']==True,'univ'] = 'Seton'
applicant_df.loc[applicant_df['saintPeters_app']==True,'univ'] = 'saintPeters'
# print results
print(applicant_df['univ'].value_counts())

Rutgers      612
Princeton    420
NJIT         103
Stevens       95
Rowan         69
Seton          7
Name: univ, dtype: int64


In [43]:
# define columns we want to use (see above)
'''
col_g_assignee  = ['patent_id','disambig_assignee_organization',
                   'disambig_assignee_individual_name_first', 'disambig_assignee_individual_name_last']
col_pg_assignee = ['pgpub_id','disambig_assignee_organization',
                   'disambig_assignee_individual_name_first', 'disambig_assignee_individual_name_last']
'''

"\ncol_g_assignee  = ['patent_id','disambig_assignee_organization',\n                   'disambig_assignee_individual_name_first', 'disambig_assignee_individual_name_last']\ncol_pg_assignee = ['pgpub_id','disambig_assignee_organization',\n                   'disambig_assignee_individual_name_first', 'disambig_assignee_individual_name_last']\n"

In [44]:
# set seperator for the data files; for tsv file, use '/t'
seperator = '\t'

In [45]:
# # open assignee data for patents and PGPubs as pandas dataframes
# # - identify the header as row 0 (the first row)
# # - specify columns we want to read via usecols option
# # - use seperator as specified above
# # - infer compression (default but specify this option anyway)
# # - ensure all columns are read in as strings
# # - option keep_default_na=False ensure we get an empty string and not NaN if a value is missing
# patent_assignee_df = pd.read_csv(os_path.join(PATENTSVIEW_DIR,'g_assignee_disambiguated.tsv.zip'), 
#                                  header=0, 
#                                  usecols = col_g_assignee, 
#                                  sep=seperator, 
#                                  compression='infer', 
#                                  dtype=str, 
#                                  keep_default_na=False)
# pgpub_assignee_df =  pd.read_csv(os_path.join(PATENTSVIEW_DIR,'pg_assignee_disambiguated.tsv.zip'), 
#                                  header=0, 
#                                  usecols = col_pg_assignee,
#                                  sep=seperator, 
#                                  compression='infer', 
#                                  dtype=str, 
#                                  keep_default_na=False)

In [46]:
# # as before, create a string combining all assignee name information
# patent_assignee_df['assignee_combined'] = patent_assignee_df['disambig_assignee_organization'] + ' ' + \
#                                           patent_assignee_df['disambig_assignee_individual_name_first'] + ' ' + \
#                                           patent_assignee_df['disambig_assignee_individual_name_last']
# pgpub_assignee_df['assignee_combined']  = pgpub_assignee_df['disambig_assignee_organization'] + ' ' + \
#                                           pgpub_assignee_df['disambig_assignee_individual_name_first'] + ' ' + \
#                                           pgpub_assignee_df['disambig_assignee_individual_name_last']

In [47]:
# # identify if target universities -- patents 
# patent_assignee_df = id_rutgers_stevens_njit(patent_assignee_df, 
#                                              'assignee_combined', 
#                                              output_col_suffix='asg_patent')

In [48]:
# # identify if target universities -- pgpubs
# pgpub_assignee_df  = id_rutgers_stevens_njit(pgpub_assignee_df, 
#                                              'assignee_combined', 
#                                              output_col_suffix='asg_pgpub')

In [49]:
# # summarize the assignee names associated with the target universities - patents
# for univ in ['rutgers_asg_patent', 'stevens_asg_patent', 'njit_asg_patent','princeton_asg_patent','rowan_asg_patent','seton_asg_patent','NJCU_asg_patent','saintPeters_asg_patent']:
#     print('Assignee names for {}'.format(univ))
#     print('----------------------')
#     df = patent_assignee_df.loc[patent_assignee_df[univ]==True]
#     print(df['disambig_assignee_organization'].value_counts())
#     print()

In [50]:
# # summarize the assignee names associated with the target universities - pgpubs
# for univ in ['rutgers_asg_pgpub', 'stevens_asg_pgpub', 'njit_asg_pgpub','princeton_asg_pgpub','rowan_asg_pgpub','NJCU_asg_pgpub','seton_asg_pgpub','saintPeters_asg_pgpub']:
#     print('Assignee names for {}'.format(univ))
#     print('----------------------')
#     df = pgpub_assignee_df.loc[pgpub_assignee_df[univ]==True]
#     print(df['disambig_assignee_organization'].value_counts())
#     print()

In [51]:
# # keep only observations w/target universities - patents
# print('PATENTS')
# print('Number of rows before removing non-target universities: {:,}'.format(len(patent_assignee_df)))

# patent_assignee_df = patent_assignee_df.loc[(patent_assignee_df['rutgers_asg_patent']==True) | \
#                                             (patent_assignee_df['stevens_asg_patent']==True) | \
#                                             (patent_assignee_df['njit_asg_patent']==True)    | \
#                                             (patent_assignee_df['princeton_asg_patent']==True) | \
#                                             (patent_assignee_df['rowan_asg_patent']==True) | \
#                                             (patent_assignee_df['NJCU_asg_patent']==True) | \
#                                             (patent_assignee_df['seton_asg_patent']==True) | \
#                                             (patent_assignee_df['saintPeters_asg_patent']==True)]
# print('Number of rows after removing non-target universities: {:,}'.format(len(patent_assignee_df)))

In [52]:
# # keep only observations w/target universities - pgpubs
# print('PGPUBS')
# print('Number of rows before removing non-target universities: {:,}'.format(len(pgpub_assignee_df)))

# pgpub_assignee_df = pgpub_assignee_df.loc[(pgpub_assignee_df['rutgers_asg_pgpub']==True) | \
#                                           (pgpub_assignee_df['stevens_asg_pgpub']==True) | \
#                                           (pgpub_assignee_df['njit_asg_pgpub']==True)    | \
#                                           (pgpub_assignee_df['rowan_asg_pgpub']==True)    | \
#                                           (pgpub_assignee_df['princeton_asg_pgpub']==True)    | \
#                                           (pgpub_assignee_df['NJCU_asg_pgpub']==True)    | \
#                                           (pgpub_assignee_df['seton_asg_pgpub']==True)    | \
#                                           (pgpub_assignee_df['saintPeters_asg_pgpub']==True)]

# print('Number of rows after removing non-target universities: {:,}'.format(len(pgpub_assignee_df)))

In [53]:
# # keep only the columns we need 
# patent_assignee_df = patent_assignee_df[['patent_id','rutgers_asg_patent','stevens_asg_patent','njit_asg_patent','princeton_asg_patent','rowan_asg_patent','NJCU_asg_patent','seton_asg_patent','saintPeters_asg_patent']]
# pgpub_assignee_df = pgpub_assignee_df[['pgpub_id','rutgers_asg_pgpub','stevens_asg_pgpub','njit_asg_pgpub','rowan_asg_pgpub','princeton_asg_pgpub','NJCU_asg_pgpub','seton_asg_pgpub','saintPeters_asg_pgpub']]

In [54]:
# # create a consoldiated university variable - patents
# patent_assignee_df.loc[patent_assignee_df['rutgers_asg_patent']==True,'univ_asg_patent'] = 'Rutgers'
# patent_assignee_df.loc[patent_assignee_df['stevens_asg_patent']==True,'univ_asg_patent'] = 'Stevens'
# patent_assignee_df.loc[patent_assignee_df['njit_asg_patent']==True,'univ_asg_patent'] = 'NJIT'
# patent_assignee_df.loc[patent_assignee_df['princeton_asg_patent']==True,'univ_asg_patent'] = 'Princeton'
# patent_assignee_df.loc[patent_assignee_df['rowan_asg_patent']==True,'univ_asg_patent'] = 'Rowan'
# patent_assignee_df.loc[patent_assignee_df['NJCU_asg_patent']==True,'univ_asg_patent'] = 'NJCU'
# patent_assignee_df.loc[patent_assignee_df['seton_asg_patent']==True,'univ_asg_patent'] = 'Seton'
# patent_assignee_df.loc[patent_assignee_df['saintPeters_asg_patent']==True,'univ_asg_patent'] = 'SaintPeters'

# # print results 
# print('PATENTS')
# print(patent_assignee_df['univ_asg_patent'].value_counts())

In [55]:
# # create a consoldiated university variable - pgpubs
# pgpub_assignee_df.loc[pgpub_assignee_df['rutgers_asg_pgpub']==True,'univ_asg_pgpub'] = 'Rutgers'
# pgpub_assignee_df.loc[pgpub_assignee_df['stevens_asg_pgpub']==True,'univ_asg_pgpub'] = 'Stevens'
# pgpub_assignee_df.loc[pgpub_assignee_df['njit_asg_pgpub']==True,'univ_asg_pgpub'] = 'NJIT'
# pgpub_assignee_df.loc[pgpub_assignee_df['princeton_asg_pgpub']==True,'univ_asg_pgpub'] = 'Princeton'
# pgpub_assignee_df.loc[pgpub_assignee_df['rowan_asg_pgpub']==True,'univ_asg_pgpub'] = 'Rowan'
# pgpub_assignee_df.loc[pgpub_assignee_df['NJCU_asg_pgpub']==True,'univ_asg_pgpub'] = 'NJCU'
# pgpub_assignee_df.loc[pgpub_assignee_df['seton_asg_pgpub']==True,'univ_asg_pgpub'] = 'Seton'
# pgpub_assignee_df.loc[pgpub_assignee_df['saintPeters_asg_pgpub']==True,'univ_asg_pgpub'] = 'SaintPeters'

# # print results 
# print('PGPUBS')
# print(pgpub_assignee_df['univ_asg_pgpub'].value_counts())

In [56]:
# # define columns we want to use (see above)
# col_g_appln = ['patent_id','application_id']
# col_pg_appln = ['pgpub_id','application_id']

In [57]:
# set separator for .tsv file
seperator = '\t'

In [58]:
# # open data for patents and PGPubs as pandas dataframes
# # - identify the header as row 0 (the first row)
# # - specify columns we want to read via usecols option
# # - use same seperator as above
# # - infer compression (default but specify this option anyway)
# # - ensure all columns are read in as strings
# # - option keep_default_na=False ensure we get an empty string and not NaN if a value is missing
# patent_appln_df = pd.read_csv(os_path.join(PATENTSVIEW_DIR,'g_application.tsv.zip'), 
#                                  header=0, 
#                                  usecols = col_g_appln, 
#                                  sep=seperator, 
#                                  compression='infer', 
#                                  dtype=str, 
#                                  keep_default_na=False)
# pgpub_appln_df =  pd.read_csv(os_path.join(PATENTSVIEW_DIR,'pg_published_application.tsv.zip'), 
#                                  header=0, 
#                                  usecols = col_pg_appln,
#                                  sep=seperator, 
#                                  compression='infer', 
#                                  dtype=str, 
#                                  keep_default_na=False)

In [59]:
# # left join assignee dataframe (left table) to application dataframe (right table) -- patents 
# # use indicator=True to filter results 
# patent_assignee_df = patent_assignee_df.merge(patent_appln_df, 
#                                               how='left', 
#                                               on='patent_id', 
#                                               indicator=True)
# # print merge results 
# patent_assignee_df['_merge'].value_counts()

In [60]:
# # keep _merge=='both'
# patent_assignee_df = patent_assignee_df.loc[patent_assignee_df['_merge']=='both']
# # drop _merge column
# patent_assignee_df.drop(columns=['_merge'], inplace=True)

In [61]:
# # left join assignee dataframe (left table) to application dataframe (right table) -- pgpubs 
# # use indicator=True to filter results 
# pgpub_assignee_df = pgpub_assignee_df.merge(pgpub_appln_df, 
#                                             how='inner', 
#                                             on='pgpub_id', 
#                                             indicator=True)
# # print merge results 
# pgpub_assignee_df['_merge'].value_counts()

In [62]:
# # keep _merge=='both'
# pgpub_assignee_df = pgpub_assignee_df.loc[pgpub_assignee_df['_merge']=='both']
# # drop _merge column
# pgpub_assignee_df.drop(columns=['_merge'], inplace=True)

In [63]:
# # print size of assignee dataframes 
# print('Number patents having target universities as assignee: {:,}'.format(len(patent_assignee_df)))
# print('Number PGPubs having target universities as assignee: {:,}'.format(len(pgpub_assignee_df)))

In [64]:
# initialize the dictionary
appl_dict = {}

In [65]:
# process applicant data by looping through each row of the dataframe
# and calling the function add_appl_owner_to_dict 
for i in range(0,len(applicant_df)): 
    appl_dict = add_appl_owner_to_dict(appl_dict, 
                                       applicant_df.iloc[i]['application_number'], 
                                       applicant_df.iloc[i]['univ'])

In [66]:
# # similarly process patent assignee data; note different column names 
# for i in range(0,len(patent_assignee_df)): 
#     appl_dict = add_appl_owner_to_dict(appl_dict, 
#                                        patent_assignee_df.iloc[i]['application_id'], 
#                                        patent_assignee_df.iloc[i]['univ_asg_patent'])

In [67]:
# # similarly process pgpub assignee data; note different column names 
# for i in range(0,len(pgpub_assignee_df)): 
#     appl_dict = add_appl_owner_to_dict(appl_dict, 
#                                        pgpub_assignee_df.iloc[i]['application_id'], 
#                                        pgpub_assignee_df.iloc[i]['univ_asg_pgpub'])

In [68]:
# to faciliate creating a dataframe, create two lists, one for application IDs and another for universities, 
# where the set of university names is converted to a list
appl_list = []
univ_list = []
for key, value in appl_dict.items():
    appl_list.append(key)
    univ_list.append(list(value))
# verify length of the two list are the same 
n_appl_list = len(appl_list)
n_univ_list = len(univ_list)
print('Length of application ID list: {:,}'.format(n_appl_list))
print('Length of university list: {:,}'.format(n_univ_list))
if n_appl_list!=n_univ_list: 
    raise ValueError('# applications do not equal length of university list')

Length of application ID list: 1,288
Length of university list: 1,288


In [69]:
# create dataframe 
nj_univ_df = pd.DataFrame({'application_number': appl_list, 'univ_list': univ_list})

In [70]:
# create column for application series code
nj_univ_df['series'] = nj_univ_df['application_number'].apply(lambda x: x[0:2]) 

In [71]:
# filter data to remove PCT applications -- begins with "PCT" and hence the series from above = "PC"
print('Number of rows before PCT filter: {:,}'.format(len(nj_univ_df)))
nj_univ_df = nj_univ_df.loc[nj_univ_df['series']!='PC']
print('Number of rows after PCT filter: {:,}'.format(len(nj_univ_df)))

Number of rows before PCT filter: 1,288
Number of rows after PCT filter: 1,288


In [72]:
# print series -- value_counts (pandas 1.5) outputs a pandas series whose index are the series codes
nj_univ_df['series'].value_counts().sort_index()

12      1
13     73
14    267
15    228
16    267
17    270
18     32
61     13
62     96
63     41
Name: series, dtype: int64

In [73]:
# print universities -- these will include those with multiple university owners 
nj_univ_df['univ_list'].value_counts().sort_index()

[NJIT]                   99
[NJIT, Rutgers]           4
[Princeton]             417
[Rowan]                  69
[Rutgers]               595
[Rutgers, Princeton]      2
[Rutgers, Stevens]       11
[Seton]                   7
[Stevens]                84
Name: univ_list, dtype: int64

In [74]:
# to illustrate the next step, look at application '14777191' as an example
print('Before pandas explode()')
print(nj_univ_df.loc[nj_univ_df['application_number']=='14777191'].transpose())

Before pandas explode()
                         324
application_number  14777191
univ_list             [NJIT]
series                    14


In [75]:
# use pandas explode() to break apart multiple universities into seperate rows (keeping same application ID)
print('Number rows prior to breaking apart university lists: {:,}'.format(len(nj_univ_df)))
nj_univ_df = nj_univ_df.explode('univ_list')
print('Number rows after breaking apart university lists: {:,}'.format(len(nj_univ_df)))

Number rows prior to breaking apart university lists: 1,288
Number rows after breaking apart university lists: 1,305


In [76]:
print('After pandas explode()')
print(nj_univ_df.loc[nj_univ_df['application_number']=='14777191'].transpose())

After pandas explode()
                         324
application_number  14777191
univ_list               NJIT
series                    14


In [77]:
# examine result universities -- all should be singular 
nj_univ_df['univ_list'].value_counts().sort_index()

NJIT         103
Princeton    419
Rowan         69
Rutgers      612
Seton          7
Stevens       95
Name: univ_list, dtype: int64

In [78]:
# rename univ_list column to univ
nj_univ_df.rename(columns={'univ_list': 'univ'}, inplace=True)

In [79]:
# # delete unneeded dataframes and data structures 
# del applicant_df, patent_assignee_df, pgpub_assignee_df
# del appl_dict, appl_list, univ_list

In [80]:
# look at first 5 observations (do not use transpose to preserve column orientation)
# note that university names are not in a list 
nj_univ_df.head(5)

Unnamed: 0,application_number,univ,series
0,17435954,Rutgers,17
1,17914647,Rutgers,17
2,18005188,Princeton,18
3,18245256,Rutgers,18
4,12539049,NJIT,12


In [81]:
# look at the first 6 observations (3 applications) having more than one university owner 
# (option keep=False tags all duplicate observation, not just the first or last)
# (note: this command does not change the underlying dataset - duplicates are not dropped)
nj_univ_df.loc[nj_univ_df.duplicated(subset=['application_number'], keep=False)].head(6)

Unnamed: 0,application_number,univ,series
140,14348197,Rutgers,14
140,14348197,Stevens,14
292,14751086,Rutgers,14
292,14751086,Stevens,14
298,14763944,NJIT,14
298,14763944,Rutgers,14


In [82]:
# set filepath to PatEx application data csv file
filepath = os_path.join(PATEX_DIR, 'application_data.csv')
print(filepath)

C:\Users\user\Downloads\BulkData2023\application_data.csv


In [83]:
# identify columns we want to use
col_list = ['application_number','filing_date','application_invention_type',
            'earliest_pgpub_number','earliest_pgpub_date','patent_number','patent_issue_date',
            'invention_title','small_entity_indicator','aia_first_to_file']

# open csv file as a pandas dataframe
# - identify the header as row 0 (the first row)
# - use only selected columns
# - ensure all columns are read in as strings (important for the application number)
# - option keep_default_na=False ensure we get an empty string and not NaN if a value is missing
appl_df = pd.read_csv(filepath,
                      header=0,
                      usecols=col_list,
                      dtype=str,
                      keep_default_na=False)

In [84]:
appl_df.head()

Unnamed: 0,application_number,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,small_entity_indicator,aia_first_to_file
0,4453098,,,,,,,,UNDISCOUNTED,
1,4544040,,,,,,,,UNDISCOUNTED,
2,4577552,,,,,,,,UNDISCOUNTED,
3,4589670,,,,,,,,UNDISCOUNTED,
4,4578318,,,,,,,,UNDISCOUNTED,


In [85]:
# create column for filing year
appl_df['filing_year'] = appl_df['filing_date'].apply(lambda x: x[0:4])

In [86]:
appl_df.head()

Unnamed: 0,application_number,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,small_entity_indicator,aia_first_to_file,filing_year
0,4453098,,,,,,,,UNDISCOUNTED,,
1,4544040,,,,,,,,UNDISCOUNTED,,
2,4577552,,,,,,,,UNDISCOUNTED,,
3,4589670,,,,,,,,UNDISCOUNTED,,
4,4578318,,,,,,,,UNDISCOUNTED,,


In [87]:
# filter data to keep applications between start and end dates

print('Number of rows before date filter: {:,}'.format(len(appl_df)))

appl_df = appl_df.loc[(appl_df['filing_year']>=START_YEAR) & \
                      (appl_df['filing_year']<=END_YEAR)]

print('Number of rows after date filter: {:,}'.format(len(appl_df)))


Number of rows before date filter: 14,100,378
Number of rows after date filter: 4,246,035


In [88]:
# left join nj_univ_df (left table) to appl_df (right table); use indicator=True to filter results 
nj_univ_df = nj_univ_df.merge(appl_df, 
                                how='left', 
                                on='application_number', 
                                indicator=True)

In [89]:
nj_univ_df.head()

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,small_entity_indicator,aia_first_to_file,filing_year,_merge
0,17435954,Rutgers,17,,,,,,,,,,,left_only
1,17914647,Rutgers,17,,,,,,,,,,,left_only
2,18005188,Princeton,18,,,,,,,,,,,left_only
3,18245256,Rutgers,18,,,,,,,,,,,left_only
4,12539049,NJIT,12,,,,,,,,,,,left_only


In [90]:
merge_results_series = nj_univ_df['_merge'].value_counts()
print(merge_results_series)

print('\nMerge results: ')
print('Number appl w/target NJ university applicants within date range: {:,}'.
      format(merge_results_series['both']))
print('Number appl w/target NJ university applicants outside date range (or no appl data): {:,}'.
      format(merge_results_series['left_only']))

both          1050
left_only      255
right_only       0
Name: _merge, dtype: int64

Merge results: 
Number appl w/target NJ university applicants within date range: 1,050
Number appl w/target NJ university applicants outside date range (or no appl data): 255


In [91]:
nj_univ_df

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,small_entity_indicator,aia_first_to_file,filing_year,_merge
0,17435954,Rutgers,17,,,,,,,,,,,left_only
1,17914647,Rutgers,17,,,,,,,,,,,left_only
2,18005188,Princeton,18,,,,,,,,,,,left_only
3,18245256,Rutgers,18,,,,,,,,,,,left_only
4,12539049,NJIT,12,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1300,18017849,Princeton,18,,,,,,,,,,,left_only
1301,18100727,Rutgers,18,,,,,,,,,,,left_only
1302,18103046,Stevens,18,,,,,,,,,,,left_only
1303,18024423,NJIT,18,,,,,,,,,,,left_only


In [92]:
# keep _merge=='both'
nj_univ_df = nj_univ_df.loc[nj_univ_df['_merge']=='both']

In [93]:
nj_univ_df

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,small_entity_indicator,aia_first_to_file,filing_year,_merge
243,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,SMALL,true,2015,both
244,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,SMALL,false,2015,both
245,14607496,Rutgers,14,2015-01-28,Utility,US20150233934A1,2015-08-20,9945051,2018-04-17,COLOR-ENCODING AND IN-SITU INTERROGATION OF MA...,SMALL,false,2015,both
246,14418392,Rutgers,14,2015-01-29,Utility,US20150297725A1,2015-10-22,,,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,SMALL,false,2015,both
247,14609269,Rutgers,14,2015-01-29,Utility,US20150140074A1,2015-05-21,9775907,2017-10-03,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,SMALL,true,2015,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288,18085724,NJIT,18,2022-12-21,Utility,US20230149855A1,2023-05-18,,,Hollow Fiber Membrane Module for Direct Contac...,SMALL,true,2022,both
1289,18013368,Rutgers,18,2022-12-28,Utility,,,,,IDENTIFICATION OF ESTROGEN RECEPTOR POSITIVE (...,SMALL,Other,2022,both
1290,18013302,Princeton,18,2022-12-28,Utility,,,,,COMPOUNDS HAVING ANTICANCER ACTIVITY,SMALL,true,2022,both
1291,18003753,Princeton,18,2022-12-29,Utility,,,,,Ketogenic Diet and Ketone Supplementation for ...,SMALL,Other,2022,both


In [94]:
# drop _merge column
nj_univ_df.drop(columns=['_merge'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nj_univ_df.drop(columns=['_merge'], inplace=True)


In [95]:
nj_univ_df

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,small_entity_indicator,aia_first_to_file,filing_year
243,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,SMALL,true,2015
244,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,SMALL,false,2015
245,14607496,Rutgers,14,2015-01-28,Utility,US20150233934A1,2015-08-20,9945051,2018-04-17,COLOR-ENCODING AND IN-SITU INTERROGATION OF MA...,SMALL,false,2015
246,14418392,Rutgers,14,2015-01-29,Utility,US20150297725A1,2015-10-22,,,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,SMALL,false,2015
247,14609269,Rutgers,14,2015-01-29,Utility,US20150140074A1,2015-05-21,9775907,2017-10-03,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,SMALL,true,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288,18085724,NJIT,18,2022-12-21,Utility,US20230149855A1,2023-05-18,,,Hollow Fiber Membrane Module for Direct Contac...,SMALL,true,2022
1289,18013368,Rutgers,18,2022-12-28,Utility,,,,,IDENTIFICATION OF ESTROGEN RECEPTOR POSITIVE (...,SMALL,Other,2022
1290,18013302,Princeton,18,2022-12-28,Utility,,,,,COMPOUNDS HAVING ANTICANCER ACTIVITY,SMALL,true,2022
1291,18003753,Princeton,18,2022-12-29,Utility,,,,,Ketogenic Diet and Ketone Supplementation for ...,SMALL,Other,2022


In [96]:
# reset index; do not insert previous index into dataframe
nj_univ_df.reset_index(drop=True, inplace=True)

In [97]:
nj_univ_df

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,small_entity_indicator,aia_first_to_file,filing_year
0,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,SMALL,true,2015
1,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,SMALL,false,2015
2,14607496,Rutgers,14,2015-01-28,Utility,US20150233934A1,2015-08-20,9945051,2018-04-17,COLOR-ENCODING AND IN-SITU INTERROGATION OF MA...,SMALL,false,2015
3,14418392,Rutgers,14,2015-01-29,Utility,US20150297725A1,2015-10-22,,,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,SMALL,false,2015
4,14609269,Rutgers,14,2015-01-29,Utility,US20150140074A1,2015-05-21,9775907,2017-10-03,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,SMALL,true,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,18085724,NJIT,18,2022-12-21,Utility,US20230149855A1,2023-05-18,,,Hollow Fiber Membrane Module for Direct Contac...,SMALL,true,2022
1046,18013368,Rutgers,18,2022-12-28,Utility,,,,,IDENTIFICATION OF ESTROGEN RECEPTOR POSITIVE (...,SMALL,Other,2022
1047,18013302,Princeton,18,2022-12-28,Utility,,,,,COMPOUNDS HAVING ANTICANCER ACTIVITY,SMALL,true,2022
1048,18003753,Princeton,18,2022-12-29,Utility,,,,,Ketogenic Diet and Ketone Supplementation for ...,SMALL,Other,2022


In [98]:
# delete application dataframe 
del appl_df 

In [99]:
# print first five rows of data (use transpose for readability)
nj_univ_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
application_number,14605053,14417704,14607496,14418392,14609269
univ,Stevens,Rowan,Rutgers,Rutgers,Rutgers
series,14,14,14,14,14
filing_date,2015-01-26,2015-01-27,2015-01-28,2015-01-29,2015-01-29
application_invention_type,Utility,Utility,Utility,Utility,Utility
earliest_pgpub_number,US20150235073A1,US20150198600A1,US20150233934A1,US20150297725A1,US20150140074A1
earliest_pgpub_date,2015-08-20,2015-07-16,2015-08-20,2015-10-22,2015-05-21
patent_number,,,9945051,,9775907
patent_issue_date,,,2018-04-17,,2017-10-03
invention_title,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,Autoantibody Profiles in the Early Detection a...,COLOR-ENCODING AND IN-SITU INTERROGATION OF MA...,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE


In [100]:
nj_univ_df

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,small_entity_indicator,aia_first_to_file,filing_year
0,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,SMALL,true,2015
1,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,SMALL,false,2015
2,14607496,Rutgers,14,2015-01-28,Utility,US20150233934A1,2015-08-20,9945051,2018-04-17,COLOR-ENCODING AND IN-SITU INTERROGATION OF MA...,SMALL,false,2015
3,14418392,Rutgers,14,2015-01-29,Utility,US20150297725A1,2015-10-22,,,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,SMALL,false,2015
4,14609269,Rutgers,14,2015-01-29,Utility,US20150140074A1,2015-05-21,9775907,2017-10-03,COCHLEATES MADE WITH SOY PHOSPHATIDYLSERINE,SMALL,true,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,18085724,NJIT,18,2022-12-21,Utility,US20230149855A1,2023-05-18,,,Hollow Fiber Membrane Module for Direct Contac...,SMALL,true,2022
1046,18013368,Rutgers,18,2022-12-28,Utility,,,,,IDENTIFICATION OF ESTROGEN RECEPTOR POSITIVE (...,SMALL,Other,2022
1047,18013302,Princeton,18,2022-12-28,Utility,,,,,COMPOUNDS HAVING ANTICANCER ACTIVITY,SMALL,true,2022
1048,18003753,Princeton,18,2022-12-29,Utility,,,,,Ketogenic Diet and Ketone Supplementation for ...,SMALL,Other,2022


In [101]:
nj_univ_df.to_csv('valid.csv')

In [102]:
# set filepath to PatEx inventor data csv file
filepath = os_path.join(PATEX_DIR, 'all_inventors.csv')
print(filepath)

C:\Users\user\Downloads\BulkData2023\all_inventors.csv


In [103]:
# open csv file as a pandas dataframe
# - identify the header as row 0 (the first row)
# - use all columns (do not specify usecols)
# - ensure all columns are read in as strings (important for the application number)
# - option keep_default_na=False ensure we get an empty string and not NaN if a value is missing
inv_df = pd.read_csv(filepath,
                     header=0,
                     dtype=str,
                     keep_default_na=False)

In [104]:
inv_df.columns

Index(['application_number', 'inventor_name_first', 'inventor_name_middle',
       'inventor_name_last', 'inventor_rank', 'inventor_city_name',
       'inventor_region_code', 'inventor_country_code'],
      dtype='object')

In [105]:
inv_df.head()

Unnamed: 0,application_number,inventor_name_first,inventor_name_middle,inventor_name_last,inventor_rank,inventor_city_name,inventor_region_code,inventor_country_code
0,4840815,WILLIAM,D.,SCHAEFFER,1,POMONA,CA,US
1,5434252,HERMAN,,LEVIN,1,GLENVIEW,IL,US
2,5434252,ERIC,K.,MAXON,2,EVANSTON,IL,US
3,5603052,AGIS,F.,KYDONIEUS,1,NEW YORK,NY,US
4,5823902,WILLIAM,G.,HECKENHAUER,1,BUCYRUS,OH,US


In [106]:
# left join applicant_df (left table) to inv_df (right table); use indicator=True to check results 
nj_univ_appl_inv_df = nj_univ_df.merge(inv_df, 
                                       how='left', 
                                       on='application_number', 
                                       indicator=True)

In [107]:
nj_univ_appl_inv_df.head()

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,...,aia_first_to_file,filing_year,inventor_name_first,inventor_name_middle,inventor_name_last,inventor_rank,inventor_city_name,inventor_region_code,inventor_country_code,_merge
0,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,...,True,2015,Gang,,Hua,1,Livingston,NJ,US,both
1,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,...,True,2015,Haoxiang,,Li,2,Hoboken,NJ,US,both
2,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,...,False,2015,Robert,G.,Nagele,1,Turnersville,NJ,US,both
3,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,...,False,2015,Eric,P.,Nagele,2,Turnersville,NJ,US,both
4,14607496,Rutgers,14,2015-01-28,Utility,US20150233934A1,2015-08-20,9945051.0,2018-04-17,COLOR-ENCODING AND IN-SITU INTERROGATION OF MA...,...,False,2015,Richard,,Ebright,1,New Brunswick,NJ,US,both


In [108]:
nj_univ_appl_inv_df.shape

(3454, 21)

In [109]:
merge_results_series = nj_univ_appl_inv_df['_merge'].value_counts()
print(merge_results_series)

print('\nMerge results: ')
print('Number application-applicant observations with inventor data: {:,}'.
      format(merge_results_series['both']))
print('Number application-applicant observations missing inventor data: {:,}'.
      format(merge_results_series['left_only']))

both          3445
left_only        9
right_only       0
Name: _merge, dtype: int64

Merge results: 
Number application-applicant observations with inventor data: 3,445
Number application-applicant observations missing inventor data: 9


In [110]:
# keep _merge=='both'
nj_univ_appl_inv_df = nj_univ_appl_inv_df.loc[nj_univ_appl_inv_df['_merge']=='both']

In [111]:
# drop _merge column 
nj_univ_appl_inv_df.drop(columns=['_merge'], inplace=True)

In [112]:
nj_univ_appl_inv_df['inv_id'] = nj_univ_appl_inv_df.groupby(['inventor_name_last',
                                                             'inventor_name_first',
                                                             'inventor_name_middle']).ngroup()

In [113]:
# count number of unque inventors by university
nj_univ_appl_inv_df.groupby(by=['univ'])['application_number'].nunique()

univ
NJIT          79
Princeton    341
Rowan         65
Rutgers      483
Seton          5
Stevens       68
Name: application_number, dtype: int64

In [114]:
nj_univ_appl_inv_df.to_csv('final_results.csv')

In [115]:
# import gender_it_functions
from gender_it import gender_it_functions as gf

In [116]:
#gf.read_wgnd(path='gender_it//')

In [117]:
# get gender attributions for inventors; set threshold to 0.60
nj_univ_appl_inv_gender_df = gf.get_gender(nj_univ_appl_inv_df, 
                                           name_column = 'inventor_name_first', 
                                           country_column = 'inventor_country_code', 
                                           treshold = 0.6)

Step 1 - reading the name-country-gender dictionary
reading the dictionnary.
Step 2 - reading the name-language-gender dictionary
reading the dictionnary.
downloading the dictionnary.
Step 3 - reading the name-gender dictionary.
reading the dictionnary.
Results distirbution is as follow: 
            gender  Percentage
M            2403   69.753266
F             653   18.955007
not found     387   11.233672
?               2    0.058055


In [118]:
nj_univ_appl_inv_df.head()

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,...,aia_first_to_file,filing_year,inventor_name_first,inventor_name_middle,inventor_name_last,inventor_rank,inventor_city_name,inventor_region_code,inventor_country_code,inv_id
0,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,...,True,2015,Gang,,Hua,1,Livingston,NJ,US,794
1,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,...,True,2015,Haoxiang,,Li,2,Hoboken,NJ,US,1052
2,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,...,False,2015,Robert,G.,Nagele,1,Turnersville,NJ,US,1296
3,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,...,False,2015,Eric,P.,Nagele,2,Turnersville,NJ,US,1295
4,14607496,Rutgers,14,2015-01-28,Utility,US20150233934A1,2015-08-20,9945051.0,2018-04-17,COLOR-ENCODING AND IN-SITU INTERROGATION OF MA...,...,False,2015,Richard,,Ebright,1,New Brunswick,NJ,US,486


In [119]:
nj_univ_appl_inv_gender_df.head()

Unnamed: 0,application_number,univ,series,filing_date,application_invention_type,earliest_pgpub_number,earliest_pgpub_date,patent_number,patent_issue_date,invention_title,...,inventor_rank,inventor_city_name,inventor_region_code,inventor_country_code,inv_id,level,gender,F,M,?
982,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,...,1,Livingston,NJ,US,794,1.0,M,0.0,1.0,0.0
387,14605053,Stevens,14,2015-01-26,Utility,US20150235073A1,2015-08-20,,,FLEXIBLE PART-BASED REPRESENTATION FOR REAL-WO...,...,2,Hoboken,NJ,US,1052,3.0,M,0.0,1.0,0.0
3029,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,...,1,Turnersville,NJ,US,1296,2.0,M,0.0,1.0,0.0
2168,14417704,Rowan,14,2015-01-27,Utility,US20150198600A1,2015-07-16,,,Autoantibody Profiles in the Early Detection a...,...,2,Turnersville,NJ,US,1295,2.0,M,0.0,1.0,0.0
2983,14607496,Rutgers,14,2015-01-28,Utility,US20150233934A1,2015-08-20,9945051.0,2018-04-17,COLOR-ENCODING AND IN-SITU INTERROGATION OF MA...,...,1,New Brunswick,NJ,US,486,2.0,M,0.0,1.0,0.0


In [120]:
nj_univ_appl_inv_gender_df.columns

Index(['application_number', 'univ', 'series', 'filing_date',
       'application_invention_type', 'earliest_pgpub_number',
       'earliest_pgpub_date', 'patent_number', 'patent_issue_date',
       'invention_title', 'small_entity_indicator', 'aia_first_to_file',
       'filing_year', 'inventor_name_first', 'inventor_name_middle',
       'inventor_name_last', 'inventor_rank', 'inventor_city_name',
       'inventor_region_code', 'inventor_country_code', 'inv_id', 'level',
       'gender', 'F', 'M', '?'],
      dtype='object')

In [121]:
nj_univ_appl_inv_gender_df.groupby(by=["univ","gender","filing_year"]).size()

univ     gender     filing_year
NJIT     F          2015           10
                    2016            6
                    2017            4
                    2018            5
                    2019            8
                                   ..
Stevens  not found  2017            1
                    2019            5
                    2020            4
                    2021            3
                    2022           10
Length: 124, dtype: int64

In [122]:
nj_univ_appl_inv_gender_df.groupby(by=["univ","gender","filing_year"])['inv_id'].nunique()

univ     gender     filing_year
NJIT     F          2015           9
                    2016           6
                    2017           4
                    2018           5
                    2019           8
                                  ..
Stevens  not found  2017           1
                    2019           5
                    2020           4
                    2021           3
                    2022           9
Name: inv_id, Length: 124, dtype: int64

In [123]:
nj_univ_appl_inv_gender_df.groupby(by=["univ","application_invention_type","filing_year","gender"])['inv_id'].nunique()

univ     application_invention_type  filing_year  gender   
NJIT     Provisional                 2016         F             1
                                                  M             5
                                                  not found     2
                                     2017         M             4
                                     2018         F             1
                                                               ..
Stevens  Utility                     2021         F             7
                                                  M            13
                                     2022         F             3
                                                  M            24
                                                  not found     9
Name: inv_id, Length: 204, dtype: int64

In [124]:
nj_univ_appl_inv_gender_df.groupby(by=["univ","application_invention_type","filing_year","gender"])['application_number'].nunique()

univ     application_invention_type  filing_year  gender   
NJIT     Provisional                 2016         F             1
                                                  M             3
                                                  not found     2
                                     2017         M             2
                                     2018         F             1
                                                               ..
Stevens  Utility                     2021         F             4
                                                  M             4
                                     2022         F             4
                                                  M            11
                                                  not found     8
Name: application_number, Length: 204, dtype: int64

In [125]:
nj_univ_appl_inv_gender_df.to_csv('first_results_with_asg.csv')

In [126]:
print(appl_dict)

{'17435954': {'Rutgers'}, '17914647': {'Rutgers'}, '18005188': {'Princeton'}, '18245256': {'Rutgers'}, '12539049': {'NJIT'}, '13624330': {'Princeton'}, '13625527': {'Rutgers'}, '13647239': {'Princeton'}, '13647861': {'Princeton'}, '13650759': {'Stevens'}, '13651296': {'NJIT'}, '13651829': {'NJIT'}, '13651977': {'NJIT'}, '13652094': {'Princeton'}, '13652408': {'Rowan'}, '13654324': {'Rutgers'}, '61716348': {'Princeton'}, '13661017': {'NJIT'}, '13673590': {'Rutgers'}, '13675685': {'Princeton'}, '13680890': {'Princeton'}, '13687644': {'Rutgers'}, '13690831': {'Stevens'}, '13693024': {'Rutgers'}, '13717142': {'Rutgers'}, '13719605': {'Princeton'}, '13729435': {'NJIT'}, '13732674': {'Rutgers'}, '13694820': {'Rutgers'}, '13751690': {'Rutgers'}, '13755562': {'Rutgers'}, '61759113': {'Rutgers'}, '13757752': {'Rutgers'}, '61761436': {'Princeton'}, '61765284': {'Princeton'}, '13773248': {'Princeton'}, '13773806': {'Princeton'}, '13776703': {'Rutgers'}, '13780941': {'Princeton'}, '13786887': {'Pr

In [132]:
pwd

'C:\\Users\\user\\Downloads'

In [134]:
!git init

Initialized empty Git repository in C:/Users/user/Downloads/.git/


In [136]:
!git remote add origin https://github.com/megh221322/USPTO_Data_Analysis