*Step 1: Import the Python Modules and the Datasets*

In [1]:
import numpy as np
import pandas as pd
import bq_helper
from bq_helper import BigQueryHelper
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

*Step 2: Load Both Datasets*

In [2]:
internal_candidates = pd.read_csv('/kaggle/input/private-g-leadership-staffing-data/L8SWE mapped talent - Internal.csv')
external_candidates = pd.read_csv('/kaggle/input/private-g-leadership-staffing-data/L8SWE mapped talent - External.csv')
combined_candidates = pd.concat([internal_candidates, external_candidates])
list_of_names = combined_candidates['Name'].tolist()

In [3]:
# Add additional names that are not in the "list_of_names"
another_list = ['SARA MORROW', 'LEILANIM LINDFIELD']

In [4]:
list_of_names = list_of_names + another_list
list_of_names = [x.upper() for x in list_of_names]
patents = bq_helper.BigQueryHelper(active_project="patents-public-data",dataset_name="patents") 

Using Kaggle's public dataset BigQuery integration.


*Step 3: Write queries for the Google Patents Public BigQuery Dataset*

In [5]:
inventor_query = """
WITH temp1 AS (
    SELECT
      DISTINCT
      PUB.country_code,
      PUB.application_number AS patent_number,
      inventor_name
    FROM
      `patents-public-data.patents.publications` PUB
    CROSS JOIN
      UNNEST(PUB.inventor) AS inventor_name
    WHERE
          PUB.grant_date > 0
      AND PUB.country_code IS NOT NULL
      AND PUB.application_number IS NOT NULL
      AND PUB.inventor IS NOT NULL
)
SELECT
  *
FROM (
    SELECT
     temp1.country_code AS country,
     temp1.inventor_name AS inventor,
     COUNT(temp1.patent_number) AS count_of_patents
    FROM temp1
    GROUP BY
     temp1.country_code,
     temp1.inventor_name
     )
WHERE
 count_of_patents > 0
;
"""

*Step 4: Use your queries to download data from the Google Patents Public BigQuery Dataset*

In [6]:
print('Query Size: ', patents.estimate_query_size(inventor_query), 'GB')
inventor_query_results = patents.query_to_pandas_safe(inventor_query, max_gb_scanned=7)
top_inventors_in_both_datasets = inventor_query_results[inventor_query_results.inventor.isin(list_of_names)].nlargest(500,'count_of_patents')

Query Size:  6.5854929238557816 GB


*Step 5: Which names from the "list_of_names" dataset are associated with the largest number of patents?*

In [7]:
print('Most Prolific Inventors That Are Also In "list_of_names":')
inventors_in_both_datasets_table = ff.create_table(top_inventors_in_both_datasets)
py.iplot(inventors_in_both_datasets_table, filename='jupyter-table1')

Most Prolific Inventors That Are Also In "list_of_names":


*Step 6: Sum results for names that show up twice (associated with two different countries)*

In [8]:
top_inventors_in_both_datasets_with_combined_duplicates = top_inventors_in_both_datasets.groupby(top_inventors_in_both_datasets.iloc[:,1]).sum()
top_inventors_in_both_datasets_with_combined_duplicates = top_inventors_in_both_datasets_with_combined_duplicates.reset_index()
top_inventors_in_both_datasets_with_combined_duplicates = top_inventors_in_both_datasets_with_combined_duplicates.sort_values(by=['count_of_patents'],ascending=False)
pd.options.display.max_rows = 9999
top_inventors_in_both_datasets_with_combined_duplicates.head(9999)

Unnamed: 0,inventor,count_of_patents
109,LI XIAO,2267
191,YUAN YUAN,1700
113,LU LU,584
81,JIA LI,477
182,WEI LIU,454
187,YING WANG,191
97,JUN XU,156
114,LU XUN,130
108,LI DENG,100
131,MING ZHAO,99


*Step 7: Save the results as a .CSV file that can be opened using Google Sheets*

In [9]:
print('# of inventors in both datasets after combining duplicates: ', top_inventors_in_both_datasets.shape[0])
top_inventors_in_both_datasets_with_combined_duplicates.to_csv('candidates_sorted_by_patent_number.csv',index=False)

# of inventors in both datasets after combining duplicates:  307


*Step 8: Look up specific individuals*

In [10]:
look_up_these_individuals = ['SARA MORROW', 'LEILANIM LINDFIELD','LI XIAO','MING ZHAO']
smaller_list = top_inventors_in_both_datasets_with_combined_duplicates[top_inventors_in_both_datasets_with_combined_duplicates.inventor.isin(look_up_these_individuals)]
smaller_list.head(9999)

Unnamed: 0,inventor,count_of_patents
109,LI XIAO,2267
131,MING ZHAO,99
