# Reading patent data

This notebook contains code to download and process patent data to generate indicators to be used in the BEIS innovation indicators project.

**Note that this notebook requires access to Nesta's data production system**

The PATSTAT data we are using in this analysis is proprietary and therefore it is not possible for us to make it available in a raw format.

Patent data is complex. For reference, see the [data dictionary](https://github.com/nestauk/patent_analysis/raw/master/references/patstat_data_dict.pdf) and the [guide](https://github.com/nestauk/patent_analysis/raw/master/references/The_Patents_Guide_2nd_edition.pdf).

### Preamble

In [None]:
%run ../notebook_preamble.ipy

from data_getters.labs.core import download_file
import random


In [None]:
def patent_download(file_path=None, progress=True):
    """ Fetch the combined patent applications file

    Thin wrapper around `download_file` for the item
    "Scotland_temp/15_10_2019_patents_combined.csv". In this notebook the
    result of calling it without arguments is passed straight to
    `pd.read_csv`.

    Repo: http://github.com/nestauk/patent_analysis
    Commit: cb11b3f
    File: https://github.com/nestauk/patent_analysis/blob/master/notebooks/02-jmg-patent_merge.ipynb

    Args:
        file_path (`str`, optional): Path to download to. If None, stream file.
        progress (`bool`, optional): If `True` and `file_path` is not `None`,
            display download progress.

    Returns:
        Whatever `download_file` returns for this item (presumably a
        file-like stream when `file_path` is None - confirm against
        `data_getters.labs.core.download_file`).
    """
    itemname = "Scotland_temp/15_10_2019_patents_combined.csv"
    return download_file(itemname, file_path, progress)

In [None]:
def make_data_dict(table,name,path,sample=5):
    '''
    Write a blank data-dictionary template for a dataframe to a csv file.

    The output has one row per column of the table with its name, a type
    guessed by estimate_type from a random sample of values, and an empty
    description to be filled in by hand.

    Args:
        -table (df) is the df we want to create the data dictionary for
        -name (str) of the df, used in the name of the output file
        -path (str) is the folder where we want to save the file
        -sample (int) is the number of values per column used to guess its type

    '''
    column_names = table.columns

    #Guess a type for every column
    guessed_types = [estimate_type(table[col],sample=sample) for col in column_names]

    #Assemble the template column by column
    data_dict = pd.DataFrame()
    data_dict['variable'] = column_names
    data_dict['type'] = guessed_types
    data_dict['description'] = [''] * len(data_dict['variable'])

    #The file name is prefixed with today_str, which has to exist in the session already
    destination = os.path.join(path,f'{today_str}_{name}.csv')

    data_dict.to_csv(destination)
    

def estimate_type(variable,sample):
    '''
    Estimates the type of a column from a random sample of its values.

    Args:
        variable (iterable) with values
        sample (int) is the maximum number of values to test. If the
            variable has fewer values than this, all of them are tested.

    Returns:
        The most frequent Python type among the sampled values, or None
        if there are no values to sample.

    '''

    values = list(variable)

    #random.sample raises a ValueError when asked for more values than are
    #available, so short columns are sampled in full instead
    selection = random.sample(values,min(sample,len(values)))

    #Nothing to inspect (empty column or sample of zero)
    if not selection:
        return None

    #value_counts sorts by frequency, most common first
    types = pd.Series([type(x) for x in selection]).value_counts()

    return types.index[0]

                           
                           
    
from ast import literal_eval

def flat_freq(a_list):
    '''
    Count how often each category appears across all the inner lists of a nested list.

    Returns a pandas Series indexed by category, most frequent first.

    '''
    flattened = []

    #Pool the elements of every inner list before counting
    for inner in a_list:
        flattened.extend(inner)

    return pd.Series(flattened).value_counts()

def emergent_load_check(path,check=True):
    '''
    This function loads the results from the analysis of emerging technologies in the patent / research / glass data
    and, optionally, prints some checks.

    It cleans variable and category names, outputs counts by category and top keywords by strategic area.

    Args:
        path (str) is the path for the data. This could be modified to include a data_getter path
        check (boolean) is whether we want to check the data or just load it

    Returns:
        df with lower case, underscore-separated column names. The keywords and strategic_priority
        columns hold lists of stripped, lower case strings.

    '''

    #Some parsing of variables: the list columns are read back from their string form
    data = pd.read_csv(path,
                       dtype={'appln_id':str},
                       converters={
                           'proc_text':literal_eval,
                           'keywords':literal_eval,
                           'Strategic Priority':literal_eval})

    #Tidy variable names
    data.columns = [re.sub(' ','_',x.lower()) for x in data.columns]

    #Strip whitespace and lower case the categories
    for v in ['keywords','strategic_priority']:
        data[v] = [[x.strip().lower() for x in el] for el in data[v]]

    #If we want to check the data
    if check:

        print('Checking')
        print('====')

        #What are the strategic areas?
        area_counts = flat_freq(data['strategic_priority'])

        print('Strategic area distribution')
        print('\n')

        print(area_counts)
        print('\n')

        areas = area_counts.index

        #Keyword frequencies

        #Extracts the keyword distribution per area: do we have the right distributions?
        kw_freq = [
            flat_freq(data.loc[[area in priorities for priorities in data['strategic_priority']]]['keywords'])
            for area in areas]

        print('Top keyword distribution per area')
        print('\n')

        #Print them
        for t,r in zip(areas,kw_freq):

            print(t)
            print('=====')

            print(r.head(n=20))

            print('\n')

        print('Area combinations')
        print('=======')

        #Lists are not hashable, so the combinations are counted as tuples
        print(pd.Series([tuple(el) for el in data['strategic_priority']]).value_counts())

    #Return
    return data
        

def flatten_list(a_list):
    '''
    Collapse a nested list into a single flat list, keeping the original order.

    '''
    flat = []

    for inner in a_list:
        flat.extend(inner)

    return flat

        

## 1. Read data

We read a patent dataset based on the processing and analysis that we undertook [here](https://github.com/nestauk/patent_analysis)

In the patent file that we read every row is a patent application and the columns contain information about it. In some cases, the columns contain lists of applicants, IPC codes and other things.



In [None]:
from ast import literal_eval


In [None]:
#Fetch the combined patent file; no file_path is passed to patent_download
p_d = patent_download()

#Load it into a dataframe, reading appln_id as text
p = pd.read_csv(p_d,dtype={'appln_id':str})

In [None]:
# p = pd.read_csv('https://nesta-data-getters.s3.eu-west-2.amazonaws.com/Scotland_temp/11_10_2019_patents_combined.csv',dtype={'appln_id':str})

# #p = pd.read_csv('/Users/jmateosgarcia/Desktop/patents/patents//data/processed/11_10_2019_patents_combined.csv',dtype={'appln_id':str})

In [None]:
#Preview the first rows of the patent table
p.head()

In [None]:
#Several columns hold Python lists stored as strings: turn them back into lists

#These are the columns that contain lists
list_vars = [
    'appl_psn_name','appl_person_address','appl_laua','appl_lad_name','appl_uk_postcode_long',
    'inv_psn_name','inv_person_address','inv_laua','inv_lad_name','inv_uk_postcode_long',
    'tf_weight','tf_techn_field_nr','tf_techn_field','ipc_class_symbol_proc_10',
    'appl_nuts_name','inv_nuts_name','appl_nuts','inv_nuts',
]

for v in list_vars:

    #Show which column is being processed
    print(v)

    #Missing cells stay missing, everything else is evaluated as a Python literal
    p[v] = [np.nan if pd.isnull(raw) else literal_eval(raw) for raw in p[v]]

    #In the geography columns, values that are not lists or that only contain
    #'missing' are turned back into missing values
    if any(marker in v for marker in ('lad','laua','nuts','ttwa')):
        p[v] = [
            codes if type(codes) is list and not all(code=='missing' for code in codes) else np.nan
            for codes in p[v]
        ]
        

### Metadata: NUTS lookup

Read [NUTS lookup](http://geoportal1-ons.opendata.arcgis.com/datasets/9b4c94e915c844adb11e15a4b1e1294d_0.csv)

In [None]:
#Download the ONS lookup table linked above into a dataframe
nuts = pd.read_csv('http://geoportal1-ons.opendata.arcgis.com/datasets/9b4c94e915c844adb11e15a4b1e1294d_0.csv')

In [None]:
#NUTS 2 lookup: code -> name, using the first row found for each code
nuts_2_code_name_lookup = (
    nuts
    .drop_duplicates(subset='NUTS218CD')
    .set_index('NUTS218CD')['NUTS218NM']
    .to_dict()
)

In [None]:
#Scottish LADs are the ones whose code starts with 'S': build a code -> name lookup for them
lads_scotland = (
    nuts.loc[[code[0]=='S' for code in nuts['LAD18CD']], ['LAD18CD','LAD18NM']]
    .drop_duplicates(subset='LAD18CD')
    .set_index('LAD18CD')['LAD18NM']
    .to_dict()
)

In [None]:
#This is a NUTS code - lookup name for all NUTS codes regardless of their level
#It is read from a local JSON file and later indexed by NUTS code to get the name
with open('../../../data/aux/patstat_nuts_lookup.json','r') as infile:
    nuts_patstat_lookup = json.load(infile)

### Generate counts of activity for Scotland

Strategy:

* LAD labels: Label patent ids with an "is in Scotland" flag and extract the Scottish LADs that appear in the locations
* NUTS label: same thing
* Create counts of activity by patent family (to focus on invention and avoid double counting)
* Create counts of activity by LAD

Label patent ids with Scotland info

In [None]:
geo_code_vars = ['inv_laua','appl_laua','inv_nuts','appl_nuts']

#Scottish LAD codes as a set, built once rather than once per patent
scottish_lad_codes = set(lads_scotland)

#This loop goes over lists of geo codes in the patents looking for Scottish codes and then also extracts them.
#Patents without a list of codes get a missing value in both new columns.

for c in geo_code_vars:

    if 'laua' in c:

        #Is there any overlap between the LADs in a patent and the Scottish list of LADs?
        p[c+'_scotland'] = [
            bool(scottish_lad_codes.intersection(lads)) if isinstance(lads,list) else np.nan
            for lads in p[c]]

        #What are the names of the Scottish LADs?
        p[c+'_scot_names'] = [
            [lads_scotland[x] for x in lads if x in lads_scotland] if isinstance(lads,list) else np.nan
            for lads in p[c]]

    if 'nuts' in c:

        #Here we make the variable missing if the information is only available at the UK level.
        #Otherwise Scottish shares appear underrepresented.
        #An empty list carries no location either, so it is also treated as missing
        #(this used to fail when looking up its first element).
        p[c+'_scotland'] = [
            np.nan if not isinstance(codes,list) or not codes or set(codes)=={'UK'}
            else any('UKM' in code for code in codes)
            for codes in p[c]]

        #UKM are the Scottish NUTS
        p[c+'_scot_names'] = [
            [nuts_patstat_lookup[x] for x in codes if 'UKM' in x] if isinstance(codes,list) else np.nan
            for codes in p[c]]

In [None]:
#Names of the Scotland flag columns and of the Scottish place-name columns, to simplify querying
scot_vars = [f'{geo_var}_scotland' for geo_var in geo_code_vars]
scot_names = [f'{geo_var}_scot_names' for geo_var in geo_code_vars]

In [None]:
#This is the share of patent applications flagged as Scottish, for each of the four geography variables
#NOTE(review): presumably applications with a missing flag are left out of the mean - confirm
p[scot_vars].mean()

In [None]:
#Levels of activity by Scottish LAD: how often each LAD name appears
#among inventors and among applicants, side by side
p_lad_freqs = pd.concat(
    [flat_freq(p[name_var].dropna()) for name_var in scot_names[:2]],
    axis=1,
)
p_lad_freqs.columns = scot_names[:2]

#Top five LADs by inventor activity
p_lad_freqs.sort_values(by=scot_names[0],ascending=False).head(5)

#### Create additional variables

Here we create additional variables to simplify the analysis of the Scottish data


In [None]:
#Dummies for whether a patent application involves an inventor or an applicant in a Scottish LAD or NUTS region
#The flags are compared column-wise rather than row by row with iterrows, which is much slower on a large table.
#Missing flags are not equal to True, so they count as not Scottish.
p['scot_inv'] = p['inv_laua_scotland'].eq(True) | p['inv_nuts_scotland'].eq(True)

p['scot_applicant'] = p['appl_laua_scotland'].eq(True) | p['appl_nuts_scotland'].eq(True)


In [None]:
#One df with a column per Scottish LAD (in alphabetical order) saying whether
#the LAD appears among the inventor LADs of each patent application.
#Applications without a list of inventor LADs get a missing value.
scot_lad_dummies = pd.concat(
    [
        pd.Series(
            [lad_name in names if type(names) is list else np.nan for names in p['inv_lad_name']],
            name=lad_name,
        )
        for lad_name in sorted(lads_scotland.values())
    ],
    axis=1,
)

In [None]:
#Add the Scottish LAD dummy columns to the patent table
p_final = pd.concat([p,scot_lad_dummies],axis=1)

In [None]:
#Show the Scottish LAD names (they are also the names of the dummy columns)
lads_scotland.values()

In [None]:
#Columns kept in the cleaned output: application metadata, applicant and inventor details,
#abstract and technology fields, the Scotland flags and names, the priority area columns
#and one dummy column per Scottish LAD
p_final_keep = ['appln_id','ipr_type','granted','appln_auth','appln_filing_year',
                'docdb_family_id','nb_citing_docdb_fam',
                'appl_psn_name', 'appl_psn_id', 'appl_psn_sector',
                'appl_person_address', 'appl_uk_postcode_long','appl_laua', 'appl_lad_name', 'appl_nuts',
                'inv_psn_name', 'inv_psn_id', 'inv_psn_sector', 
                'inv_person_address','inv_uk_postcode_long','inv_laua','inv_lad_name', 'inv_nuts',
                 'appln_abstract_lg', 'appln_abstract','tf_weight', 'tf_techn_field_nr', 'tf_techn_field',
                'ipc_class_symbol_proc_10', 'inv_nuts_name', 'appl_nuts_name',
               'raw_ids', 'is_ai_ipo', 'inv_laua_scotland', 'inv_laua_scot_names',
               'appl_laua_scotland', 'appl_laua_scot_names', 'inv_nuts_scotland',
               'inv_nuts_scot_names', 'appl_nuts_scotland', 'appl_nuts_scot_names',
               'priority_transport_aerospace', 'priority_industrial_technologies',
               'priority_scientific_biomedical', 'priority_digital_applications',
               'priority_data_analytics_ai', 'priority_ict',
               'priority_environmental_technologies', 'scot_inv', 'scot_applicant',
              'Aberdeen City', 'Aberdeenshire', 'Angus', 'Argyll and Bute',
               'City of Edinburgh', 'Clackmannanshire', 'Dumfries and Galloway',
               'Dundee City', 'East Ayrshire', 'East Dunbartonshire', 'East Lothian',
               'East Renfrewshire', 'Falkirk', 'Fife', 'Glasgow City', 'Highland',
               'Inverclyde', 'Midlothian', 'Moray', 'Na h-Eileanan Siar',
               'North Ayrshire', 'North Lanarkshire', 'Orkney Islands',
               'Perth and Kinross', 'Renfrewshire', 'Scottish Borders',
               'Shetland Islands', 'South Ayrshire', 'South Lanarkshire', 'Stirling',
               'West Dunbartonshire', 'West Lothian']


#Subset the table to the selected columns, in the order listed
p_final_2 = p_final[p_final_keep]

In [None]:
#Preview the first rows of the cleaned table
p_final_2.head()

In [None]:
#Number of rows and columns in the cleaned table
p_final_2.shape

In [None]:
#Save the cleaned table; the file name is prefixed with today_str
p_final_2.to_csv(f'../../data/processed/{today_str}_patent_applications_cleaned.csv')

In [None]:
#Concatenate patents in Scotland
#NOTE(review): p_em and priority_areas are not defined in the cells shown here - confirm where they are created

#For each priority area, cross-tabulate the Scottish inventor LAD flag against the area column,
#normalise within columns and keep the column for patents in the area (True).
#The result has one column per priority area.
scottish_em_patents = pd.concat([pd.crosstab(p_em['inv_laua_scotland'],p_em[x],normalize=1)[True] for x in priority_areas],axis=1)
scottish_em_patents.columns = priority_areas

#Show as percentages, one row per priority area
100*scottish_em_patents.T

#### Focus on patent families

Strategy: group by patent family id (`docdb_family_id`) and sum the number of occurrences of each priority area across the applications in the family.

We can merge these with the 


In [None]:
# Need to get patent families
# Sum each priority area column over the applications that share a docdb_family_id

agg = p_em.groupby(['docdb_family_id'])[priority_areas].sum()

#agg = pd.concat([p_em.groupby(['docdb_family_id'])[p].sum() for p in priority_areas],axis=1)

In [None]:
#Cast the family-level sums to integers
pat_fam_id = agg.astype(int)

#Total for each priority area across all families
pat_fam_id.sum()

In [None]:
#NOTE(review): this line looks unfinished - it reads a column with an empty name from p_em.
#Confirm which column has_scotland is meant to be based on.
p_em['has_scotland'] = p_em['']

### Get individual company /organisation information

In [None]:
#Reading it from the patents file...Should replace this
#NOTE(review): this is an absolute path on the author's machine, so the cell will not run elsewhere

pat_person = pd.read_csv('/Users/jmateosgarcia/Desktop/patents/patents/data/processed/11_10_2019_person_profiles.csv',
                        dtype={'appln_id':str})

In [None]:
#Flag whether the laua code starts with 'S'; values that are not strings get a missing flag
pat_person['scot_lad'] = [x[0]=='S' if type(x)==str else np.nan for x in pat_person['laua']]

In [None]:
#Join the person profiles with em on appln_id
#NOTE(review): em is not defined in the cells shown here - confirm where it is created
pat_person_em = pd.merge(pat_person,em,left_on='appln_id',right_on='appln_id')

In [None]:
#Sum the priority area columns for each person (id and name)
pat_person_grouped = pat_person_em.groupby(['person_id','person_name'])[priority_areas].sum()

In [None]:
#Top 20 persons by their total in the data analytics / AI priority area
pat_person_grouped.sort_values('priority_data_analytics_ai',ascending=False).head(n=20)

The above can be subset by sector

### Some validation

How do these figures compare with European patenting stats?

In [None]:
#Download the Eurostat bulk file pat_ep_rtot: gzipped, tab separated, with ': ' read as missing
pat = pd.read_csv('https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/pat_ep_rtot.tsv.gz',
                  compression='gzip',delimiter='\t',na_values=[': '])

In [None]:
#Keep the rows whose key (first column) contains 'NR,' (presumably the series measured in numbers - confirm)
#An explicit copy is taken because a column is added to pat_n later on,
#which would otherwise raise a SettingWithCopyWarning
pat_n = pat.loc[['NR,' in x for x in pat.iloc[:,0]]].copy()

In [None]:
#The key in the first column is comma separated: its second element is stored as geo
pat_n['geo'] = [x.split(',')[1] for x in pat_n.iloc[:,0]]

In [None]:
#Keep the rows whose geo code contains 'UKM' (the Scottish NUTS codes)
scot = pat_n.loc[['UKM' in x for x in pat_n['geo']]]

In [None]:
#Remove surrounding whitespace from the column names so that the year columns can be selected by name
scot.columns = [x.strip() for x in scot.columns]

In [None]:
#Total of the 2011 column over the selected rows
#NOTE(review): geo codes at several NUTS levels all contain 'UKM', so this sum may count the same patents more than once - confirm
scot['2011'].astype(float).sum()

In [None]:
#For comparison: number of patent applications in our data with an applicant in a Scottish LAD
p['appl_laua_scotland'].sum()