# Patent analysis

Here we explore and integrate the patent data and create a clean dataset for analysis.

This includes:

* Identifying Scottish organisations
* Deciding what to do about organisation names
* Deciding what to do about sectors
* Merging with the strategic priority analysis
* Documenting and saving

Patent data is complex. See the [data dictionary](https://github.com/nestauk/patent_analysis/raw/master/references/patstat_data_dict.pdf) and the [guide](https://github.com/nestauk/patent_analysis/raw/master/references/The_Patents_Guide_2nd_edition.pdf) for reference.

### Preamble

In [None]:
%run ../notebook_preamble.ipy

from data_getters.labs.core import download_file
import random
from ast import literal_eval

In [None]:
def make_dirs(name,dirs = ('raw','processed')):
    '''
    Utility that creates directories to save the data.

    For every entry ``d`` in ``dirs`` it makes sure that the folder
    ``../../data/{d}/{name}`` exists (the path is relative to the current
    working directory). Missing parent folders are created as well, and
    folders that already exist are left untouched, so the function can be
    called repeatedly.

    Args:
        name (str) is the name of the folder to create inside each data directory
        dirs (iterable of str) are the data directories where we create the folder

    Raises:
        FileExistsError if one of the target paths exists but is not a directory

    '''

    for d in dirs:
        # exist_ok avoids the check-then-create race of listing the parent first,
        # and makedirs also copes with a parent directory that does not exist yet
        os.makedirs(f'../../data/{d}/{name}', exist_ok=True)

In [None]:
#This is to download the data from nesta data getters

def patent_download(file_path=None, progress=True):
    """ Fetch the combined patents CSV through `download_file`

    Repo: http://github.com/nestauk/patent_analysis
    Commit: cb11b3f
    File: https://github.com/nestauk/patent_analysis/blob/master/notebooks/02-jmg-patent_merge.ipynb

    Args:
        file_path (`str`, optional): Path to download to. If None, stream file.
        progress (`bool`, optional): If `True` and `file_path` is not `None`,
            display download progress.

    Returns:
        Whatever `download_file` returns for the requested item.
    """
    return download_file(
        "Scotland_temp/15_10_2019_patents_combined.csv", file_path, progress
    )

In [None]:
def make_data_dict(table,name,path,sample=5):
    '''
    Writes a data dictionary template for a dataframe as a csv file.

    The template has one row per column of the table, with the column name
    ('variable'), the type estimated by estimate_type ('type') and an empty
    'description' field to be filled in by hand. It is saved in `path` as
    '{today_str}_{name}.csv'. Nothing is returned.

    Args:
        -table (df) is the df we want to create the data dictionary for
        -name (str) of the df
        -path (str) is the place where we want to save the file
        -sample (int) is the number of values per column passed to estimate_type

    '''

    #Estimate one type per column
    column_types = []
    for column in table.columns:
        column_types.append(estimate_type(table[column],sample=sample))

    #Assemble the template
    template = pd.DataFrame()
    template['variable'] = table.columns
    template['type'] = column_types
    template['description'] = [''] * len(template['variable'])

    #Save it
    template.to_csv(os.path.join(path,f'{today_str}_{name}.csv'))
    

def estimate_type(variable,sample):
    '''
    Estimates the type of a column.

    A random sample of values is drawn and the most frequent Python type
    among them is returned.

    Args:
        variable (iterable) with values
        sample (n) is the number of values to test. If the variable has fewer
            values than this, all of them are tested.

    Returns:
        The most common type in the sample, or None if there are no values to test.

    '''
    from collections import Counter

    values = list(variable)

    #random.sample fails if we ask for more values than are available, so cap the sample size
    selection = random.sample(values,min(sample,len(values)))

    #Nothing to inspect (empty column or sample of zero)
    if not selection:
        return(None)

    types = Counter(type(x) for x in selection)

    return(types.most_common(1)[0][0])

                
def flat_freq(a_list):
    '''
    Counts how often each item appears across all the sub-lists of a nested list.

    Args:
        a_list (iterable of iterables) with the items to count

    Returns:
        A pandas Series (the output of value_counts) with the items in the index and their counts as values

    '''
    flattened = []

    for inner in a_list:
        flattened.extend(inner)

    return(pd.Series(flattened).value_counts())

        

def flatten_list(a_list):
    '''
    Flattens a nested list by one level, keeping the order of the items.

    Args:
        a_list (iterable of iterables) to flatten

    Returns:
        A list with the items of every sub-list

    '''
    flattened = []

    for inner in a_list:
        flattened.extend(inner)

    return(flattened)

        

In [None]:
#Create the 'patents' folders inside ../../data/raw and ../../data/processed if they are not there yet
make_dirs('patents',['raw','processed'])

## 1. Read data

We read a patent dataset based on the processing and analysis that we undertook [here](https://github.com/nestauk/patent_analysis).

In the patent file that we read, every row is a patent application and the columns contain information about it. Some columns contain lists (for example applicants, inventors or IPC codes) rather than single values.



In [None]:
#Fetch the patent file; no file_path is given, so nothing is written to a path of our choosing
p_d = patent_download()

#Parse whatever patent_download returned as a csv
p = pd.read_csv(p_d)

In [None]:
#Quick look at the first rows
p.head()

In [None]:
#Some columns store Python lists written out as strings. We turn them back into lists.

#Columns that hold stringified lists
list_vars = [
    'appl_psn_name','appl_person_address','appl_laua','appl_lad_name','appl_uk_postcode_long',
    'inv_psn_name','inv_person_address','inv_laua','inv_lad_name','inv_uk_postcode_long',
    'tf_weight','tf_techn_field_nr', 'tf_techn_field', 'ipc_class_symbol_proc_10',
    'appl_nuts_name','inv_nuts_name','appl_nuts','inv_nuts',
]

for v in list_vars:

    #Trace which column is being processed
    print(v)

    #Evaluate every non-null cell; null cells remain NaN
    parsed = []
    for cell in p[v]:
        if pd.isnull(cell):
            parsed.append(np.nan)
        else:
            parsed.append(literal_eval(cell))
    p[v] = parsed

    #In the geography columns, a list made only of 'missing' placeholders goes back to NaN
    if any(l in v for l in ['lad','laua','nuts','ttwa']):
        restored = []
        for var in p[v]:
            if type(var)==list and not all(x=='missing' for x in var):
                restored.append(var)
            else:
                restored.append(np.nan)
        p[v] = restored
        

### Metadata: NUTS lookup

Read [NUTS lookup](http://geoportal1-ons.opendata.arcgis.com/datasets/9b4c94e915c844adb11e15a4b1e1294d_0.csv)

In [None]:
#Load the lookup csv straight from its url
nuts = pd.read_csv('http://geoportal1-ons.opendata.arcgis.com/datasets/9b4c94e915c844adb11e15a4b1e1294d_0.csv')

In [None]:
#This is a NUTS 2 lookup: a dict from each NUTS218CD code to its NUTS218NM name
#(the table has several rows per code, so we keep the first row for each one)
nuts_2_code_name_lookup = nuts.drop_duplicates('NUTS218CD').set_index('NUTS218CD')['NUTS218NM'].to_dict()

In [None]:
#Scottish LADs: rows whose LAD18CD code starts with 'S', as a dict from LAD18CD code to LAD18NM name
lads_scotland = (
    nuts.loc[[code[0]=='S' for code in nuts['LAD18CD']], ['LAD18CD','LAD18NM']]
    .drop_duplicates('LAD18CD')
    .set_index('LAD18CD')['LAD18NM']
    .to_dict()
)

In [None]:
#This is a NUTS code - lookup name for all NUTS codes regardless of their level
#It is read from a json file in the aux data folder
with open('../../data/aux/patstat_nuts_lookup.json','r') as infile:
    nuts_patstat_lookup = json.load(infile)

## 2. Process data

Create count of applications and inventions per NUTS area in the last two years. We focus on patent families to avoid double counting.

In [None]:
def count_patenting_in_nuts(df,variable,nuts_lookup,pat_fam_id='docdb_family_id'):
    '''
    This function counts the patent families in which each NUTS2 area takes part.

    Note that the NUTS areas are not available at a standardised level of resolution.
    We cut every code to its first four characters (the NUTS2 level) and throw away
    codes shorter than that, because they cannot be assigned to a NUTS2 area.

    The count is binary for each family: an area is counted once per family in which
    it appears, regardless of how many participants or applications it has there.
    Counting participants would require a different approach using a person - patent lookup.
    Codes are cut to NUTS2 *before* removing duplicates inside a family, so a family
    with several finer-grained codes inside the same NUTS2 area is counted only once.

    Args:
        df (dataframe) is the df with the patent information. Each row is a patent id and the
            columns contain metadata, authorship etc. df[variable] holds lists of NUTS codes
            (strings) or missing values.
        variable (str) is the variable we want to use in the analysis
        nuts_lookup (dict) is the lookup from NUTS2 code to name
        pat_fam_id (str) is the patent family variable that we want to focus on

    Returns:
        A dataframe indexed by (nuts_name, nuts_code), sorted by nuts_code, with a single
        column named after `variable` that contains the number of patent families.

    Raises:
        KeyError if a NUTS2 code found in the data is not in nuts_lookup

    '''

    def _family_nuts2(code_lists):
        #Distinct NUTS2 codes over all the applications of one family.
        #Codes with fewer than four characters are coarser than NUTS2 and are dropped.
        return(sorted({code[:4] for codes in code_lists for code in codes if len(code)>=4}))

    #One list of distinct NUTS2 codes per patent family (rows without codes are ignored)
    fam = df.dropna(axis=0,subset=[variable]).groupby(pat_fam_id)[variable].apply(_family_nuts2)

    #Number of families per NUTS2 code
    nuts_freqs = pd.Series([code for codes in fam for code in codes],dtype=object).value_counts()

    nuts_group = nuts_freqs.sort_index().rename_axis('nuts_code').reset_index(name='frequency')

    #Fail with an informative message if the lookup does not cover all the codes
    missing = [code for code in nuts_group['nuts_code'] if code not in nuts_lookup]

    if missing:
        raise KeyError(f'NUTS codes missing from nuts_lookup: {missing}')

    #Get the names with the lookup
    nuts_group['nuts_name'] = [nuts_lookup[code] for code in nuts_group['nuts_code']]

    #Set the index
    nuts_group.set_index(['nuts_name','nuts_code'],inplace=True)

    #Name the variable
    nuts_group.columns = [variable]

    return(nuts_group)
    
    
    
    
    

In [None]:
#Count patent families per NUTS area for inventors and for applicants, and put both counts side by side
#We only use applications with a filing year after 2015
#NOTE(review): the text above talks about "the last two years" - confirm that this filter matches it
pat_nuts = pd.concat([count_patenting_in_nuts(p.loc[p['appln_filing_year']>2015],var,nuts_patstat_lookup) for var in ['inv_nuts','appl_nuts']],axis=1)

In [None]:
#Show the ten areas with the highest inv_nuts counts
pat_nuts.sort_values('inv_nuts',ascending=False).head(n=10)

Interesting: there are many more applications than inventors in London.

In [None]:
#Save the counts in the processed patents folder, with the value of today_str as a file name prefix
pat_nuts.to_csv(f'../../data/processed/patents/{today_str}_patent_nuts.csv')