# Trademarks

Here we collect open trademark data from the Intellectual Property Office. 

The data is available here: https://www.gov.uk/government/publications/ipo-trade-mark-data-release

We will undertake the following activities:

* Collect all the data.
* Enrich it with information about the product codes that the trademarks refer to
* Enrich it with information about its NUTS location (we keep this flexible as we will be using this code in multiple places)




## Preamble

In [None]:
# Run the shared notebook preamble.
# NOTE(review): presumably this is what brings os, pd, np, requests and today_str into scope -
# they are used below without being imported in this notebook. Confirm against the preamble file.
%run ../notebook_preamble.ipy

In [None]:
import re
import random
from zipfile import ZipFile
from io import BytesIO
import csv

In [None]:
#dirs

# Make sure the raw and processed trademark folders exist.
# exist_ok=True makes the cell safe to re-run, and makedirs creates any missing
# parent folders, so this also works when ../../data/raw or ../../data/processed
# do not exist yet.
os.makedirs('../../data/raw/trademarks',exist_ok=True)

os.makedirs('../../data/processed/trademarks',exist_ok=True)

In [None]:
# %load ../utilities.py
# Some utilities

import random

def make_data_dict(table,name,path,sample=5):
    '''
    Saves a blank data dictionary template for a dataframe.

    The template has one row per column of the table, with its name ('variable'),
    the type estimated by estimate_type ('type') and an empty 'description' to be
    filled in. It is written to `{today_str}_{name}.csv` inside `path`.

    Args:
        -table (df) is the df we want to create the data dictionary for
        -name (str) of the df, used in the output file name
        -path (str) is the folder where we want to save the file
        -sample (int) is the number of values per column used to estimate its type

    '''

    template = pd.DataFrame()

    #One row per column of the input table
    template['variable'] = table.columns

    template['type'] = [estimate_type(table[column],sample=sample) for column in table.columns]

    #Descriptions are left empty on purpose
    template['description'] = ['']*len(template)

    template.to_csv(os.path.join(path,f'{today_str}_{name}.csv'))
    

def estimate_type(variable,sample):
    '''
    Estimates the type of a column as the most frequent Python type found
    in a random sample of its values.

    Args:
        variable (iterable) with values
        sample (int) is the number of values to test. If it is larger than the
            number of values available, all of them are tested.

    Returns:
        The type that appears most often among the sampled values.

    Raises:
        ValueError if there is nothing to sample (no values, or sample < 1).

    '''

    values = list(variable)

    #random.sample raises when asked for more items than the population holds,
    #so cap the sample size at the number of values available
    n_tested = min(sample,len(values))

    if n_tested < 1:
        raise ValueError('estimate_type needs at least one value to sample')

    selection = random.sample(values,n_tested)

    types = pd.Series([type(x) for x in selection]).value_counts().sort_values(ascending=False)

    return(types.index[0])

## 1. Collect data

We collect the data from the IPO's open data site. This is a zip file.

#### Collect trademark open dataset

In [None]:
#Download and parse the data
trademark_link = 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/680986/opendatadomestic.zip'
trade_request = requests.get(trademark_link)

#Stop here on a failed download (4xx/5xx response) instead of handing an error page to ZipFile later
trade_request.raise_for_status()

In [None]:
#Extract the trademark table from the downloaded archive.
#The `path` argument of ZipFile.extract is a directory, so the file lands in
#<path>/OpenDataDomestic.txt and `tradem` holds the path of the extracted file.
#The with-block closes the archive once the extraction is done.
with ZipFile(BytesIO(trade_request.content)) as trademark_zip:
    tradem = trademark_zip.extract('OpenDataDomestic.txt',path=f'../../data/raw/trademarks/{today_str}_trademarks.txt')

In [None]:
#A small number (~20) of malformed lines are skipped silently here;
#the cause of the problem with them was not determined.
#NOTE(review): the path is pinned to the 12_11_2019 extract rather than to today's download - confirm this is intended.
#NOTE(review): error_bad_lines / warn_bad_lines are deprecated in recent pandas releases - check the installed version.

tradem_df = pd.read_csv(
    '../../data/raw/trademarks/12_11_2019_trademarks.txt/OpenDataDomestic.txt',
    sep='|',
    encoding='utf-16',
    error_bad_lines=False,
    warn_bad_lines=False)

In [None]:
# First rows of the trademark table, to check what it looks like after reading
tradem_df.head()

In [None]:
#Standardise the column names: lower case, with underscores in place of spaces
tradem_df.columns = [name.replace(' ','_').lower() for name in tradem_df.columns]

#Take the publication year from the date string (the part before the first '-').
#Plain string processing is faster than parsing datetimes; missing dates stay missing
tradem_df['year_published'] = [published if pd.isnull(published) else int(str(published).split('-')[0])
                               for published in tradem_df['published']]

#### Product - category lookup

We will use this lookup to identify trademarks with scientific Nice classes.

In [None]:
#Lookup between Nice classes and product categories. Its 'category' and 'class'
#columns are used further down to pick out the scientific classes
class_product_category_lookup = pd.read_csv('../../data/aux/12_11_2019_nice_class_to_category_lookup.csv')

## 2. Geocoding the trademarks

We are going to create a function that automatically geocodes the trademarks using a postcode-NUTS lookup. One challenge with this is that both postcodes and NUTS classifications change over time.



In [None]:
#Clean postcodes: strip surrounding whitespace and lower-case them.
#Missing values and the 'Not Available' placeholder become NaN
tradem_df['postcode'] = [np.nan if pd.isnull(code) or code=='Not Available' else code.strip().lower()
                         for code in tradem_df['postcode']]

In [None]:
#Keep the trademarks whose country is the United Kingdom and that have a postcode
tradem_uk = tradem_df[tradem_df['country']=='United Kingdom'].dropna(subset=['postcode'])

#How many are there?
len(tradem_uk)

These are the trademarks in the UK with postcodes. We can use them in subsequent analyses

In [None]:
#Link to the postcode lookup (NSPL) archive; it is passed to geo_trademark as path_to_nspl below
pc_url = 'https://www.arcgis.com/sharing/rest/content/items/19fac93960554b5e90840505bd73917f/data'

In [None]:
def geo_trademark(tradem_df,geography,nspl_file,lookup_file,geo_code,path_to_nspl):
    '''

    This function classifies trademarks into locations using a postcode lookup (NSPL).
    The matched table is then merged with a geo-code - geo-name lookup to get the geography names.

    Matching is done on the first part of the postcode (the text before the space), in lower case.
    Each first part is assigned the geography of the first NSPL row where it appears, so the
    location is an approximation when a postcode area spans several geographies.
    Both merges are inner joins: trademarks whose postcode or geography code is not found are dropped.

    Arguments:
        tradem_df (df) is the df with the trademark information. It needs to include a
            'postcode' column, in lower case, to match against the NSPL
        geography (str) is the name of the NSPL column with the geography we want to match
        nspl_file (str) is the name of the file with the nspl data (inside the 'Data' folder)
        lookup_file (str) is the name of the file with a lookup between geography codes and names
            (inside the 'Documents' folder)
        geo_code (str) is the name of the column in the lookup that holds the geography codes
        path_to_nspl (str) is either a http(s) link to the NSPL zip file, which is then downloaded,
            or the path of a local folder with the unzipped NSPL

    Returns:
        A df with the matched trademarks, the 'pcds_1st' key used for matching and the lookup
        columns (minus those ending in 'W'). The `geography` column itself is removed.

    '''

    #We only need the postcode and the geography of interest; reading just those
    #columns keeps the (large) NSPL table small in memory
    nspl_columns = ['pcds',geography]

    #Read the NSPL files

    if path_to_nspl.startswith(('http://','https://')):

        print('downloading nspl')

        #Download the file, stopping on a failed request rather than trying to unzip an error page
        nspl_request = requests.get(path_to_nspl)
        nspl_request.raise_for_status()

        #The with-blocks close the archive and its members once both tables have been read
        with ZipFile(BytesIO(nspl_request.content)) as nspl_zipfile:

            #Read the nspl
            with nspl_zipfile.open(f'Data/{nspl_file}') as nspl_handle:
                nspl = pd.read_csv(nspl_handle,usecols=nspl_columns)

            #Read the lookup
            with nspl_zipfile.open(f'Documents/{lookup_file}') as lookup_handle:
                lookup = pd.read_csv(lookup_handle)

    else:
        print('reading nspl')

        nspl = pd.read_csv(path_to_nspl+f'/Data/{nspl_file}',usecols=nspl_columns)

        lookup = pd.read_csv(path_to_nspl+f'/Documents/{lookup_file}')


    print('processing data')
    #Keep only the first part of each postcode (before the space) and make it lowercase
    nspl['pcds_1st'] = nspl['pcds'].str.split(' ').str[0].str.lower()

    #One row per first part, so that the merge does not duplicate trademarks
    postcode_geo = nspl.drop_duplicates('pcds_1st')[['pcds_1st',geography]]

    #Merge
    tradem_merged = pd.merge(tradem_df,postcode_geo,left_on='postcode',right_on='pcds_1st')

    #Merge with the lookup names
    #Remove the Welsh-language columns from the lookup (their names end in 'W')
    lookup = lookup[[x for x in lookup.columns if not x.endswith('W')]]

    tradem_w_names = pd.merge(tradem_merged,lookup,left_on=geography,right_on=geo_code)

    #Remove the geography variable as it has unstandardised names
    tradem_w_names = tradem_w_names.drop(columns=geography)

    return(tradem_w_names)

In [None]:
#Geocode the trademarks: the 'nuts' column of the NSPL file is matched to LAU219CD in the lookup,
#and the NSPL archive is downloaded from pc_url.
#NOTE(review): this passes the full tradem_df rather than the UK-only tradem_uk built earlier - confirm that is intended.
trademark_nuts = geo_trademark(tradem_df,geography='nuts',nspl_file='NSPL_AUG_2019_UK.csv',
                            lookup_file='LAU219_LAU119_NUTS18_MAY_2019_UK_LU.csv',
                            geo_code='LAU219CD',path_to_nspl = pc_url)

In [None]:
#First rows of the geocoded trademark table
trademark_nuts.head()

## 3. Processing

We will create a df with counts of registered trademarks published from 2010 onwards, and counts of trademarks in scientific and technological Nice classes. 

We identify what these are using the lookup we created in our project mapping innovation in Scotland, and which we loaded above.

In [None]:
#Filter to focus on recent & registered trademarks.
#.copy() gives an independent frame, so adding columns to it later does not
#trigger pandas' SettingWithCopyWarning
trademark_clean = trademark_nuts.loc[(trademark_nuts['status']=='Registered')&(trademark_nuts['year_published']>=2010)].copy()

In [None]:
#What are the scientific classes? Show the lookup rows whose category is 'scientific'
class_product_category_lookup.loc[class_product_category_lookup['category']=='scientific']

In [None]:
#List of the Nice classes that the lookup labels as 'scientific'
scientic_nice_classes = class_product_category_lookup.loc[class_product_category_lookup['category']=='scientific','class'].tolist()

In [None]:
#Does a trademark have at least one scientific category?
#trademark_clean['is_scientific'] = trademark_nuts[scientic_nice_classes].sum(axis=1)>0

#For now only class 42 is used as the scientific flag (the broader version above is disabled).
#The flag is computed from trademark_clean itself and added with assign(), which returns a new
#frame: no reliance on index alignment with the unfiltered table and no SettingWithCopyWarning
trademark_clean = trademark_clean.assign(is_scientific=trademark_clean['class42']>0)

In [None]:
#Number of trademarks flagged as scientific
trademark_clean['is_scientific'].sum()

### Group by NUTS to create aggregates

In [None]:
#For each area (name and code): the number of trademarks (group size, which comes out
#as a column labelled 0) next to the number of scientific trademarks
nuts_keys = ['NUTS218NM','NUTS218CD']

trademark_grouped = pd.concat(
    [trademark_clean.groupby(nuts_keys).size(),
     trademark_clean.groupby(nuts_keys)['is_scientific'].sum()],
    axis=1)

In [None]:
#Give the two count columns descriptive names (the group-size column is labelled 0 at this point)
trademark_grouped = trademark_grouped.rename(columns={0:'trademark_n','is_scientific':'scientific_trademark_n'})

In [None]:
#Share of the trademarks in each area that are scientific
trademark_grouped['scientific_trademark_share'] = trademark_grouped['scientific_trademark_n'].div(trademark_grouped['trademark_n'])

In [None]:
#Areas with the highest share of scientific trademarks
trademark_grouped.sort_values('scientific_trademark_share',ascending=False).head()

In [None]:
#Save the area-level trademark counts and shares; the file name is prefixed with today_str
trademark_grouped.to_csv(f'../../data/processed/trademarks/{today_str}_nuts_trademarks.csv')