1. Columns selected

['patent_number', 'uspc_mainclass_id', 'uspc_mainclass_title',
       'patent_date', 'patent_title', 'patent_year', 'app_number', 'app_type',
       'assignee_country', 'assignee_first_name', 'assignee_first_seen_date',
       'assignee_id', 'assignee_last_name', 'assignee_organization',
       'assignee_type', 'cited_patent_category', 'cited_patent_date',
       'cited_patent_kind', 'cited_patent_number', 'cited_patent_sequence',
       'cited_patent_title', 'citedby_patent_category', 'citedby_patent_date',
       'citedby_patent_kind', 'citedby_patent_number', 'citedby_patent_title',
       'inventor_country', 'inventor_first_name', 'inventor_id',
       'inventor_last_name', 'inventor_sequence', 'uspc_subclass_id',
       'uspc_subclass_title', 'wipo_field_id', 'wipo_field_title',
       'wipo_sector_title']

In [22]:
#SET UP
import pandas as pd
import numpy as np

columns = ['patent_number', 'uspc_mainclass_id', 'uspc_mainclass_title', 'patent_date', 'patent_title', 
                 'patent_year', 'app_number', 'app_type', 'assignee_country', 'assignee_first_name', 
                 'assignee_id', 'assignee_last_name', 'assignee_organization', 
                 'assignee_type', 'cited_patent_category', 'cited_patent_date', 'cited_patent_kind',
                 'cited_patent_number', 'cited_patent_sequence', 'cited_patent_title', 'citedby_patent_category', 
                 'citedby_patent_date', 'citedby_patent_kind', 'citedby_patent_number', 'citedby_patent_title', 
                 'inventor_country', 'inventor_first_name', 'inventor_id', 'inventor_last_name', 'inventor_sequence', 
                 'uspc_subclass_id', 'uspc_subclass_title', 'wipo_field_id', 'wipo_field_title', 'wipo_sector_title']

#Reading the datasets ordered by year of patent granted
df_2004 = pd.read_csv('patent2004.csv', low_memory=False)
df_2005 = pd.read_csv('patent2005.csv', low_memory=False)
df_2006 = pd.read_csv('patent2006.csv', low_memory=False)
df_2007 = pd.read_csv('patent2007.csv', low_memory=False)
df_2008 = pd.read_csv('patent2008.csv', low_memory=False)
df_2009 = pd.read_csv('patent2009.csv', low_memory=False)
df_2010= pd.read_csv('patent2010.csv', low_memory=False)
df_2011= pd.read_csv('patent2011.csv', low_memory=False)
df_2012= pd.read_csv('patent2012.csv', low_memory=False)
df_2013= pd.read_csv('patent2013.csv', low_memory=False)




In [182]:
#Secondary dataset for further processing
assignee = pd.read_csv('assignee_patents.csv',low_memory=False)


## Filtering the data

Selecting only the columns needed to create features


In [23]:
#Selecting the columns need to create the features
df2004 = df_2004[columns]
df2005 = df_2005[columns]
df2006 = df_2006[columns]
df2007 = df_2007[columns]
df2008 = df_2008[columns]
df2009 = df_2009[columns]
df2010 = df_2010[columns]
df2011 = df_2011[columns]
df2012= df_2012[columns]
df2013= df_2013[columns]

## Joining the datasets

In [24]:
frames = [df2004,df2005, df2006, df2007,df2008,
          df2009,df2010,df2011,df2012,df2013]
patentdf = pd.concat(frames)
print("Numero de fila en el dataset",patentdf.size)

Numero de fila en el dataset 32450250


## Preprocessing: Generating the features

### Input Features

#### Base of Scientific Knowledge

We measure the life cycle of a patent by calculating the mean age of all cited patents and also by calculating the number of backward citations by said patent.

Two features.

In [53]:
def bsk_processing(patent):
    #Grouping by patent
    patent_number = patent['patent_number'].unique()

    #Creating the returning DF
    ultimate_df = pd.DataFrame()

    for pnumber in patent_number:
        temp= patent.loc[(patent.patent_number==pnumber)]
        
        
        
        #Calculating the mean age of cited patents
        mask = (temp['cited_patent_date'] != 'None')
        temp_df_valid = temp[mask]

        if(temp_df_valid.size !=0):
            TCT = np.mean(pd.DatetimeIndex(temp_df_valid['cited_patent_date']).year).astype(int)
        else:
            TCT = 0
        
        #Calculating the number of cited patents for current patent
        mask = (temp['cited_patent_number'] != 'None')
        temp = temp[mask]
        if(temp_df_valid.size >0):
            base_knowledge = len(temp['cited_patent_number'].unique())
        else:
            base_knowledge = 0
        
        ultimate_df = ultimate_df.append({'patent_number' : pnumber,
                                          'PK':base_knowledge,
                                          'TCT':TCT
                                             },ignore_index=True)

    return ultimate_df

### Coverage of the Patent

To represent the coverage of a patent we create three features. The first one is the numerical value of the mainclass of the patent, then we take into consideration the total number of classes to which a patent belongs. The third feature represents the number of countries to which the patent belongs since important patents might the registered in multiple countries.

Four features.

In [47]:
def cvrg_processing(patent):
    #Grouping by patent
    patent_number = patent['patent_number'].unique()

    #Creating the returning DF
    ultimate_df = pd.DataFrame()

    for pnumber in patent_number:
        temp= patent.loc[(patent.patent_number==pnumber)]
        
        #Numerical value of the main field of the patent
        main_field=temp['uspc_mainclass_id'].value_counts()
        MF=int(main_field.idxmax())
        
        #Number of classes to which the patent belongs
        TS = len(temp['uspc_mainclass_id'].unique())
        #Number of subclasses to which the patent belongs
        TSC = len(temp['uspc_subclass_id'].unique())
        
        #Number of countries in which the patent has been registered
        countries=len(temp['assignee_country'].unique())
        
        ultimate_df = ultimate_df.append({'patent_number' : pnumber,
                                          'MF' : MF,
                                          'TS' : TS,
                                          'TSC' : TSC,
                                          'CS' : countries
                                          
                                             },ignore_index=True)

    return ultimate_df

### Development

We clasify the patents by calculating the number of inventors involved in the development of the final result, and by knowing through a binary value if the patent is a product of more than one assignee for collaboration purposes.

In [60]:
def dev_processing(patent):
    #Grouping by patent
    patent_number = patent['patent_number'].unique()

    #Creating the returning DF
    ultimate_df = pd.DataFrame()

    for pnumber in patent_number:
        temp= patent.loc[(patent.patent_number==pnumber)]
        
        #Number of inventors
        total_inventors = len(temp['inventor_id'].unique())
        
        #Binary value
        #describing if a patent was created in 
        #collaboration with multiple assignees
        assignee_total = len(temp['assignee_organization'].unique()) 
        col=int(assignee_total>1) #1 if true, 0 if false
        
        ultimate_df = ultimate_df.append({'patent_number' : pnumber,
                                          'INV' : total_inventors,
                                          'COL' : col
                                          
                                             },ignore_index=True)

    return ultimate_df

### Capabilities of Assignee

In [198]:
#Grouping by patent
patent_number = df2004['patent_number'].unique()

    #Creating the returning DF
ultimate_df = pd.DataFrame()
temp= df2004.loc[(df2004.patent_number==patent_number[2])]
        
mask = (df2004['assignee_id'] != 'None')
temp_df_valid = df2004[mask]


df2004.groupby(['patent_number']).count()

Unnamed: 0_level_0,uspc_mainclass_id,uspc_mainclass_title,patent_date,patent_title,patent_year,app_number,app_type,assignee_country,assignee_first_name,assignee_id,...,inventor_country,inventor_first_name,inventor_id,inventor_last_name,inventor_sequence,uspc_subclass_id,uspc_subclass_title,wipo_field_id,wipo_field_title,wipo_sector_title
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6672133,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
6672143,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
6673332,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
6673333,39,39,39,39,39,39,39,39,39,39,...,39,39,39,39,39,39,39,39,39,39
6673334,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RE38575,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
RE38604,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
RE38623,69,69,69,69,69,69,69,69,69,69,...,69,69,69,69,69,69,69,69,69,69
RE38628,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8


In [199]:
df2004.loc[(df2004.patent_number==6672133)]

Unnamed: 0,patent_number,uspc_mainclass_id,uspc_mainclass_title,patent_date,patent_title,patent_year,app_number,app_type,assignee_country,assignee_first_name,...,inventor_country,inventor_first_name,inventor_id,inventor_last_name,inventor_sequence,uspc_subclass_id,uspc_subclass_title,wipo_field_id,wipo_field_title,wipo_sector_title


In [None]:
def dev_processing(patent,assignee):
    #Grouping by patent
    patent_number = patent['patent_number'].unique()

    #Creating the returning DF
    ultimate_df = pd.DataFrame()

    for pnumber in patent_number:
        temp= patent.loc[(patent.patent_number==pnumber)]
        
        mask = (temp['assignee_id'] != 'None')
        temp_df_valid = temp[mask]
        
        ultimate_df = ultimate_df.append({'patent_number' : pnumber,
                                          'INV' : total_inventors,
                                          'COL' : col
                                          
                                             },ignore_index=True)

    return ultimate_df

### Output Features

Many studies have shown the correlation between the number of forwad citations and the emerginess of a patent. Now we need to identify a relation between the input features and the outputfeatures through machine learning, to identify the emerginess of a patent based on the input features. 

In [174]:
#We aim to classify the emerginess of a patent based on the possible number of forward citations.
#There are 4 categories

def __category(forward_citations):
    if forward_citations <2:
        level = 4
        return level
    if forward_citations in range(2,10):
        level = 3
        return level
    if forward_citations in range(10,14):
        level = 2
        return level
    if forward_citations > 14 :
        level = 1
        return level
    
def __namelevel(l):
    namelevel = 'Nivel ' + str(l)
    return namelevel

In [175]:
def output_processing(patent):
    #Grouping by patent
    patent_number =  patent['patent_number'].unique()

    #Creating the returning DF
    ultimate_df = pd.DataFrame()

    
    for pnumber in patent_number:
        temp= patent.loc[(patent.patent_number==pnumber)]
        
        #Number of forward citations over the next 3, 5 and 10 years
        mask_citedby = (temp['citedby_patent_date'] != 'None')
        temp_citedby = temp[mask_citedby]
        citedby = np.array(pd.to_datetime(temp_citedby['citedby_patent_date'].unique()).year)
        st_date= np.array(pd.to_datetime(temp_citedby['patent_date'].unique()).year +3)
        mt_date= st_date+2
        lt_date= mt_date+5
        
        FC3=sum(citedby<=st_date)
        FC5=sum(citedby<=mt_date)
        FC10=sum(citedby<=lt_date)
        cat_fc3 = __category(FC3)
        cat_fc5 = __category(FC5) 
        cat_fc10 = __category(FC10)
        name_fc3 = __namelevel(cat_fc3)
        name_fc5 = __namelevel(cat_fc5)
        name_fc10 = __namelevel(cat_fc10)
        ultimate_df = ultimate_df.append({'patent_number' : pnumber,
                                          'FC3':FC3,
                                          'FC5':FC5,
                                          'FC10':FC10,
                                          'Category FC3':cat_fc3,
                                          'Category FC5':cat_fc5,
                                          'Category FC10':cat_fc10,
                                          'Category (FC3)':name_fc3,
                                          'Category (FC5)':name_fc5,
                                          'Category (FC10)':name_fc10
                                             },ignore_index=True)
        frames = ['patent_number','FC3','FC5','FC10',
                  'Category FC3','Category FC5','Category FC10',
                 'Category (FC3)','Category (FC5)','Category (FC10)']
        
    return ultimate_df.reindex(columns=frames)


The processing is made by each year due to low_memory space to compute the entire dataframe in one execution.

In [54]:
patent2004_features = bsk_processing(df2004)

In [49]:
patent2004_cvrg = cvrg_processing(df2004)

In [61]:
patent2004_dev = dev_processing(df2004)

In [176]:
patent2004_out = output_processing(df2004)

In [200]:
p2004 = patent2004_features.set_index('patent_number').join(
    patent2004_cvrg.set_index('patent_number')).join(
    patent2004_dev.set_index('patent_number')).join(
    patent2004_out.set_index('patent_number'))

In [208]:
p2004.head()

Unnamed: 0_level_0,PK,TCT,CS,MF,TS,TSC,COL,INV,FC3,FC5,FC10,Category FC3,Category FC5,Category FC10,Category (FC3),Category (FC5),Category (FC10)
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6672133,4.0,2000.0,1.0,250.0,4.0,4.0,0.0,4.0,0.0,2.0,3.0,4.0,3,3,Nivel 4,Nivel 3,Nivel 3
6672143,4.0,1991.0,1.0,73.0,3.0,4.0,0.0,3.0,0.0,0.0,1.0,4.0,4,4,Nivel 4,Nivel 4,Nivel 4
6673332,4.0,1988.0,1.0,436.0,2.0,3.0,0.0,3.0,0.0,0.0,0.0,4.0,4,4,Nivel 4,Nivel 4,Nivel 4
6673333,38.0,1993.0,1.0,424.0,1.0,2.0,0.0,3.0,0.0,1.0,8.0,4.0,4,3,Nivel 4,Nivel 4,Nivel 3
6673334,11.0,1999.0,1.0,424.0,2.0,7.0,0.0,5.0,1.0,1.0,3.0,4.0,4,3,Nivel 4,Nivel 4,Nivel 3


In [201]:
patent2005_features = bsk_processing(df2005)

In [202]:
patent2005_cvrg = cvrg_processing(df2005)

In [203]:
patent2005_dev = dev_processing(df2005)

In [204]:
patent2005_out = output_processing(df2005)

In [206]:
p2005 = patent2005_features.set_index('patent_number').join(
    patent2005_cvrg.set_index('patent_number')).join(
    patent2005_dev.set_index('patent_number')).join(
    patent2005_out.set_index('patent_number'))

In [207]:
p2005.head()

Unnamed: 0_level_0,PK,TCT,CS,MF,TS,TSC,COL,INV,FC3,FC5,FC10,Category FC3,Category FC5,Category FC10,Category (FC3),Category (FC5),Category (FC10)
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6837925,0.0,0.0,1.0,106.0,2.0,6.0,1.0,2.0,0.0,1.0,4.0,4.0,4,3,Nivel 4,Nivel 4,Nivel 3
6838054,4.0,1995.0,1.0,422.0,6.0,11.0,0.0,1.0,1.0,1.0,1.0,4.0,4,4,Nivel 4,Nivel 4,Nivel 4
6838073,34.0,1993.0,1.0,424.0,3.0,9.0,1.0,2.0,3.0,7.0,28.0,3.0,3,1,Nivel 3,Nivel 3,Nivel 1
6838074,57.0,1997.0,1.0,424.0,3.0,6.0,0.0,1.0,2.0,2.0,4.0,3.0,3,3,Nivel 3,Nivel 3,Nivel 3
6838075,41.0,1997.0,1.0,128.0,4.0,7.0,0.0,9.0,0.0,0.0,7.0,4.0,4,3,Nivel 4,Nivel 4,Nivel 3


In [209]:
patent2006_features = bsk_processing(df2006)

In [212]:
patent2006_cvrg = cvrg_processing(df2006)

In [213]:
patent2006_dev = dev_processing(df2006)

In [214]:
patent2006_out = output_processing(df2006)

In [215]:
p2006 = patent2006_features.set_index('patent_number').join(
    patent2006_cvrg.set_index('patent_number')).join(
    patent2006_dev.set_index('patent_number')).join(
    patent2006_out.set_index('patent_number'))

In [216]:
p2006

Unnamed: 0_level_0,PK,TCT,CS,MF,TS,TSC,COL,INV,FC3,FC5,FC10,Category FC3,Category FC5,Category FC10,Category (FC3),Category (FC5),Category (FC10)
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6982075,0.0,0.0,1.0,514.0,2.0,2.0,0.0,1.0,0.0,0.0,1.0,4,4,4,Nivel 4,Nivel 4,Nivel 4
6982076,0.0,0.0,1.0,424.0,1.0,2.0,0.0,4.0,0.0,1.0,1.0,4,4,4,Nivel 4,Nivel 4,Nivel 4
6982077,16.0,1996.0,1.0,424.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,4,4,4,Nivel 4,Nivel 4,Nivel 4
6982078,1.0,2001.0,1.0,514.0,3.0,7.0,0.0,2.0,0.0,1.0,1.0,4,4,4,Nivel 4,Nivel 4,Nivel 4
6982079,11.0,1995.0,1.0,424.0,2.0,5.0,0.0,1.0,0.0,1.0,8.0,4,4,3,Nivel 4,Nivel 4,Nivel 3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RE39321,92.0,1990.0,1.0,424.0,3.0,24.0,0.0,7.0,2.0,11.0,41.0,3,2,1,Nivel 3,Nivel 2,Nivel 1
RE39337,1.0,1996.0,1.0,424.0,2.0,7.0,0.0,2.0,0.0,0.0,0.0,4,4,4,Nivel 4,Nivel 4,Nivel 4
RE39403,5.0,1988.0,1.0,424.0,3.0,3.0,0.0,3.0,0.0,0.0,4.0,4,4,3,Nivel 4,Nivel 4,Nivel 3
RE39433,0.0,0.0,1.0,424.0,4.0,7.0,0.0,3.0,0.0,0.0,0.0,4,4,4,Nivel 4,Nivel 4,Nivel 4


In [219]:
frames = [p2004,p2005,p2006]
final_patent_features =  pd.concat(frames)
final_patent_features.head()

Unnamed: 0_level_0,PK,TCT,CS,MF,TS,TSC,COL,INV,FC3,FC5,FC10,Category FC3,Category FC5,Category FC10,Category (FC3),Category (FC5),Category (FC10)
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6672133,4.0,2000.0,1.0,250.0,4.0,4.0,0.0,4.0,0.0,2.0,3.0,4,3,3,Nivel 4,Nivel 3,Nivel 3
6672143,4.0,1991.0,1.0,73.0,3.0,4.0,0.0,3.0,0.0,0.0,1.0,4,4,4,Nivel 4,Nivel 4,Nivel 4
6673332,4.0,1988.0,1.0,436.0,2.0,3.0,0.0,3.0,0.0,0.0,0.0,4,4,4,Nivel 4,Nivel 4,Nivel 4
6673333,38.0,1993.0,1.0,424.0,1.0,2.0,0.0,3.0,0.0,1.0,8.0,4,4,3,Nivel 4,Nivel 4,Nivel 3
6673334,11.0,1999.0,1.0,424.0,2.0,7.0,0.0,5.0,1.0,1.0,3.0,4,4,3,Nivel 4,Nivel 4,Nivel 3


## Machine Learning

In [217]:
from sklearn.ensemble import RandomForestClassifier

In [220]:
#Defining Training Set

In [36]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=256, n_jobs=-1, random_state=17)
rnd_clf.fit(X, y)

13505