In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
display(HTML("<style>.output_result { max-width:98% !important; }</style>"))


# Code

## Imports

In [2]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams, bigrams, trigrams
import statistics as stat
import seaborn as sns
import re
from nltk.stem.wordnet import WordNetLemmatizer
import plotly.colors as colors
import dataframe_image as dfi
import plotly.graph_objects as go
import plotly.subplots as pltsub

# Set default color palette
colors_plotly_default = colors.qualitative.Plotly

main_path_mac = '/Users/philippmetzger/Documents/GitHub/battery_patents/'
#main_path_ssd = '/Volumes/Samsung Portable SSD T3 Media/'
main_path_ssd = '/Volumes/T7/Julius SSD Fortsetzung/'

import sys
packages_path = main_path_mac+'/07 Packages'
sys.path.append(packages_path)

from helpers import (current_time_string,
                              image_saver,
                              country_labels_dict,
                              ctry_code_name_dict,
                              message,
                              numbers_dict)


## Read the whole dataset and reduce it to what we are interested in

In [3]:
# Read the whole dataset
dataset_name = 'data_batteries_2022-01-26_1852'

path = main_path_ssd+'Dataset saves/04 From 15 Nov 2021 (release of 2021 Autumn edition)/01 Preprocessed/03 final - technologies tagged/'+dataset_name+'.csv'

print('Loading data from:')
print(path)

data = pd.read_csv(path, delimiter = ";", low_memory = False, na_values=['', ' ', '  '], keep_default_na = False)

print('Number of rows:', len(data))

print('Distinct values in column "granted":', pd.unique(data['granted']))

# Reduce it to non active parts, electrodes, secondary cells, charging, redox flow, and Nickel-Hydrogen
a = (data['non_active_parts_electrodes_secondary_cells'] == 1)
b = (data['charging'] == 1)
c = (data['is_Redox flow'] == 1)
d = (data['is_Nickel–hydrogen'] == 1)

data_reduced = data[a | b | c | d].copy()
del data
data = data_reduced

# Futher reduce it to IPFs only
data_ipf = data[data['tag'] == 'IPF'].copy()
ipf_percentage = (len(set(data_ipf['docdb_family_id'])) / len(set(data['docdb_family_id']))) * 100
print('Percentage of IPFs in relation to all battery patent families:'+str(round(ipf_percentage, 2))+'%')
del data
data = data_ipf


Loading data from:
/Volumes/T7/Julius SSD Fortsetzung/Dataset saves/04 From 15 Nov 2021 (release of 2021 Autumn edition)/01 Preprocessed/03 final - technologies tagged/data_batteries_2022-01-26_1852.csv
Number of rows: 4086532
Distinct values in column "granted": ['N' 'Y']
Percentage of IPFs in relation to all battery patent families:19.41%


## Sort it by ['dobdb_family_id', 'earliest_publn_date']

In [4]:
data = data.sort_values(by = ['docdb_family_id', 'earliest_publn_date'])


## Reduce to years we are interested in

In [5]:
print(set(data['earliest_publn_year_this_family_id']))


{1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019}


In [6]:
data_reduced = data[data['earliest_publn_year_this_family_id'] >= 2000].copy()
del data
data = data_reduced


In [7]:
print(set(data['earliest_publn_year_this_family_id']))


{2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019}


In [8]:
only_granted = False


## In appln_abstract and appln_title: Replace NaNs with '  '

In [9]:
data['appln_abstract'].fillna('  ', inplace=True)
data['appln_title'].fillna('  ', inplace=True)


## Infer our time frame from data

In [10]:
year_begin = min(data['earliest_publn_year_this_family_id'])
year_end = max(data['earliest_publn_year_this_family_id'])

years = list(range(year_begin, year_end + 1))
print(years)


[2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]


## Of every family, keep only the last english, non-nan title and abstract

In [11]:
family_ids = pd.unique(data['docdb_family_id'])

# Create two dictionaries containing an empty set for each year
titles_dict = {}
abstracts_dict = {} 
for year in years:
    
    titles_dict[year] = set()
    abstracts_dict[year] = set()
    
for family_id in tqdm(family_ids):
    
    data_this_family_id = data[data['docdb_family_id']==family_id]
    
    earliest_publn_year_this_family_id = list(set(data_this_family_id['earliest_publn_year_this_family_id']))[0]
    
    
    # Get titles
    
    titles_this_family_id = list(pd.unique(data_this_family_id[
        data_this_family_id['appln_title_lg']=='en'
    ]['appln_title']))
    try:
        titles_this_family_id.remove('  ')
    except:
        pass
    
    try:
        last_title_this_family_id = titles_this_family_id[-1]
    except:
        pass
    
    titles_dict[earliest_publn_year_this_family_id].add(last_title_this_family_id)
    
    
    # Get abstracts
    
    abstracts_this_family_id = list(pd.unique(data_this_family_id[
        data_this_family_id['appln_abstract_lg']=='en'
    ]['appln_abstract']))
    try:
        abstracts_this_family_id.remove('  ')
    except:
        pass
    
    try:
        last_abstract_this_family_id = abstracts_this_family_id[-1]
    except:
        pass
    
    abstracts_dict[earliest_publn_year_this_family_id].add(last_abstract_this_family_id)
    

  0%|          | 0/92700 [00:00<?, ?it/s]

## For Bruno: Of every family, keep only the last english, non-nan title and abstract and also save the respective family ID and year

In [12]:
family_ids = pd.unique(data['docdb_family_id'])

# Create two dictionaries containing an empty set for each year
titles_dict_bruno = {}
abstracts_dict_bruno = {} 
for family_id in family_ids:
    
    titles_dict_bruno[family_id] = tuple()
    abstracts_dict_bruno[family_id] = tuple()
    
for family_id in tqdm(family_ids):
    
    data_this_family_id = data[data['docdb_family_id']==family_id]
    
    earliest_publn_year_this_family_id = list(set(data_this_family_id['earliest_publn_year_this_family_id']))[0]
    
    
    # Get titles
    
    titles_this_family_id = list(pd.unique(data_this_family_id[
        data_this_family_id['appln_title_lg']=='en'
    ]['appln_title']))
    try:
        titles_this_family_id.remove('  ')
    except:
        pass
    
    try:
        last_title_this_family_id = titles_this_family_id[-1]
    except:
        pass
    
    titles_dict_bruno[family_id] = (earliest_publn_year_this_family_id, last_title_this_family_id)
    
    
    # Get abstracts
    
    abstracts_this_family_id = list(pd.unique(data_this_family_id[
        data_this_family_id['appln_abstract_lg']=='en'
    ]['appln_abstract']))
    try:
        abstracts_this_family_id.remove('  ')
    except:
        pass
    
    try:
        last_abstract_this_family_id = abstracts_this_family_id[-1]
    except:
        pass
    
    abstracts_dict_bruno[family_id] = (earliest_publn_year_this_family_id, last_abstract_this_family_id)
    

  0%|          | 0/92700 [00:00<?, ?it/s]

In [13]:
# Unfold abstracts_dict_bruno into a dataframe

df_bruno = pd.DataFrame.from_dict(abstracts_dict_bruno, orient='index', columns = ['earliest_publn_year_this_family_id', 'most_recent_abstract_this_family_id'])


In [14]:
# Make the index (family IDs) a column

df_bruno.reset_index(inplace=True)
df_bruno = df_bruno.rename(columns = {'index': 'family_id'})


In [15]:
df_bruno


Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id
0,1574492,2015,An underwater vehicle includes an on board pow...
1,3511554,2000,"The method involves placing all loads (7,8,9,1..."
2,3613974,2002,The electrode for an electrochemical arrangeme...
3,3673165,2002,The invention describes a method of regulating...
4,3681483,2001,The invention relates to an essentially flat e...
...,...,...,...
92695,73455420,2019,The present invention provides a storage syste...
92696,73474213,2015,PROBLEM TO BE SOLVED: To provide a method allo...
92697,74557388,2015,"A surgical instrument can comprise a handle, a..."
92698,74844536,2004,"FIELD: electrical engineering, namely manufact..."


In [16]:
df_bruno.to_csv('ready_to_eat_bruno.csv', index = False)


In [17]:
# Unfold titles_dict_bruno into a dataframe

df_bruno_titles = pd.DataFrame.from_dict(titles_dict_bruno, orient='index', columns = ['earliest_publn_year_this_family_id', 'most_recent_title_this_family_id'])
df_bruno_titles.drop('earliest_publn_year_this_family_id', axis = 1, inplace = True)


In [18]:
df_bruno_titles


Unnamed: 0,most_recent_title_this_family_id
1574492,- Underwater vehicle comprising power storage ...
3511554,Method of controlling emergency power supply i...
3613974,Electrode for an electrochemical arrangement c...
3673165,Method for regulating an inverter system
3681483,MULTILAYER ELECTRODE
...,...
73455420,AUTOMATED STORAGE SYSTEM WITH A CONTAINER VEHI...
73474213,WIRELESS CHARGING UNIT AND COUPLER BASED DOCKI...
74557388,POWER MANAGEMENT CONTROL SYSTEM FOR SURGICAL I...
74844536,METHOD FOR CONTINUOUSLY MAKING ELECTRIC CURREN...


In [19]:
# Join df_bruno and df_bruno_titles
df_bruno = df_bruno.join(other = df_bruno_titles, on = 'family_id', how = 'inner')

In [20]:
df_bruno


Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id,most_recent_title_this_family_id
0,1574492,2015,An underwater vehicle includes an on board pow...,- Underwater vehicle comprising power storage ...
1,3511554,2000,"The method involves placing all loads (7,8,9,1...",Method of controlling emergency power supply i...
2,3613974,2002,The electrode for an electrochemical arrangeme...,Electrode for an electrochemical arrangement c...
3,3673165,2002,The invention describes a method of regulating...,Method for regulating an inverter system
4,3681483,2001,The invention relates to an essentially flat e...,MULTILAYER ELECTRODE
...,...,...,...,...
92695,73455420,2019,The present invention provides a storage syste...,AUTOMATED STORAGE SYSTEM WITH A CONTAINER VEHI...
92696,73474213,2015,PROBLEM TO BE SOLVED: To provide a method allo...,WIRELESS CHARGING UNIT AND COUPLER BASED DOCKI...
92697,74557388,2015,"A surgical instrument can comprise a handle, a...",POWER MANAGEMENT CONTROL SYSTEM FOR SURGICAL I...
92698,74844536,2004,"FIELD: electrical engineering, namely manufact...",METHOD FOR CONTINUOUSLY MAKING ELECTRIC CURREN...


## Get titles and abstracts counts for each year

In [21]:
titles_counts = []
for year in list(titles_dict):
    titles_counts.append(len(titles_dict[year]))

print(titles_counts)


[939, 1135, 1105, 1151, 1243, 1501, 1818, 2002, 2298, 2627, 3126, 4622, 5970, 6614, 7040, 6929, 6968, 7562, 8513, 9523]


In [22]:
abstracts_counts = []
for year in list(abstracts_dict):
    abstracts_counts.append(len(abstracts_dict[year]))

print(abstracts_counts)


[975, 1164, 1132, 1191, 1288, 1566, 1955, 2147, 2480, 2808, 3376, 5152, 6757, 7463, 7936, 7733, 7830, 8438, 9677, 11016]


In [23]:
sum(abstracts_counts)


92084

## Write counts in a dataframe and normalise them

In [24]:
# Read total yearly counts and add column 'normalised' 

#max_patent_count = total_yearly_counts_df['patent families count'].max()
#total_yearly_counts_df['patent families count normalised'] = total_yearly_counts_df['patent families count'] / max_patent_count

total_yearly_counts_df = pd.DataFrame()

total_yearly_counts_df['titles counts'] = titles_counts
#max_titles_count = total_yearly_counts_df['titles counts'].max()
#total_yearly_counts_df['titles count normalised'] = total_yearly_counts_df['titles counts'] / max_titles_count


total_yearly_counts_df['abstracts counts'] = abstracts_counts
#max_abstracts_count = total_yearly_counts_df['abstracts counts'].max()
#total_yearly_counts_df['abstracts count normalised'] = total_yearly_counts_df['abstracts counts'] / max_abstracts_count


total_yearly_counts_df


Unnamed: 0,titles counts,abstracts counts
0,939,975
1,1135,1164
2,1105,1132
3,1151,1191
4,1243,1288
5,1501,1566
6,1818,1955
7,2002,2147
8,2298,2480
9,2627,2808


## Define stopwords, contexts, equivalents, words to replace, and punctuation

In [25]:
stopwords_ = stopwords.words('english')
stopwords_.extend([
    'thereof', 'therefor', 'thereafter', 'thereby', 'wherein', 'utmost', 'whether',
    'without', 'within',
    'xo', 'e', 'etc', 'ab', 'b', 'c', 'pct', 'wo', 'pt', 'pts', 'wt', 'xii', 'xiii', 'ymyo', 'xmn', 'xiv', 'le', 'sub',
    'r', 'x', 'g', 'p', 'v', 'zfz', 'zsz', 'z', 'f',
    'positive', 'negative', 'left', 'right',
    'high', 'low',
    'less', 'les', 'more', 'least',
    'judging', 'preparing', 'producing', 'comprising', 'following', 'containing', 'including', 'using', 'consisting',
    'making',
    'one', 'two', 'never',
    'end',
    'almost', 'like', 'also',
    'especially', 'preferably', 'surely', 'nearly', 'previously', 'mainly',
    'involves', 'comprises', 'provides', 'relates', 'belongs', 'discloses', 'includes',
    'solved', 'expressed', 'specified', 'provided', 'selected', 'characterized', 'included', 'equipped',
    'decided', 'made', 'filed', 'used', 'formed', 'said',
    'provide', 'improve', 'prevent', 'obtain', 'reduce', 'enhance', 'increase', 'suppress', 'realize', 'use',
    'first', 'second',
    'simple', 'convenient',
    'whose',
    'according',
    'capable', 'preferable', 'desirable', 'good',
    'desirably',
    'kind',
    'date', 'temp', 'sec', 
    'jan', 'apr', 'may', 'jun', 'jul', 'nov', 'oct',
    'jp',
    'problem', 'drawing', 'figure', 'invention', 'model', 'publication', 'utility', 'preparation',
    'method', 'application', 'purpose', 'number',
    'new', 'novel',
    'excellent',
    'non',
    'top', 'bottom'
])


# Contexts in which first word should be kept
contexts_after = [
    ['positive', 'electrode'],
    ['negative', 'electrode'],
    ['positive', 'electrodes'],
    ['negative', 'electrodes'],
    ['positive', 'active', 'material'],
    ['negative', 'active', 'material'],
    ['non', 'aqueous'],
    ['non', 'sintered'],
    ['top', 'cap'],
    ['bottom', 'plate']
]


# Contexts in which second word should be kept
contexts_before = [
    ['lithium', 'containing']
]
    

treat_as_same = [
    [('method', 'manufacturing'), ('manufacturing', 'method')],
    [('storage', 'battery', 'alkaline'), ('alkaline', 'storage', 'battery')],
    [('battery', 'alkaline', 'storage'), ('alkaline', 'storage', 'battery')]
]


replace_words = {
    'soln': 'solution',
    'aq': 'aqueous',
    'nonaqueous': 'non-aqueous',
    'obtd': 'obtained',
    'hr': 'hour',
    'pub': 'publication',
    'compsn': 'composition',
    'contg': 'containing',
    'compd': 'compound',
    'mfg': 'manufacturing',
    'methodfor': 'method for',
    'al': 'aluminium',
    'aluminum': 'aluminium',
    'co': 'cobalt',
    'mn': 'manganese',
    'ni': 'nickel',
    'zr': 'zirconium',
    'cr': 'chromium',
    'ti': 'titanium',
    'li': 'lithium',
    'la': 'lanthanum',
    'ce': 'cerium',
    'fe': 'iron',
    'ltoreq':'less than or equal',
    'deg': 'degree'
}


# This is not used in this application after all:
punctuation = '!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~'


In [26]:
# Check if replace_words dictionary works as it should

item = ['negative', 'obtd', 'soln', 'nonaqueous', 'active', 'material']

item_replaced = []
for word in item:
    
    if word in list(replace_words):
        
        item_replaced.extend(replace_words[word].split())
        
    else:
        
        item_replaced.append(word)
    
item = item_replaced
    
item


['negative', 'obtained', 'solution', 'non-aqueous', 'active', 'material']

## Define a function for taking care of key phrases extraction and counting

In [27]:
def growing_keywords(n_gram_length, item_type):
    
    # Initialise lemmatizer
    lem = WordNetLemmatizer()
    
    # create string for identifying the right language column
    #item_type_lg = item_type+'_lg'
    
    # Initialise n grams list
    ngrams_lists = []

    # Loop over years
    for year in tqdm(years):

        # Initialise n grams list for this year
        ngrams_list_this_year = []

        # Get all  
        #items_year = list(set(data[(data[item_type_lg]=='en') & (data['earliest_publn_year_this_family_id']==year)][item_type]))
        
        # Get this year's titles / abstracts (depending which mode we're in)
        if item_type == 'appln_title':
            items_year = list(titles_dict[year])
        if item_type == 'appln_abstract':
            items_year = list(abstracts_dict[year])
                
        for item in items_year:
            
            # Make all lowercase
            item = item.lower()

            # Punctuation removal
            item = re.sub('[^a-zA-Z]', ' ', item) 
            #for x in punctuation:
            #    item = item.replace(x,' ')

            # Tokenise
            item = item.split()
                        
            #####
            
            # 12 Oct 2022: Fixing the issue with "non aqueous" and "aqueous"
            
            #item_new = []
            
            aqueous_count = item.count('aqueous')
            
            #if aqueous_count>0:
            #    print(aqueous_count)
                
            for aqueous_occurrence in list(range(aqueous_count)):
                
                item_new = []
                
                #print(aqueous_occurrence)
            
                if 'aqueous' in item:
                    
                    #print('here')

                    aqueous_index = item.index('aqueous')
                    
                    #print(aqueous_index)
                    #print(item[aqueous_index])
                    
                    if aqueous_index>0:
                        
                        #print(aqueous_index - 1)
                        #print(item[aqueous_index - 1])
                        
                        if item[aqueous_index - 1] == 'non':
                            
                            #print('here')
                            
                            #print(item)
                            #print(len(item))

                            #for i, word in enumerate(item):
                            for word_index in list(range(len(item))):
                                
                                #print((i, word))
                                
                                #print(word_index)
                                #print(item[word_index])

                                if (word_index != aqueous_index) & (word_index != aqueous_index - 1):

                                    item_new.append(item[word_index])

                                elif (word_index == (aqueous_index - 1)):

                                    item_new.append('non-aqueous')

                                else:
                                    pass

                            item_save = item
                            item = item_new
                
                #print(item_save)
                #print(item_new)
                #print()
                                
            #####
            
            # Replace certain words with others (according to replace_words dictionary defined above)
            item_replaced = []
            
            for word in item:

                if word in list(replace_words):
        
                    item_replaced.extend(replace_words[word].split())

                else:

                    item_replaced.append(word)

            item = item_replaced
            
            #####
            
            # Remove stopwords; but only if they are not in a context that indicates that they should be kept.
            # Such contexts are defined in cell above (contexts_after and contexts_before).
            
            item_without_stopwords = []

            # For debugging: List of stopwords that will be checked
            check_list = []
            # For debugging: List of stopwords that are actually removed (because they are not in a certain context)
            remove_list = []
                        
            for i, word in enumerate(item):
                
                remove = False

                if word in stopwords_:
 
                    check_list.append(word)

                    # Treat cases with context after the word or phrase
        
                    in_context_after = False

                    for j, context in enumerate(contexts_after):
                        
                        if i<(len(item)-(len(context)-1)):

                            try:

                                to_compare = []
                                
                                for k in range(len(context)):
                                    to_compare.append(item[i+k])
                                    
                                if to_compare == context:
                                    in_context_after = True
                                    
                                    if False:
                                        print('not removed due to context')
                                        print(to_compare)
                                        print(context)
                                        print()

                            except:
                                print(contexts_after)
                                print(item)
                                print(len(item))
                                print(i+k)
                                print(to_compare)
                                print(context)

                    if not in_context_after:

                        remove = True
                        
                    # Treat cases with context before the word or phrase
        
                    in_context_before = False

                    for j, context in enumerate(contexts_before):
                        
                        if i>(len(context)-1):
                                
                            try:

                                to_compare = []

                                for k in range(len(context)):
                                    
                                    to_compare.append(item[i+k-(len(context)-1)])

                                if to_compare == context:
                                    in_context_before = True
                                    
                                    if False:
                                        print('not removed due to context')
                                        print(to_compare)
                                        print(context)
                                        print()

                            except:
                                print(contexts_before)
                                print(item)
                                print(len(item))
                                print(i+k)
                                print(to_compare)
                                print(context)

                    if not in_context_before:

                        remove = True
                        
                if not remove:
                    
                    item_without_stopwords.append(word)
                    
                else:
                                            
                    remove_list.append(word)

            #####
            
            # Delete words that are a repetition of the word before
            
            # Always add the first word to next stage
            # If this list of words is empty, do nothing
            try:
                item_without_stopwords_and_repetitions = [item_without_stopwords[0]]
            except Exception as e:
                pass
                                    
            # Loop over the other words (the second and following) and add them to the next stage, if they are not 
            # a repetition of the word before
            for i in range(1, len(item_without_stopwords)):
                
                if (item_without_stopwords[i-1] != item_without_stopwords[i]):
                    
                    item_without_stopwords_and_repetitions.append(item_without_stopwords[i]) 
            
            #####
            
            # Lemmatisation
            item_without_stopwords_lemmatized = [lem.lemmatize(word) for word in item_without_stopwords_and_repetitions] 

            #####
            
            # Get ngrams
            ngrams_ = list(ngrams(item_without_stopwords_lemmatized, n_gram_length))
            
            # Treat certain pairs of n_grams as the same (defined in cell above (treat_as_same))
            for n_gram_treat_as_same in treat_as_same:
                
                while n_gram_treat_as_same[0] in ngrams_:
                    
                    ngrams_.remove(n_gram_treat_as_same[0])
                    ngrams_.append(n_gram_treat_as_same[1])

            # Add to list
            ngrams_list_this_year.extend(ngrams_)
            
        # For 3-grams: delete it if first word is equal to third word, e.g. battery pack battery
        if n_gram_length == 3:
            
            ngrams_list_this_year_reduced = []
            
            for item in ngrams_list_this_year:
                
                if not item[0] == item[2]:
                    
                    ngrams_list_this_year_reduced.append(item)
                    
            ngrams_list_this_year = ngrams_list_this_year_reduced

        ngrams_lists.append(ngrams_list_this_year)
        
    print('N-grams created')

    #####
    
    # Count n grams' appearances
    counter_list = []
    unique_keys = set()

    for list_ in ngrams_lists:

        counter = dict(Counter(list_).most_common())
        counter_list.append(counter)

        unique_keys = unique_keys.union(set(counter.keys()))
        
    print('N-grams counted')

    #####
    
    # NEW 17 Jan 2022: Delete all phrases that have at least one year where the counter is zero.
    # V2, same day: Delete all phrases that have more than 15 years where the counter is zero.
    if False:
        
        unique_keys_reduced = set()

        for key_ in unique_keys:

            has_zero = 0

            for counter in counter_list:

                if key_ not in counter:

                    has_zero += 1

            if has_zero > 15:

                for counter in counter_list:

                    try:
                        counter.pop(key_)
                    except Exception as pop_error:
                        pass
                        #print(type(pop_error))
                        #print(pop_error)
                        #print()

            else:

                unique_keys_reduced.add(key_)

        unique_keys = unique_keys_reduced
                
    #####
    
    # Create a count entry of 0 for n grams that is present in at least one year but not in other(s)
    for counter in counter_list:

        for key_ in unique_keys:

            if key_ not in counter:

                counter[key_] = 0

    #####      
    
    relative = False
    
    # NEW 10 Nov 2021: Increment all counts by 1 (in order to avoid division by 0 in growth calculation)
    # This is only necessary when using relative growth (see growth calcuation further down)
    if relative:
        
        for counter in counter_list:

            for key_ in counter:

                    counter[key_] = counter[key_] + 1
    
    def growing_keywords_sub(counter_list, unique_keys, scale):
    
        #####

        # NEW 10 Nov 2021: Scale by year's distinct title / abstract count
        # NEW 18 Jan 2022: Scale by year's distinct title / abstract count to make it "per 1000 titles / abstracts"

        if scale:

            if item_type == 'appln_title':
                #normalised_patent_counts = list(total_yearly_counts_df['titles count normalised'])
                patent_counts = list(total_yearly_counts_df['titles counts'])

            elif item_type == 'appln_abstract':
                #normalised_patent_counts = list(total_yearly_counts_df['abstracts count normalised'])
                patent_counts = list(total_yearly_counts_df['abstracts counts'])

            else:
                print('Item type not recognised')
                return

            for i, counter in enumerate(counter_list):

                #normalised_patent_count_this_year = normalised_patent_counts[i]
                patent_count_this_year = patent_counts[i]

                for key_ in counter:

                    #value_scaled = counter[key_] / normalised_patent_count_this_year
                    value_scaled = counter[key_] / patent_count_this_year * 1000
                    counter[key_] = value_scaled


        #####

        # Calculate increase over whole time span
        growth_dict = {}
        growth_dict_absolute = {}

        for key_ in unique_keys:

            growth_dict[key_] = counter_list[len(counter_list) - 1][key_] - counter_list[0][key_]

        print('Difference over whole timespan calculated')

        # Calculate sum of absolute differences year-over-year; absolute => Growing and shrinking are treated as the same
        for key_ in unique_keys:

            growth = []
            for i in range(len(counter_list)-1):

                if not relative:

                    # Growth as abs(x1 - x0)
                    growth.append(abs(counter_list[i+1][key_]-counter_list[i][key_]))

                else:

                    # Growth as abs(x1 / x0) - 1. Only works if x0 is not zero.
                    try:
                        growth.append(abs((counter_list[i+1][key_] / counter_list[i][key_]) - 1))
                    except:
                        print('error')

            try:
                growth_dict_absolute[key_] = sum(growth)
            except Exception as e2:
                print(type(e2))
                print(e2)

        print('Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated')

        #####
        #####

        # Prepare positive growth plot
        highest_growth = dict(sorted(growth_dict.items(), key=lambda x:x[1], reverse=True))
        top_30_growth = list(highest_growth)[:30]
        top_50_growth = list(highest_growth)[:50]

        growing_list = []
        growing_list.append(top_50_growth)

        #####

        counts_list = []

        #for key_ in top_30_growth:
        for key_ in top_50_growth:

            counts = []

            for counter in counter_list:

                counts.append(counter[key_])

            counts_list.append(counts)

        #####

        df_keyword_growth = pd.DataFrame(index=years)
        for i, ngram_ in enumerate(top_50_growth):

            ngram_string = ' '.join(ngram_)

            df_keyword_growth[ngram_string] = counts_list[i]

        df_keyword_growth = df_keyword_growth.transpose()

        # Round all values
        df_keyword_growth = df_keyword_growth.round()
        df_keyword_growth = df_keyword_growth.apply(pd.to_numeric, downcast='integer')
        
        growing_list.append(df_keyword_growth)

        #####

        cm = sns.light_palette((260, 75, 60), input="husl", as_cmap=True)

        plot_positive_growth = df_keyword_growth.style.background_gradient(cmap=cm, axis=1)
        
        # Display thousands with comma separation
        plot_positive_growth.format("{:,d}")

        growing_list.append(plot_positive_growth)

        print('Positive change plot created')

        #####
        #####

        # Prepare negative growth plot

        highest_growth = dict(sorted(growth_dict.items(), key=lambda x:x[1], reverse=False))
        top_30_growth = list(highest_growth)[:30]
        top_50_growth = list(highest_growth)[:50]

        shrinking_list = []
        shrinking_list.append(top_50_growth)

        #####

        counts_list = []

        for key_ in top_50_growth:

            counts = []

            for counter in counter_list:

                counts.append(counter[key_])

            counts_list.append(counts)

        #####

        df_keyword_growth = pd.DataFrame(index=years)
        for i, ngram_ in enumerate(top_50_growth):

            ngram_string = ' '.join(ngram_)

            df_keyword_growth[ngram_string] = counts_list[i]

        df_keyword_growth = df_keyword_growth.transpose()

        # Round all values
        df_keyword_growth = df_keyword_growth.round()
        df_keyword_growth = df_keyword_growth.apply(pd.to_numeric, downcast='integer')

        shrinking_list.append(df_keyword_growth)

        #####

        cm = sns.light_palette((260, 75, 60), input="husl", as_cmap=True)

        plot_negative_growth = df_keyword_growth.style.background_gradient(cmap=cm, axis=1)
        
        # Display thousands with comma separation
        plot_negative_growth.format("{:,d}")

        shrinking_list.append(plot_negative_growth)

        print('Negative change plot created')

        #####
        #####

        # Prepare absolute growth plot
        highest_growth = dict(sorted(growth_dict_absolute.items(), key=lambda x:x[1], reverse=True))
        top_30_growth = list(highest_growth)[:30]
        top_50_growth = list(highest_growth)[:50]

        absolute_growth_list = []
        absolute_growth_list.append(top_50_growth)

        #####

        counts_list = []

        #for key_ in top_30_growth:
        for key_ in top_50_growth:

            counts = []

            for counter in counter_list:

                counts.append(counter[key_])

            counts_list.append(counts)

        #####

        df_keyword_growth = pd.DataFrame(index=years)
        for i, ngram_ in enumerate(top_50_growth):

            ngram_string = ' '.join(ngram_)

            df_keyword_growth[ngram_string] = counts_list[i]

        df_keyword_growth = df_keyword_growth.transpose()

        # Round all values
        df_keyword_growth = df_keyword_growth.round()
        df_keyword_growth = df_keyword_growth.apply(pd.to_numeric, downcast='integer')

        absolute_growth_list.append(df_keyword_growth)        
        
        #####

        cm = sns.light_palette((260, 75, 60), input="husl", as_cmap=True)
        
        plot_absolute_growth = df_keyword_growth.style.background_gradient(cmap=cm, axis=1)
        
        # Display thousands with comma separation
        plot_absolute_growth.format("{:,d}")

        absolute_growth_list.append(plot_absolute_growth)

        print('Absolute change plot created')

        return growing_list, shrinking_list, absolute_growth_list
    
    growing_list, shrinking_list, absolute_growth_list = growing_keywords_sub(counter_list, unique_keys, False)
    growing_list_scaled, shrinking_list_scaled, absolute_growth_list_scaled = growing_keywords_sub(counter_list, unique_keys, True)
    
    return growing_list, shrinking_list, absolute_growth_list, growing_list_scaled, shrinking_list_scaled, absolute_growth_list_scaled


## Define a function for generating LaTeX code

In [28]:
#hier

In [29]:
def generate_latex_code(df):
    """This function takes a dataframe with n-gram counts as input and generates LaTeX code for creating a table with row-wise color gradients"""
    
    max_ = df.max(axis = 1)
    min_ = df.min(axis = 1)
    
    intensity = (df.subtract(min_, axis = 0)).divide((max_ - min_), axis = 0) * 100
    
    table_width = df.shape[1]
    table_height = df.shape[0]

    latex_code = '\\begin{tabularx}{\linewidth} {| >{\\raggedright\\arraybackslash}p{3.7cm}'

    for i in range(table_width):
        latex_code = latex_code+'| >{\\raggedleft\\arraybackslash}X '

    latex_code = latex_code+'| }\n'

    first_row = '\mc{} & '
    for year in years[:-1]:
        first_row = first_row + '\mc{'+str(year)+'} & '
    first_row = first_row + '\mc{'+str(years[-1])+'}'
    first_row = first_row+' \\\\'

    latex_code = latex_code+first_row+'\n\\hline\n\\hline'

    for i in range(table_height):
    #for i in range(3): # For testing purposes: Create only 3 rows

        intensity_this_phrase = list(intensity.loc[df.index[i]])

        this_row_code = df.index[i]+' & '
        for j in range(table_width):

            # Make text color white when cell color is darker
            if (intensity_this_phrase[j] >= 40):
                textcolor = 'white'
            else:
                textcolor = 'black'

            if j < (table_width - 1):
                this_row_code = this_row_code+'\\cellcolor{blue!'+str(intensity_this_phrase[j])+'!white}\\textcolor{'+textcolor+'}{'+str(df.iloc[i,j])+'} & '

            else:                        
                this_row_code = this_row_code+'\\cellcolor{blue!'+str(intensity_this_phrase[j])+'!white}\\textcolor{'+textcolor+'}{'+str(df.iloc[i,j])+'} \\\\'

        this_row_code = this_row_code+'\n\\hline'

        latex_code = latex_code+'\n'+this_row_code

    latex_code = latex_code+'\n\\end{tabularx}'

    print(latex_code)
    

## Two more definitions

In [30]:
# Define whether to use median or mean
#measure_function = stat.mean
#measure_function = stat.mean


# Results

## Titles

### Titles - unigrams

In [117]:
growing_list_title_1, shrinking_list_title_1, highest_abs_change_list_title_1, growing_list_title_1_scaled, shrinking_list_title_1_scaled, highest_abs_change_list_title_1_scaled = growing_keywords(
    1,
    'appln_title'
)


  0%|          | 0/20 [00:00<?, ?it/s]

N-grams created
N-grams counted
Difference over whole timespan calculated
Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated
Positive change plot created
Negative change plot created
Absolute change plot created
Difference over whole timespan calculated
Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated
Positive change plot created
Negative change plot created
Absolute change plot created


In [118]:
growing_list_title_1[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
battery,571,663,587,657,705,805,1033,1107,1312,1444,1725,2730,3428,3791,4108,3961,4082,4197,4939,5966
device,144,156,166,225,254,302,397,469,564,625,765,1067,1543,1889,1954,1800,1812,2027,2159,2559
system,88,127,128,150,137,186,236,320,423,513,630,1042,1334,1530,1528,1530,1508,1737,1925,2103
secondary,125,160,165,194,218,243,340,313,390,407,538,844,1104,1282,1375,1339,1379,1249,1477,1923
electrode,126,151,135,146,186,192,231,259,313,328,429,644,929,969,1092,1083,1011,1138,1283,1617
power,115,146,169,163,203,258,314,372,467,451,606,973,1342,1523,1773,1609,1631,1850,1702,1602
lithium,124,167,155,135,191,204,271,243,272,334,459,722,977,1054,1189,1237,1144,1026,1224,1601
vehicle,55,73,91,83,87,84,106,176,208,232,392,579,832,921,754,704,671,872,1142,1523
charging,76,108,103,100,114,144,132,173,225,293,379,497,725,894,823,822,921,1067,1208,1417
material,84,89,107,109,106,128,148,134,185,212,331,448,635,650,763,787,687,796,785,972


In [119]:
growing_list_title_1_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
system,94,112,116,130,110,124,130,160,184,195,202,225,223,231,217,221,216,230,226,221
device,153,137,150,195,204,201,218,234,245,238,245,231,258,286,278,260,260,268,254,269
vehicle,59,64,82,72,70,56,58,88,91,88,125,125,139,139,107,102,96,115,134,160
secondary,133,141,149,169,175,162,187,156,170,155,172,183,185,194,195,193,198,165,173,202
charging,81,95,93,87,92,96,73,86,98,112,121,108,121,135,117,119,132,141,142,149
module,9,18,12,14,13,19,31,19,37,28,29,39,47,42,50,47,48,53,60,63
energy,19,27,30,40,42,42,30,55,59,59,66,75,71,87,71,72,67,73,76,69
wireless,7,4,5,5,6,13,7,7,13,15,33,26,40,51,68,67,76,79,60,54
power,122,129,153,142,163,172,173,186,203,172,194,211,225,230,252,232,234,245,200,168
ion,26,27,24,29,27,38,50,37,42,38,61,71,81,83,87,95,88,70,71,71


In [120]:
shrinking_list_title_1[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
telephone,14,8,14,7,12,13,9,1,2,3,6,2,6,4,7,5,1,0,0,2
absorbing,11,8,1,0,1,2,1,10,2,2,2,1,4,3,4,4,7,4,2,4
alkaline,30,40,36,24,17,21,29,19,19,21,16,11,31,11,23,17,11,15,24,24
hydrogen,25,34,14,18,12,13,21,24,16,21,8,14,12,12,17,26,17,15,25,20
manganate,5,1,0,0,0,2,1,0,5,2,3,0,0,0,2,1,0,3,0,0
digital,8,1,4,3,4,1,5,2,3,0,5,10,4,5,3,9,9,8,5,3
sealed,17,16,8,6,9,7,16,13,12,13,29,17,23,29,33,22,23,11,18,12
ionically,4,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,2,0
polarizable,4,1,2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
cadmium,4,0,2,2,0,0,2,3,1,0,1,0,1,0,0,1,0,0,0,0


In [121]:
shrinking_list_title_1_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
cell,109,126,103,105,91,58,72,63,61,67,67,65,77,91,89,79,78,58,68,61
process,48,34,30,27,36,27,22,20,19,20,26,18,17,13,12,10,10,11,11,11
alloy,33,22,16,14,10,13,7,14,4,8,5,5,6,4,4,4,2,2,1,3
alkaline,32,35,33,21,14,14,16,9,8,8,5,2,5,2,3,2,2,2,3,3
circuit,62,69,72,57,64,67,67,67,56,69,46,50,36,38,39,39,34,32,40,33
portable,36,24,30,30,32,49,33,31,31,27,23,23,15,17,13,15,13,13,6,9
rechargeable,45,41,47,30,29,35,39,46,37,29,27,28,32,24,29,27,31,20,18,19
hydrogen,27,30,13,16,10,9,12,12,7,8,3,3,2,2,2,4,2,2,3,2
charger,43,35,35,54,45,37,34,38,36,37,28,27,21,26,20,23,23,17,18,21
polymer,32,22,25,22,21,19,13,10,8,7,6,8,6,7,7,9,10,9,11,11


In [122]:
highest_abs_change_list_title_1[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
battery,571,663,587,657,705,805,1033,1107,1312,1444,1725,2730,3428,3791,4108,3961,4082,4197,4939,5966
device,144,156,166,225,254,302,397,469,564,625,765,1067,1543,1889,1954,1800,1812,2027,2159,2559
power,115,146,169,163,203,258,314,372,467,451,606,973,1342,1523,1773,1609,1631,1850,1702,1602
secondary,125,160,165,194,218,243,340,313,390,407,538,844,1104,1282,1375,1339,1379,1249,1477,1923
system,88,127,128,150,137,186,236,320,423,513,630,1042,1334,1530,1528,1530,1508,1737,1925,2103
lithium,124,167,155,135,191,204,271,243,272,334,459,722,977,1054,1189,1237,1144,1026,1224,1601
vehicle,55,73,91,83,87,84,106,176,208,232,392,579,832,921,754,704,671,872,1142,1523
electrode,126,151,135,146,186,192,231,259,313,328,429,644,929,969,1092,1083,1011,1138,1283,1617
charging,76,108,103,100,114,144,132,173,225,293,379,497,725,894,823,822,921,1067,1208,1417
electric,42,67,51,62,57,72,121,123,144,178,255,425,578,678,486,494,441,494,600,726


In [123]:
highest_abs_change_list_title_1_scaled[2]

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
battery,608,584,531,571,567,536,568,553,571,550,552,591,574,573,584,572,586,555,580,626
power,122,129,153,142,163,172,173,186,203,172,194,211,225,230,252,232,234,245,200,168
lithium,132,147,140,117,154,136,149,121,118,127,147,156,164,159,169,179,164,136,144,168
device,153,137,150,195,204,201,218,234,245,238,245,231,258,286,278,260,260,268,254,269
secondary,133,141,149,169,175,162,187,156,170,155,172,183,185,194,195,193,198,165,173,202
vehicle,59,64,82,72,70,56,58,88,91,88,125,125,139,139,107,102,96,115,134,160
system,94,112,116,130,110,124,130,160,184,195,202,225,223,231,217,221,216,230,226,221
supply,61,50,68,46,72,85,70,84,80,57,63,71,65,57,62,54,58,56,54,44
cell,109,126,103,105,91,58,72,63,61,67,67,65,77,91,89,79,78,58,68,61
charging,81,95,93,87,92,96,73,86,98,112,121,108,121,135,117,119,132,141,142,149


### Titles - bigrams

In [124]:
growing_list_title_2, shrinking_list_title_2, highest_abs_change_list_title_2, growing_list_title_2_scaled, shrinking_list_title_2_scaled, highest_abs_change_list_title_2_scaled = growing_keywords(
    2,
    'appln_title'
)


  0%|          | 0/20 [00:00<?, ?it/s]

N-grams created
N-grams counted
Difference over whole timespan calculated
Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated
Positive change plot created
Negative change plot created
Absolute change plot created
Difference over whole timespan calculated
Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated
Positive change plot created
Negative change plot created
Absolute change plot created


In [125]:
growing_list_title_2[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
secondary battery,96,116,118,141,183,223,312,296,363,387,502,797,1036,1173,1256,1207,1255,1170,1393,1835
lithium ion,22,31,25,33,29,55,87,70,90,98,183,319,468,506,566,595,546,475,528,583
battery pack,27,43,23,37,54,49,89,110,106,124,154,235,302,326,297,308,288,373,443,524
lithium secondary,42,63,58,48,77,85,101,68,77,101,147,184,236,277,250,218,241,190,290,487
active material,33,38,49,47,41,44,59,63,98,102,156,213,329,308,380,362,317,366,347,462
battery module,5,12,6,1,6,6,39,17,36,33,52,92,128,143,188,182,183,202,280,374
electric vehicle,11,10,7,12,6,6,16,18,27,36,111,153,245,290,173,198,138,221,251,343
energy storage,0,7,13,15,16,19,21,37,48,50,63,116,159,258,196,225,199,261,299,332
power supply,46,46,69,47,77,114,108,144,170,126,176,262,343,321,384,335,357,385,401,372
storage device,7,6,8,9,13,16,18,29,53,47,68,119,202,288,301,271,246,277,265,329


In [126]:
growing_list_title_2_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
secondary battery,102,102,107,123,147,149,172,148,158,147,161,172,174,177,178,174,180,155,164,193
lithium ion,23,27,23,29,23,37,48,35,39,37,59,69,78,77,80,86,78,63,62,61
energy storage,0,6,12,13,13,13,12,18,21,19,20,25,27,39,28,32,29,35,35,35
battery module,5,11,5,1,5,4,21,8,16,13,17,20,21,22,27,26,26,27,33,39
storage device,7,5,7,8,10,11,10,14,23,18,22,26,34,44,43,39,35,37,31,35
battery pack,29,38,21,32,43,33,49,55,46,47,49,51,51,49,42,44,41,49,52,55
electric vehicle,12,9,6,10,5,4,9,9,12,14,36,33,41,44,25,29,20,29,29,36
ion battery,10,9,11,8,3,12,17,13,17,16,24,32,31,29,34,37,39,35,39,33
electrode active,9,15,11,19,19,16,14,19,17,21,25,29,30,23,23,29,28,31,25,31
solid state,0,3,0,2,0,1,1,2,2,1,2,4,6,8,8,6,13,17,14,22


In [127]:
shrinking_list_title_2[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
absorbing alloy,10,6,1,0,0,1,1,8,1,2,1,0,3,0,0,3,2,0,0,0
alloy electrode,12,7,4,1,1,0,3,6,0,4,2,3,3,2,2,1,2,0,0,3
hydrogen absorbing,10,7,1,0,0,1,1,8,1,2,1,0,3,0,0,3,2,0,0,1
process production,10,7,2,5,3,5,2,5,2,4,13,25,10,2,4,3,1,1,3,3
portable telephone,7,4,7,4,2,4,2,0,0,0,2,0,0,0,1,0,0,0,0,0
electronic machine,5,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
nickel hydroxide,6,3,3,3,3,1,3,0,1,1,0,1,2,1,4,0,1,2,1,1
lithium manganate,5,1,0,0,0,2,1,0,3,2,3,0,0,0,1,1,0,3,0,0
layer capacitor,7,11,8,1,7,7,11,15,2,5,2,7,4,3,4,5,3,1,3,2
polarizable electrode,4,1,2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [128]:
shrinking_list_title_2_scaled[2]

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
battery charger,27,19,17,23,14,13,15,13,15,13,11,7,5,5,6,4,4,2,4,3
electrochemical cell,28,26,13,13,19,11,12,12,15,6,7,6,10,11,10,8,6,7,7,6
storage battery,28,38,27,36,30,25,27,23,10,13,13,12,12,17,19,17,17,12,11,10
secondary cell,21,32,31,41,26,11,12,5,8,5,8,7,10,14,15,15,14,9,8,6
rechargeable lithium,20,17,15,5,7,11,4,9,8,6,7,10,11,6,12,8,9,5,4,5
lead acid,19,14,7,22,15,13,11,3,9,8,8,5,5,4,4,5,6,7,5,5
alloy electrode,13,6,4,1,1,0,2,3,0,2,1,1,1,0,0,0,0,0,0,0
charging battery,16,9,12,10,7,9,4,7,11,9,5,7,6,5,5,5,5,5,5,4
alkaline storage,12,21,16,12,8,7,8,5,2,3,3,2,3,0,2,1,0,0,0,1
absorbing alloy,11,5,1,0,0,1,1,4,0,1,0,0,1,0,0,0,0,0,0,0


In [129]:
highest_abs_change_list_title_2[2]

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
secondary battery,96,116,118,141,183,223,312,296,363,387,502,797,1036,1173,1256,1207,1255,1170,1393,1835
lithium ion,22,31,25,33,29,55,87,70,90,98,183,319,468,506,566,595,546,475,528,583
lithium secondary,42,63,58,48,77,85,101,68,77,101,147,184,236,277,250,218,241,190,290,487
electric vehicle,11,10,7,12,6,6,16,18,27,36,111,153,245,290,173,198,138,221,251,343
power supply,46,46,69,47,77,114,108,144,170,126,176,262,343,321,384,335,357,385,401,372
battery pack,27,43,23,37,54,49,89,110,106,124,154,235,302,326,297,308,288,373,443,524
active material,33,38,49,47,41,44,59,63,98,102,156,213,329,308,380,362,317,366,347,462
non-aqueous electrolyte,27,39,42,39,55,58,75,77,142,107,125,205,276,284,277,335,380,311,262,343
ion secondary,4,13,6,16,19,28,43,32,36,40,88,150,199,289,320,322,242,213,202,239
energy storage,0,7,13,15,16,19,21,37,48,50,63,116,159,258,196,225,199,261,299,332


In [130]:
highest_abs_change_list_title_2_scaled[2]

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
secondary battery,102,102,107,123,147,149,172,148,158,147,161,172,174,177,178,174,180,155,164,193
power supply,49,41,62,41,62,76,59,72,74,48,56,57,57,49,55,48,51,51,47,39
lithium secondary,45,56,52,42,62,57,56,34,34,38,47,40,40,42,36,31,35,25,34,51
lithium ion,23,27,23,29,23,37,48,35,39,37,59,69,78,77,80,86,78,63,62,61
non-aqueous electrolyte,29,34,38,34,44,39,41,38,62,41,40,44,46,43,39,48,55,41,31,36
battery pack,29,38,21,32,43,33,49,55,46,47,49,51,51,49,42,44,41,49,52,55
active material,35,33,44,41,33,29,32,31,43,39,50,46,55,47,54,52,45,48,41,49
electric vehicle,12,9,6,10,5,4,9,9,12,14,36,33,41,44,25,29,20,29,29,36
motor vehicle,9,8,17,5,18,8,9,11,9,7,12,15,19,20,16,10,14,10,18,27
ion secondary,4,11,5,14,15,19,24,16,16,15,28,32,33,44,45,46,35,28,24,25


### Titles - trigrams

In [131]:
growing_list_title_3, shrinking_list_title_3, highest_abs_change_list_title_3, growing_list_title_3_scaled, shrinking_list_title_3_scaled, highest_abs_change_list_title_3_scaled = growing_keywords(
    3,
    'appln_title'
)


  0%|          | 0/20 [00:00<?, ?it/s]

N-grams created
N-grams counted
Difference over whole timespan calculated
Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated
Positive change plot created
Negative change plot created
Absolute change plot created
Difference over whole timespan calculated
Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated
Positive change plot created
Negative change plot created
Absolute change plot created


In [132]:
growing_list_title_3[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
lithium secondary battery,35,49,47,41,68,80,95,63,66,99,141,179,229,265,248,209,233,187,281,483
lithium ion battery,9,10,12,9,4,18,31,27,39,43,75,145,185,185,221,225,248,235,291,280
electrode active material,8,16,11,21,22,21,25,35,40,54,72,124,169,147,154,192,186,231,199,272
ion secondary battery,3,10,6,11,18,23,42,32,36,38,86,142,184,271,286,283,220,195,194,230
lithium ion secondary,4,13,6,16,18,28,43,32,36,40,87,150,198,283,306,316,224,202,192,224
electrolyte secondary battery,14,16,19,19,34,39,47,52,99,82,93,132,164,178,198,252,259,200,181,228
secondary battery electrode,5,5,3,4,18,14,20,19,19,20,39,58,123,113,128,131,123,147,151,213
non-aqueous electrolyte secondary,20,23,25,29,41,41,48,53,99,80,96,137,170,187,216,263,275,206,171,225
secondary battery manufacturing,5,8,15,12,15,22,23,24,37,47,33,73,91,76,86,91,94,67,111,155
secondary battery lithium,3,5,7,10,13,15,25,10,16,25,43,46,64,96,117,101,92,76,98,147


In [133]:
growing_list_title_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
ion secondary battery,3,9,5,10,14,15,23,16,16,14,28,31,31,41,41,41,32,26,23,24
electrode active material,9,14,10,18,18,14,14,17,17,21,23,27,28,22,22,28,27,31,23,29
lithium ion battery,10,9,11,8,3,12,17,13,17,16,24,31,31,28,31,32,36,31,34,29
lithium ion secondary,4,11,5,14,14,19,24,16,16,15,28,32,33,43,43,46,32,27,23,24
secondary battery electrode,5,4,3,3,14,9,11,9,8,8,12,13,21,17,18,19,18,19,18,22
lithium secondary battery,37,43,43,36,55,53,52,31,29,38,45,39,38,40,35,30,33,25,33,51
energy storage device,0,3,5,3,9,7,4,7,7,8,8,9,10,17,11,13,12,13,12,13
secondary battery lithium,3,4,6,9,10,10,14,5,7,10,14,10,11,15,17,15,13,10,12,15
solid state battery,0,0,0,0,0,0,0,0,1,0,1,2,3,5,4,3,2,5,6,11
secondary battery manufacturing,5,7,14,10,12,15,13,12,16,18,11,16,15,11,12,13,13,9,13,16


In [134]:
shrinking_list_title_3[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
hydrogen absorbing alloy,10,6,1,0,0,1,1,8,1,2,1,0,3,0,0,3,2,0,0,0
absorbing alloy electrode,7,3,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0
alkaline storage battery,13,25,25,18,11,12,22,11,4,13,10,12,19,4,18,16,1,0,4,7
lithium secondary cell,7,14,10,7,9,4,5,4,11,2,5,5,7,12,1,8,8,2,9,1
double layer capacitor,7,11,6,1,7,7,10,15,2,5,2,7,4,3,4,5,3,1,3,2
sheet battery case,4,2,1,2,0,4,0,0,2,2,1,0,2,2,1,0,0,0,1,0
hydrogen storage alloy,8,7,6,2,2,6,3,7,2,9,3,7,3,2,1,3,3,0,1,4
charge storage device,4,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,3,2,0
anode rechargeable lithium,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
electrode alkaline secondary,4,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,4,1


In [135]:
shrinking_list_title_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
alkaline storage battery,14,22,23,16,9,8,12,5,2,5,3,3,3,1,3,2,0,0,0,1
rechargeable lithium battery,17,14,11,3,7,9,3,9,7,5,5,9,10,5,11,8,8,5,4,5
hydrogen absorbing alloy,11,5,1,0,0,1,1,4,0,1,0,0,1,0,0,0,0,0,0,0
lead acid battery,14,12,6,21,10,11,7,1,7,6,6,3,3,3,3,3,6,6,4,4
hydrogen storage alloy,9,6,5,2,2,4,2,3,1,3,1,2,1,0,0,0,0,0,0,0
electrolyte secondary cell,9,8,7,8,7,2,2,1,1,0,2,2,1,4,3,3,3,2,1,1
absorbing alloy electrode,7,3,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
lithium secondary cell,7,12,9,6,7,3,3,2,5,1,2,1,1,2,0,1,1,0,1,0
double layer capacitor,7,10,5,1,6,5,6,7,1,2,1,2,1,0,1,1,0,0,0,0
material rechargeable lithium,9,4,5,1,2,1,0,3,2,2,2,3,3,1,2,2,2,1,1,1


In [136]:
highest_abs_change_list_title_3[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
lithium secondary battery,35,49,47,41,68,80,95,63,66,99,141,179,229,265,248,209,233,187,281,483
lithium ion secondary,4,13,6,16,18,28,43,32,36,40,87,150,198,283,306,316,224,202,192,224
non-aqueous electrolyte secondary,20,23,25,29,41,41,48,53,99,80,96,137,170,187,216,263,275,206,171,225
ion secondary battery,3,10,6,11,18,23,42,32,36,38,86,142,184,271,286,283,220,195,194,230
electrolyte secondary battery,14,16,19,19,34,39,47,52,99,82,93,132,164,178,198,252,259,200,181,228
electrode active material,8,16,11,21,22,21,25,35,40,54,72,124,169,147,154,192,186,231,199,272
lithium ion battery,9,10,12,9,4,18,31,27,39,43,75,145,185,185,221,225,248,235,291,280
secondary battery manufacturing,5,8,15,12,15,22,23,24,37,47,33,73,91,76,86,91,94,67,111,155
secondary battery electrode,5,5,3,4,18,14,20,19,19,20,39,58,123,113,128,131,123,147,151,213
secondary battery lithium,3,5,7,10,13,15,25,10,16,25,43,46,64,96,117,101,92,76,98,147


In [137]:
highest_abs_change_list_title_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
lithium secondary battery,37,43,43,36,55,53,52,31,29,38,45,39,38,40,35,30,33,25,33,51
lithium ion secondary,4,11,5,14,14,19,24,16,16,15,28,32,33,43,43,46,32,27,23,24
non-aqueous electrolyte secondary,21,20,23,25,33,27,26,26,43,30,31,30,28,28,31,38,39,27,20,24
ion secondary battery,3,9,5,10,14,15,23,16,16,14,28,31,31,41,41,41,32,26,23,24
electrolyte secondary battery,15,14,17,17,27,26,26,26,43,31,30,29,27,27,28,36,37,26,21,24
lithium ion battery,10,9,11,8,3,12,17,13,17,16,24,31,31,28,31,32,36,31,34,29
electrode active material,9,14,10,18,18,14,14,17,17,21,23,27,28,22,22,28,27,31,23,29
non-aqueous secondary battery,6,1,2,2,9,3,5,4,13,6,7,4,8,8,5,8,9,10,15,9
material lithium secondary,11,13,14,9,18,19,12,6,6,10,14,9,10,10,10,7,8,5,5,10
lead acid battery,14,12,6,21,10,11,7,1,7,6,6,3,3,3,3,3,6,6,4,4


## Abstracts

### Abstracts - unigrams

In [None]:
growing_list_abstract_1, shrinking_list_abstract_1, highest_abs_change_list_abstract_1, growing_list_abstract_1_scaled, shrinking_list_abstract_1_scaled, highest_abs_change_list_abstract_1_scaled = growing_keywords(
    1,
    'appln_abstract'
)


  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
growing_list_abstract_1[2]


In [None]:
growing_list_abstract_1_scaled[2]

In [None]:
shrinking_list_abstract_1[2]


In [None]:
shrinking_list_abstract_1_scaled[2]


In [None]:
highest_abs_change_list_abstract_1[2]


In [None]:
highest_abs_change_list_abstract_1_scaled[2]


### Abstracts - bigrams

In [None]:
growing_list_abstract_2, shrinking_list_abstract_2, highest_abs_change_list_abstract_2, growing_list_abstract_2_scaled, shrinking_list_abstract_2_scaled, highest_abs_change_list_abstract_2_scaled = growing_keywords(
    2,
    'appln_abstract'
)


In [None]:
growing_list_abstract_2[2]


In [None]:
growing_list_abstract_2_scaled[2]

In [None]:
shrinking_list_abstract_2[2]


In [None]:
shrinking_list_abstract_2_scaled[2]


In [None]:
highest_abs_change_list_abstract_2[2]


In [None]:
highest_abs_change_list_abstract_2_scaled[2]


### Abstracts - trigrams

In [30]:
growing_list_abstract_3, shrinking_list_abstract_3, highest_abs_change_list_abstract_3, growing_list_abstract_3_scaled, shrinking_list_abstract_3_scaled, highest_abs_change_list_abstract_3_scaled = growing_keywords(
    3,
    'appln_abstract'
)


  0%|          | 0/20 [00:00<?, ?it/s]

N-grams created
N-grams counted
Difference over whole timespan calculated
Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated
Positive change plot created
Negative change plot created
Absolute change plot created
Difference over whole timespan calculated
Sum of absolute differences (abs(count_year_i+1 - count_year_i)) calculated
Positive change plot created
Negative change plot created
Absolute change plot created


In [40]:
# Before fixing the 'aqueous' issue
growing_list_abstract_3[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,89,82,72,121,136,157,241,278,294,356,427,991,1253,1288,1323,1386,1273,1308,1382,1990
active material layer,17,9,29,27,37,96,114,143,134,188,186,364,419,553,499,383,336,510,619,883
lithium secondary battery,46,91,72,65,112,128,192,140,124,178,222,340,451,487,490,410,492,374,540,844
lithium ion battery,15,26,41,31,16,49,58,83,99,121,197,321,476,421,449,520,618,568,702,685
energy storage device,5,41,27,39,33,20,44,70,109,94,147,183,224,367,367,277,370,538,576,648
secondary battery electrode,29,27,21,25,55,82,113,109,95,122,146,286,417,422,384,400,419,441,435,593
electrode current collector,7,7,10,12,15,15,24,46,57,104,96,135,206,171,155,172,215,226,412,539
power storage device,11,3,7,10,2,8,4,30,69,37,183,166,317,310,361,311,311,520,293,394
electrolyte secondary battery,25,27,41,36,65,74,110,118,180,129,113,214,301,269,437,425,412,286,264,405
aqueous electrolyte secondary,24,38,45,46,67,75,111,116,182,135,121,216,307,289,456,427,421,284,248,390


In [277]:
# After attempting to fix the 'aqueous' issue
growing_list_abstract_3[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,89,82,72,121,136,157,241,278,294,356,427,991,1253,1288,1323,1386,1273,1308,1382,1990
active material layer,17,9,29,27,37,96,114,143,134,188,186,364,419,553,499,383,336,510,619,883
lithium secondary battery,46,91,72,65,112,128,192,140,124,178,222,340,451,487,490,410,492,374,540,844
lithium ion battery,15,26,41,31,16,49,58,83,99,121,197,321,476,421,449,520,618,568,702,685
energy storage device,5,41,27,39,33,20,44,70,109,94,147,183,224,367,367,277,370,538,576,648
secondary battery electrode,29,27,21,25,55,82,113,109,95,122,146,286,417,422,384,400,419,441,435,593
electrode current collector,7,7,10,12,15,15,24,46,57,104,96,135,206,171,155,172,215,226,412,539
power storage device,11,3,7,10,2,8,4,30,69,37,183,166,317,310,361,311,311,520,293,394
electrolyte secondary battery,25,27,41,36,65,74,110,118,180,129,113,214,301,269,437,425,412,286,264,405
plurality battery cell,3,1,3,1,7,9,10,28,26,29,32,135,175,229,177,206,212,232,314,351


In [356]:
# After actully fixing the 'aqueous' issue
growing_list_abstract_3[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,89,82,72,121,136,157,241,278,294,356,427,991,1253,1288,1323,1386,1273,1308,1382,1990
active material layer,17,9,29,27,37,96,114,143,134,188,186,364,419,553,499,383,336,510,619,883
lithium secondary battery,46,91,72,65,112,128,192,140,124,178,222,340,451,487,490,410,492,374,540,844
lithium ion battery,15,26,41,31,16,49,58,83,99,121,197,321,476,421,449,520,618,568,702,685
energy storage device,5,41,27,39,33,20,44,70,109,94,147,183,224,367,367,277,370,538,576,648
secondary battery electrode,29,27,21,25,55,82,113,109,95,122,146,286,417,422,384,400,419,441,435,593
electrode current collector,7,7,10,12,15,15,24,46,57,104,96,135,206,171,155,172,215,226,412,539
power storage device,11,3,7,10,2,8,4,30,69,37,183,166,317,310,361,311,311,520,293,394
electrolyte secondary battery,25,27,41,36,65,74,110,118,180,129,113,214,301,269,437,425,412,286,264,405
non-aqueous electrolyte secondary,24,38,45,46,67,75,109,116,182,135,121,216,307,288,447,425,419,284,248,384


In [357]:
# Generate LaTeX code
generate_latex_code(growing_list_abstract_3[1])


\begin{tabularx}{\linewidth} {| >{\raggedright\arraybackslash}p{3.7cm}| >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | }
\mc{} & \mc{2000} & \mc{2001} & \mc{2002} & \mc{2003} & \mc{2004} & \mc{2005} & \mc{2006} & \mc{2007} & \mc{2008} & \mc{2009} & \mc{2010} & \mc{2011} & \mc{2012} & \mc{2013} & \mc{2014} & \mc{2015} & \mc{2016} & \mc{2017} & \mc{2018} & \mc{2019} \\
\hline
\hline
ele

In [358]:
# Export as PNG
#dfi.export(growing_list_abstract_3[2], 'growing_list_abstract_3.png')


In [43]:
# Before fixing the 'aqueous' issue
growing_list_abstract_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,91,70,64,102,106,100,123,129,119,127,126,192,185,173,167,179,163,155,143,181
active material layer,17,8,26,23,29,61,58,67,54,67,55,71,62,74,63,50,43,60,64,80
energy storage device,5,35,24,33,26,13,23,33,44,33,44,36,33,49,46,36,47,64,60,59
lithium ion battery,15,22,36,26,12,31,30,39,40,43,58,62,70,56,57,67,79,67,73,62
electrode current collector,7,6,9,10,12,10,12,21,23,37,28,26,30,23,20,22,27,27,43,49
lithium secondary battery,47,78,64,55,87,82,98,65,50,63,66,66,67,65,62,53,63,44,56,77
plurality battery cell,3,1,3,1,5,6,5,13,10,10,9,26,26,31,22,27,27,27,32,32
power storage device,11,3,6,8,2,5,2,14,28,13,54,32,47,42,45,40,40,62,30,36
current collector electrode,5,7,5,3,6,7,6,13,15,16,17,25,17,15,17,13,17,16,22,29
secondary battery electrode,30,23,19,21,43,52,58,51,38,43,43,56,62,57,48,52,54,52,45,54


In [280]:
# After attempting to fix the 'aqueous' issue
growing_list_abstract_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,91,70,64,102,106,100,123,129,119,127,126,192,185,173,167,179,163,155,143,181
active material layer,17,8,26,23,29,61,58,67,54,67,55,71,62,74,63,50,43,60,64,80
energy storage device,5,35,24,33,26,13,23,33,44,33,44,36,33,49,46,36,47,64,60,59
lithium ion battery,15,22,36,26,12,31,30,39,40,43,58,62,70,56,57,67,79,67,73,62
electrode current collector,7,6,9,10,12,10,12,21,23,37,28,26,30,23,20,22,27,27,43,49
lithium secondary battery,47,78,64,55,87,82,98,65,50,63,66,66,67,65,62,53,63,44,56,77
plurality battery cell,3,1,3,1,5,6,5,13,10,10,9,26,26,31,22,27,27,27,32,32
power storage device,11,3,6,8,2,5,2,14,28,13,54,32,47,42,45,40,40,62,30,36
current collector electrode,5,7,5,3,6,7,6,13,15,16,17,25,17,15,17,13,17,16,22,29
secondary battery electrode,30,23,19,21,43,52,58,51,38,43,43,56,62,57,48,52,54,52,45,54


In [359]:
# After actully fixing the 'aqueous' issue
growing_list_abstract_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,91,70,64,102,106,100,123,129,119,127,126,192,185,173,167,179,163,155,143,181
active material layer,17,8,26,23,29,61,58,67,54,67,55,71,62,74,63,50,43,60,64,80
energy storage device,5,35,24,33,26,13,23,33,44,33,44,36,33,49,46,36,47,64,60,59
lithium ion battery,15,22,36,26,12,31,30,39,40,43,58,62,70,56,57,67,79,67,73,62
electrode current collector,7,6,9,10,12,10,12,21,23,37,28,26,30,23,20,22,27,27,43,49
lithium secondary battery,47,78,64,55,87,82,98,65,50,63,66,66,67,65,62,53,63,44,56,77
plurality battery cell,3,1,3,1,5,6,5,13,10,10,9,26,26,31,22,27,27,27,32,32
power storage device,11,3,6,8,2,5,2,14,28,13,54,32,47,42,45,40,40,62,30,36
current collector electrode,5,7,5,3,6,7,6,13,15,16,17,25,17,15,17,13,17,16,22,29
secondary battery electrode,30,23,19,21,43,52,58,51,38,43,43,56,62,57,48,52,54,52,45,54


In [31]:
# Making letters in beginning of string capitals
growing_list_abstract_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,91,70,64,102,106,100,123,129,119,127,126,192,185,173,167,179,163,155,143,181
active material layer,17,8,26,23,29,61,58,67,54,67,55,71,62,74,63,50,43,60,64,80
energy storage device,5,35,24,33,26,13,23,33,44,33,44,36,33,49,46,36,47,64,60,59
lithium ion battery,15,22,36,26,12,31,30,39,40,43,58,62,70,56,57,67,79,67,73,62
electrode current collector,7,6,9,10,12,10,12,21,23,37,28,26,30,23,20,22,27,27,43,49
lithium secondary battery,47,78,64,55,87,82,98,65,50,63,66,66,67,65,62,53,63,44,56,77
plurality battery cell,3,1,3,1,5,6,5,13,10,10,9,26,26,31,22,27,27,27,32,32
power storage device,11,3,6,8,2,5,2,14,28,13,54,32,47,42,45,40,40,62,30,36
current collector electrode,5,7,5,3,6,7,6,13,15,16,17,25,17,15,17,13,17,16,22,29
secondary battery electrode,30,23,19,21,43,52,58,51,38,43,43,56,62,57,48,52,54,52,45,54


In [44]:
growing_list_abstract_3_scaled[1]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,91,70,64,102,106,100,123,129,119,127,126,192,185,173,167,179,163,155,143,181
active material layer,17,8,26,23,29,61,58,67,54,67,55,71,62,74,63,50,43,60,64,80
energy storage device,5,35,24,33,26,13,23,33,44,33,44,36,33,49,46,36,47,64,60,59
lithium ion battery,15,22,36,26,12,31,30,39,40,43,58,62,70,56,57,67,79,67,73,62
electrode current collector,7,6,9,10,12,10,12,21,23,37,28,26,30,23,20,22,27,27,43,49
lithium secondary battery,47,78,64,55,87,82,98,65,50,63,66,66,67,65,62,53,63,44,56,77
plurality battery cell,3,1,3,1,5,6,5,13,10,10,9,26,26,31,22,27,27,27,32,32
power storage device,11,3,6,8,2,5,2,14,28,13,54,32,47,42,45,40,40,62,30,36
current collector electrode,5,7,5,3,6,7,6,13,15,16,17,25,17,15,17,13,17,16,22,29
secondary battery electrode,30,23,19,21,43,52,58,51,38,43,43,56,62,57,48,52,54,52,45,54


In [46]:
abstract_intensities_strings = list(growing_list_abstract_3_scaled[1].index)

newindex = []

for item in abstract_intensities_strings:
    
    newstring_firstpart = item[0].capitalize()
    newstring_secondpart = item[1:]

    newstring = newstring_firstpart + newstring_secondpart
    
    newindex.append(newstring)
    
#newindex


In [67]:
growing_list_abstract_3_scaled[1].drop('newindex', axis = 1, inplace=True)


In [68]:
growing_list_abstract_3_scaled[1].drop('', axis = 1, inplace=True)


KeyError: "[''] not found in axis"

In [69]:
growing_list_abstract_3_scaled[1]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,91,70,64,102,106,100,123,129,119,127,126,192,185,173,167,179,163,155,143,181
active material layer,17,8,26,23,29,61,58,67,54,67,55,71,62,74,63,50,43,60,64,80
energy storage device,5,35,24,33,26,13,23,33,44,33,44,36,33,49,46,36,47,64,60,59
lithium ion battery,15,22,36,26,12,31,30,39,40,43,58,62,70,56,57,67,79,67,73,62
electrode current collector,7,6,9,10,12,10,12,21,23,37,28,26,30,23,20,22,27,27,43,49
lithium secondary battery,47,78,64,55,87,82,98,65,50,63,66,66,67,65,62,53,63,44,56,77
plurality battery cell,3,1,3,1,5,6,5,13,10,10,9,26,26,31,22,27,27,27,32,32
power storage device,11,3,6,8,2,5,2,14,28,13,54,32,47,42,45,40,40,62,30,36
current collector electrode,5,7,5,3,6,7,6,13,15,16,17,25,17,15,17,13,17,16,22,29
secondary battery electrode,30,23,19,21,43,52,58,51,38,43,43,56,62,57,48,52,54,52,45,54


In [70]:
growing_list_abstract_3_scaled[1][''] = newindex

abstract_intensities_new_df = growing_list_abstract_3_scaled[1].set_index('')

abstract_intensities_new_df

abstract_intensities_new_df


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
,,,,,,,,,,,,,,,,,,,,
Electrode active material,91.0,70.0,64.0,102.0,106.0,100.0,123.0,129.0,119.0,127.0,126.0,192.0,185.0,173.0,167.0,179.0,163.0,155.0,143.0,181.0
Active material layer,17.0,8.0,26.0,23.0,29.0,61.0,58.0,67.0,54.0,67.0,55.0,71.0,62.0,74.0,63.0,50.0,43.0,60.0,64.0,80.0
Energy storage device,5.0,35.0,24.0,33.0,26.0,13.0,23.0,33.0,44.0,33.0,44.0,36.0,33.0,49.0,46.0,36.0,47.0,64.0,60.0,59.0
Lithium ion battery,15.0,22.0,36.0,26.0,12.0,31.0,30.0,39.0,40.0,43.0,58.0,62.0,70.0,56.0,57.0,67.0,79.0,67.0,73.0,62.0
Electrode current collector,7.0,6.0,9.0,10.0,12.0,10.0,12.0,21.0,23.0,37.0,28.0,26.0,30.0,23.0,20.0,22.0,27.0,27.0,43.0,49.0
Lithium secondary battery,47.0,78.0,64.0,55.0,87.0,82.0,98.0,65.0,50.0,63.0,66.0,66.0,67.0,65.0,62.0,53.0,63.0,44.0,56.0,77.0
Plurality battery cell,3.0,1.0,3.0,1.0,5.0,6.0,5.0,13.0,10.0,10.0,9.0,26.0,26.0,31.0,22.0,27.0,27.0,27.0,32.0,32.0
Power storage device,11.0,3.0,6.0,8.0,2.0,5.0,2.0,14.0,28.0,13.0,54.0,32.0,47.0,42.0,45.0,40.0,40.0,62.0,30.0,36.0
Current collector electrode,5.0,7.0,5.0,3.0,6.0,7.0,6.0,13.0,15.0,16.0,17.0,25.0,17.0,15.0,17.0,13.0,17.0,16.0,22.0,29.0


In [None]:
#hier

In [360]:
# Generate LaTeX code
generate_latex_code(growing_list_abstract_3_scaled[1])


\begin{tabularx}{\linewidth} {| >{\raggedright\arraybackslash}p{3.7cm}| >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | }
\mc{} & \mc{2000} & \mc{2001} & \mc{2002} & \mc{2003} & \mc{2004} & \mc{2005} & \mc{2006} & \mc{2007} & \mc{2008} & \mc{2009} & \mc{2010} & \mc{2011} & \mc{2012} & \mc{2013} & \mc{2014} & \mc{2015} & \mc{2016} & \mc{2017} & \mc{2018} & \mc{2019} \\
\hline
\hline
ele

In [72]:
# Generate LaTeX code with capital letters in beginnings of string
generate_latex_code(abstract_intensities_new_df)


\begin{tabularx}{\linewidth} {| >{\raggedright\arraybackslash}p{3.7cm}| >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | }
\mc{} & \mc{2000} & \mc{2001} & \mc{2002} & \mc{2003} & \mc{2004} & \mc{2005} & \mc{2006} & \mc{2007} & \mc{2008} & \mc{2009} & \mc{2010} & \mc{2011} & \mc{2012} & \mc{2013} & \mc{2014} & \mc{2015} & \mc{2016} & \mc{2017} & \mc{2018} & \mc{2019} \\
\hline
\hline
Ele

In [361]:
# Export as PNG
#dfi.export(growing_list_abstract_3_scaled[2], 'growing_list_abstract_3_scaled.png')


In [362]:
shrinking_list_abstract_3[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
hydrogen absorbing alloy,34,34,9,6,2,2,3,22,4,6,3,1,5,0,2,8,13,4,15,0
temperature detection section,12,1,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0
charge storage device,15,3,2,0,2,3,4,2,2,8,4,10,8,4,4,6,9,17,3,3
absorbing alloy electrode,11,8,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,3,0
copyright jpo inpit,11,11,16,21,20,29,169,440,495,604,484,500,0,0,0,0,0,0,0,0
safety valve element,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
lithium secondary cell,11,27,10,10,11,6,12,4,11,6,5,8,11,13,3,17,14,6,6,1
battery safety valve,10,1,0,0,3,0,0,2,0,4,3,7,3,1,2,0,0,2,1,0
portable information terminal,13,2,1,1,0,0,0,0,0,0,0,0,0,18,3,9,2,8,3,3
absorbing alloy powder,9,5,3,0,0,0,0,5,0,0,0,0,1,0,0,0,5,4,0,0


In [363]:
len(shrinking_list_abstract_3[1].index.values)


50

In [364]:
for item in shrinking_list_abstract_3[1].index.values:
    print(item)

hydrogen absorbing alloy
temperature detection section
charge storage device
absorbing alloy electrode
copyright jpo inpit
safety valve element
lithium secondary cell
battery safety valve
portable information terminal
absorbing alloy powder
hydrogen storage alloy
information storage device
battery storage casing
polarity type electrode
polarity type collector
bi directional switch
current detector detecting
double layer capacitor
rest period followed
cooling medium passage
signal processing section
lithium ion polymer
main anode body
metallic porous body
function predetermined time
voltage supply terminal
ion polymer battery
output voltage vout
power feed mean
conductive porous body
hydrogen generating system
voltage change rate
residual battery capacity
circuit controlling charging
lithium manganese oxide
anode body portion
large secondary battery
sintered cadmium electrode
solar array system
consumer usage recorder
anode extension portion
alkaline electrochemical cell
signaling devic

In [365]:
# Generate LaTeX code
generate_latex_code(shrinking_list_abstract_3[1])


\begin{tabularx}{\linewidth} {| >{\raggedright\arraybackslash}p{3.7cm}| >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | }
\mc{} & \mc{2000} & \mc{2001} & \mc{2002} & \mc{2003} & \mc{2004} & \mc{2005} & \mc{2006} & \mc{2007} & \mc{2008} & \mc{2009} & \mc{2010} & \mc{2011} & \mc{2012} & \mc{2013} & \mc{2014} & \mc{2015} & \mc{2016} & \mc{2017} & \mc{2018} & \mc{2019} \\
\hline
\hline
hyd

In [366]:
# Export as PNG
#dfi.export(shrinking_list_abstract_3[2], 'shrinking_list_abstract_3.png')


In [367]:
shrinking_list_abstract_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
hydrogen absorbing alloy,35,29,8,5,2,1,2,10,2,2,1,0,1,0,0,1,2,0,2,0
lithium manganese oxide,21,15,10,5,0,1,1,1,0,1,1,5,3,4,2,2,2,2,2,1
lead acid battery,27,29,19,36,28,17,10,8,19,11,9,11,9,9,10,11,13,12,12,9
hydrogen storage alloy,18,31,19,4,15,24,12,14,2,12,7,5,2,1,1,3,0,0,1,1
external power source,21,6,5,6,8,8,11,11,8,9,10,9,5,9,9,9,7,9,6,4
charge discharge cycle,21,16,11,16,19,10,13,16,15,14,11,10,9,10,6,7,7,5,4,5
charge storage device,15,3,2,0,2,2,2,1,1,3,1,2,1,1,1,1,1,2,0,0
rechargeable lithium battery,17,19,9,2,6,7,2,7,8,6,5,10,4,2,7,4,5,1,1,3
alkaline storage battery,15,28,17,16,13,13,13,8,2,4,4,3,2,0,2,2,0,0,0,1
double layer capacitor,14,26,23,16,24,24,21,16,5,11,8,6,4,3,4,2,1,1,1,1


In [368]:
len(shrinking_list_abstract_3_scaled[1].index.values)


50

In [369]:
shrinking_list_abstract_3[1].index.values == shrinking_list_abstract_3_scaled[1].index.values

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [370]:
for item in shrinking_list_abstract_3_scaled[1].index.values:
    print(item)
    

hydrogen absorbing alloy
lithium manganese oxide
lead acid battery
hydrogen storage alloy
external power source
charge discharge cycle
charge storage device
rechargeable lithium battery
alkaline storage battery
double layer capacitor
portable information terminal
temperature detection section
electric double layer
absorbing alloy electrode
copyright jpo inpit
lithium secondary cell
active material electrode
polyolefin microporous film
safety valve element
battery safety valve
cooling medium passage
predetermined time period
nickel metal hydride
absorbing alloy powder
information storage device
transition metal compound
alkaline secondary battery
charging control circuit
iron cobalt nickel
power supply controller
battery storage casing
polarity type electrode
polarity type collector
bi directional switch
current detector detecting
rest period followed
mobile phone battery
electric power converter
treated steel sheet
surface treated steel
electrical system battery
manganese iron cobalt
c

In [371]:
# Generate LaTeX code
generate_latex_code(shrinking_list_abstract_3_scaled[1])


\begin{tabularx}{\linewidth} {| >{\raggedright\arraybackslash}p{3.7cm}| >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | >{\raggedleft\arraybackslash}X | }
\mc{} & \mc{2000} & \mc{2001} & \mc{2002} & \mc{2003} & \mc{2004} & \mc{2005} & \mc{2006} & \mc{2007} & \mc{2008} & \mc{2009} & \mc{2010} & \mc{2011} & \mc{2012} & \mc{2013} & \mc{2014} & \mc{2015} & \mc{2016} & \mc{2017} & \mc{2018} & \mc{2019} \\
\hline
\hline
hyd

In [372]:
# Export as PNG
#dfi.export(shrinking_list_abstract_3_scaled[2], 'shrinking_list_abstract_3_scaled.png')


In [373]:
highest_abs_change_list_abstract_3[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
electrode active material,89,82,72,121,136,157,241,278,294,356,427,991,1253,1288,1323,1386,1273,1308,1382,1990
lithium secondary battery,46,91,72,65,112,128,192,140,124,178,222,340,451,487,490,410,492,374,540,844
active material layer,17,9,29,27,37,96,114,143,134,188,186,364,419,553,499,383,336,510,619,883
copyright jpo inpit,11,11,16,21,20,29,169,440,495,604,484,500,0,0,0,0,0,0,0,0
power storage device,11,3,7,10,2,8,4,30,69,37,183,166,317,310,361,311,311,520,293,394
lithium ion battery,15,26,41,31,16,49,58,83,99,121,197,321,476,421,449,520,618,568,702,685
electrolyte secondary battery,25,27,41,36,65,74,110,118,180,129,113,214,301,269,437,425,412,286,264,405
energy storage device,5,41,27,39,33,20,44,70,109,94,147,183,224,367,367,277,370,538,576,648
non-aqueous electrolyte secondary,24,38,45,46,67,75,109,116,182,135,121,216,307,288,447,425,419,284,248,384
lithium ion secondary,12,29,18,30,33,63,72,74,76,84,164,270,366,418,442,506,379,342,291,345


In [374]:
highest_abs_change_list_abstract_3_scaled[2]


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
copyright jpo ncipi,5,3,4,11,120,232,138,0,0,0,0,0,0,0,0,0,0,0,0,0
copyright jpo inpit,11,9,14,18,16,19,86,205,200,215,143,97,0,0,0,0,0,0,0,0
electrode active material,91,70,64,102,106,100,123,129,119,127,126,192,185,173,167,179,163,155,143,181
lithium secondary battery,47,78,64,55,87,82,98,65,50,63,66,66,67,65,62,53,63,44,56,77
active material layer,17,8,26,23,29,61,58,67,54,67,55,71,62,74,63,50,43,60,64,80
power storage device,11,3,6,8,2,5,2,14,28,13,54,32,47,42,45,40,40,62,30,36
energy storage device,5,35,24,33,26,13,23,33,44,33,44,36,33,49,46,36,47,64,60,59
electrolyte secondary battery,26,23,36,30,50,47,56,55,73,46,33,42,45,36,55,55,53,34,27,37
non-aqueous electrolyte secondary,25,33,40,39,52,48,56,54,73,48,36,42,45,39,56,55,54,34,26,35
lithium ion battery,15,22,36,26,12,31,30,39,40,43,58,62,70,56,57,67,79,67,73,62


## Search certain strings

### Define patterns, functions, and strings we want to search

In [31]:
df_bruno

Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id,most_recent_title_this_family_id
0,1574492,2015,An underwater vehicle includes an on board pow...,- Underwater vehicle comprising power storage ...
1,3511554,2000,"The method involves placing all loads (7,8,9,1...",Method of controlling emergency power supply i...
2,3613974,2002,The electrode for an electrochemical arrangeme...,Electrode for an electrochemical arrangement c...
3,3673165,2002,The invention describes a method of regulating...,Method for regulating an inverter system
4,3681483,2001,The invention relates to an essentially flat e...,MULTILAYER ELECTRODE
...,...,...,...,...
92695,73455420,2019,The present invention provides a storage syste...,AUTOMATED STORAGE SYSTEM WITH A CONTAINER VEHI...
92696,73474213,2015,PROBLEM TO BE SOLVED: To provide a method allo...,WIRELESS CHARGING UNIT AND COUPLER BASED DOCKI...
92697,74557388,2015,"A surgical instrument can comprise a handle, a...",POWER MANAGEMENT CONTROL SYSTEM FOR SURGICAL I...
92698,74844536,2004,"FIELD: electrical engineering, namely manufact...",METHOD FOR CONTINUOUSLY MAKING ELECTRIC CURREN...


In [32]:
df_bruno_spaces_added = df_bruno.copy()


In [33]:
# Add space in front and at the end of each string
df_bruno_spaces_added['most_recent_abstract_this_family_id'] = ' '+df_bruno_spaces_added['most_recent_abstract_this_family_id']
df_bruno_spaces_added['most_recent_abstract_this_family_id'] = df_bruno_spaces_added['most_recent_abstract_this_family_id']+' '

df_bruno_spaces_added['most_recent_title_this_family_id'] = ' '+df_bruno_spaces_added['most_recent_title_this_family_id']
df_bruno_spaces_added['most_recent_title_this_family_id'] = df_bruno_spaces_added['most_recent_title_this_family_id']+' '


In [34]:
df_bruno_spaces_added

Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id,most_recent_title_this_family_id
0,1574492,2015,An underwater vehicle includes an on board po...,- Underwater vehicle comprising power storage...
1,3511554,2000,"The method involves placing all loads (7,8,9,...",Method of controlling emergency power supply ...
2,3613974,2002,The electrode for an electrochemical arrangem...,Electrode for an electrochemical arrangement ...
3,3673165,2002,The invention describes a method of regulatin...,Method for regulating an inverter system
4,3681483,2001,The invention relates to an essentially flat ...,MULTILAYER ELECTRODE
...,...,...,...,...
92695,73455420,2019,The present invention provides a storage syst...,AUTOMATED STORAGE SYSTEM WITH A CONTAINER VEH...
92696,73474213,2015,PROBLEM TO BE SOLVED: To provide a method all...,WIRELESS CHARGING UNIT AND COUPLER BASED DOCK...
92697,74557388,2015,"A surgical instrument can comprise a handle, ...",POWER MANAGEMENT CONTROL SYSTEM FOR SURGICAL ...
92698,74844536,2004,"FIELD: electrical engineering, namely manufac...",METHOD FOR CONTINUOUSLY MAKING ELECTRIC CURRE...


In [35]:
df_bruno_spaces_added['most_recent_abstract_this_family_id'][0]


' An underwater vehicle includes an on board power grid, the power grid including powerlines, a plurality of DC energy storage sources based on lithium-ion batteries, each source having a positive terminal and a negative terminal, and being connected on the one hand to a charger and on the other hand to energy consumers of the vehicle, one of the terminals of each source being connected to a first powerline of the power grid, the other terminal of each source being connected on the one hand by a second powerline of the power grid to the charger through one-way semiconductor conducting unit and on the other hand by a third powerline of the electric grid to consumers through one-way semiconductor conducting unit. '

In [36]:
df_bruno_spaces_added['most_recent_title_this_family_id'][0]


' - Underwater vehicle comprising power storage sources made from lithium-ion batteries '

In [37]:
def search_for_string(string, remove_punctuation):
    
    # Define that the string searched for has to have a whitespace before and after it. This is ok because all abstracts and titles have one in the beginning and end as well
    string = ' '+string+' '
    
    def remove_punctuation(item):
        item = re.sub('[^a-zA-Z]', ' ', item)
        return item
    
    feature = 'appln_title'
    feature_lg = 'appln_title_lg'
    data_this = data[data[feature_lg]=='en']
    
    if remove_punctuation:
        result_titles = data_this[data_this[feature].map(lambda x: remove_punctuation(x)).str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'docdb_family_id', feature]]
    else:
        result_titles = data_this[data_this[feature].str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'docdb_family_id', feature]]
        
    feature = 'appln_abstract'
    feature_lg = 'appln_abstract_lg'
    data_this = data[data[feature_lg]=='en']
    
    if remove_punctuation:
        result_abstracts = data_this[data_this[feature].map(lambda x: remove_punctuation(x)).str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'docdb_family_id', feature]]
    else:
        result_abstracts = data_this[data_this[feature].str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'docdb_family_id', feature]]
        
    return [result_titles, result_abstracts]

def get_occurence_counts(result):

    dicts = []

    #print_ = ['Titles:', 'Abstracts:']
    
    for i, item in enumerate(result):

        #print(i)
        
        dict_ = {}

        for year in set(item['earliest_publn_year_this_family_id']):

            ids = set(item[item['earliest_publn_year_this_family_id'] == year]['docdb_family_id'])
            num_ids = len(ids)

            dict_[year] = [num_ids, ids]

        dicts.append(dict_)
    
    return dicts
        
        #print(print_[i])
        
        #for year in list(dict_):
            #print(str(year)+': '+str(dict_[year][0])+' '+str(dict_[year][1]))
        
        #print()
            

In [38]:
if False: # Old version
    
    def search_for_string(string, remove_punctuation):
    
        string = ' '+string+' '
        #print(string)
        #print()

        def remove_punctuation(item):
            item = re.sub('[^a-zA-Z]', ' ', item)
            return item

        #feature = 'appln_title'
        feature = 'most_recent_title_this_family_id'
        #feature_lg = 'appln_title_lg'


        #data_this = data[data[feature_lg]=='en']
        data_this = df_bruno_spaces_added


        if remove_punctuation:
            #result_titles = data_this[data_this[feature].map(lambda x: remove_punctuation(x)).str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'docdb_family_id', feature]]
            result_titles = data_this[data_this[feature].map(lambda x: remove_punctuation(x)).str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'family_id', feature]]
        else:
            #result_titles = data_this[data_this[feature].str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'docdb_family_id', feature]]
            result_titles = data_this[data_this[feature].str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'family_id', feature]]

        #feature = 'appln_abstract'
        feature = 'most_recent_abstract_this_family_id'
        #feature_lg = 'appln_abstract_lg'
        #data_this = data[data[feature_lg]=='en']

        if remove_punctuation:
            #result_abstracts = data_this[data_this[feature].map(lambda x: remove_punctuation(x)).str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'docdb_family_id', feature]]
            result_abstracts = data_this[data_this[feature].map(lambda x: remove_punctuation(x)).str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'family_id', feature]]
        else:
            #result_abstracts = data_this[data_this[feature].str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'docdb_family_id', feature]]
            result_abstracts = data_this[data_this[feature].str.contains(string, case=False)][['earliest_publn_year_this_family_id', 'family_id', feature]]

        return [result_titles, result_abstracts]

    def get_occurence_counts(result):

        dicts = []

        #print_ = ['Titles:', 'Abstracts:']

        for i, item in enumerate(result):

            #print(i)

            dict_ = {}

            for year in set(item['earliest_publn_year_this_family_id']):

                #ids = set(item[item['earliest_publn_year_this_family_id'] == year]['docdb_family_id'])
                ids = set(item[item['earliest_publn_year_this_family_id'] == year]['family_id'])
                num_ids = len(ids)

                dict_[year] = [num_ids, ids]

            dicts.append(dict_)

        return dicts

            #print(print_[i])

            #for year in list(dict_):
                #print(str(year)+': '+str(dict_[year][0])+' '+str(dict_[year][1]))

            #print()
            

In [39]:
"""
strings_array = [
    [
        're use',
        'reuse',
        're using',
        'reusing'
    ],
    [
        'recycle',
        'recycling',
    ],
    [
        'reduce',
        'reducing'
    ],
    [
        'recover',
        'recovering',
        'retrieve',
        'retrieving'
    ],
    [
        'repair',
        'repairing'
    ],
    [
        'waste'
    ],
    [
        'durable',
        'durability'
    ],
    [
        'closed loop'
    ]
]

strings_array
"""

"\nstrings_array = [\n    [\n        're use',\n        'reuse',\n        're using',\n        'reusing'\n    ],\n    [\n        'recycle',\n        'recycling',\n    ],\n    [\n        'reduce',\n        'reducing'\n    ],\n    [\n        'recover',\n        'recovering',\n        'retrieve',\n        'retrieving'\n    ],\n    [\n        'repair',\n        'repairing'\n    ],\n    [\n        'waste'\n    ],\n    [\n        'durable',\n        'durability'\n    ],\n    [\n        'closed loop'\n    ]\n]\n\nstrings_array\n"

In [40]:
"""
strings_array = [
    [
        'circular economy'
    ],
    [
        'redesign',
        'redesigning'
    ],
    [
        'symbiosis'
    ],
    [
        'urban mining'
    ],
    [
        'metabolism',
        'metabolic'
    ],
    [
        'crade-to-cradle',
    ],
    [
        'decouple',
        'decoupling'
    ],
    [
        'lifecycle',
        'life cycle'
    ],
    [
        'downcycle',
        'down cycle',
        'downcycling'
    ],
    [
        'end of life'
    ],
    [
        'upcycle',
        'up cycle',
        'upcycling'
    ],
    [
        'extended producer responsibility'
    ],
    [
        'technical nutrients'
    ],
    [
        'renew'
    ],
    [
        'green'
    ],
    [
        'hydrogen'
    ],
    [
        'standard'
    ],
    [
        'wind'
    ],
    [
        'solar',
        'photovoltaic '
    ],
    [
        'electric vehicle',
        'electric vehicles',
        'electric mobility'
    ]
]

strings_array
"""

"\nstrings_array = [\n    [\n        'circular economy'\n    ],\n    [\n        'redesign',\n        'redesigning'\n    ],\n    [\n        'symbiosis'\n    ],\n    [\n        'urban mining'\n    ],\n    [\n        'metabolism',\n        'metabolic'\n    ],\n    [\n        'crade-to-cradle',\n    ],\n    [\n        'decouple',\n        'decoupling'\n    ],\n    [\n        'lifecycle',\n        'life cycle'\n    ],\n    [\n        'downcycle',\n        'down cycle',\n        'downcycling'\n    ],\n    [\n        'end of life'\n    ],\n    [\n        'upcycle',\n        'up cycle',\n        'upcycling'\n    ],\n    [\n        'extended producer responsibility'\n    ],\n    [\n        'technical nutrients'\n    ],\n    [\n        'renew'\n    ],\n    [\n        'green'\n    ],\n    [\n        'hydrogen'\n    ],\n    [\n        'standard'\n    ],\n    [\n        'wind'\n    ],\n    [\n        'solar',\n        'photovoltaic '\n    ],\n    [\n        'electric vehicle',\n        'electric ve

In [41]:
strings_array = [
    [
        're use',
        'reuse',
        're using',
        'reusing'
    ],
    [
        'repair',
        'repairing'
    ],
    [
        'recycle',
        'recycling'
    ],
    [
        'recover',
        'recovering',
        'retrieve',
        'retrieving'
    ]
]

strings_array


[['re use', 'reuse', 're using', 'reusing'],
 ['repair', 'repairing'],
 ['recycle', 'recycling'],
 ['recover', 'recovering', 'retrieve', 'retrieving']]

### Conduct search and save results

In [42]:
results_array = []

"""
results_array dimensions, their length:
1. string group, number of strings in strings_array
2. string, number of strings in group
3. titles/abstracts, 2
"""

outputs_list = []

for strings_list in tqdm(strings_array):
    
    results_list = []
    
    
    print(strings_list)
    
    for string_ in tqdm(strings_list):

        result = search_for_string(string_, True)
        results_list.append(result)
        
    results_array.append(results_list)
        
    results_merged = [pd.DataFrame(), pd.DataFrame()]
    for result in results_list:
        
        for i in [0,1]:
            
            results_merged[i] = results_merged[i].append(result[i])
            
    output = get_occurence_counts(results_merged)
    
    outputs_list.append(output)


  0%|          | 0/4 [00:00<?, ?it/s]

['re use', 'reuse', 're using', 'reusing']


  0%|          | 0/4 [00:00<?, ?it/s]

['repair', 'repairing']


  0%|          | 0/2 [00:00<?, ?it/s]

['recycle', 'recycling']


  0%|          | 0/2 [00:00<?, ?it/s]

['recover', 'recovering', 'retrieve', 'retrieving']


  0%|          | 0/4 [00:00<?, ?it/s]

In [43]:
results_array[0][1][1]


Unnamed: 0,earliest_publn_year_this_family_id,docdb_family_id,appln_abstract
53985,2000,11556126,PROBLEM TO BE SOLVED: To improve working capab...
53986,2000,11556126,PROBLEM TO BE SOLVED: To improve working capab...
53987,2000,11556126,PROBLEM TO BE SOLVED: To improve working capab...
53988,2000,11556126,PROBLEM TO BE SOLVED: To improve working capab...
53989,2000,11556126,PROBLEM TO BE SOLVED: To improve working capab...
...,...,...,...
4063814,2019,68781600,"The invention discloses a wireless earphone, a..."
4067096,2019,68825302,The utility model discloses a power supply dev...
4067097,2019,68825302,The utility model discloses a power supply dev...
4067098,2019,68825302,The utility model discloses a power supply dev...


In [44]:
outputs_list[0][1]


{2016: [10,
  {54477888,
   55302826,
   55456581,
   55651868,
   55828517,
   55828518,
   56798719,
   56984078,
   57277761,
   57440304}],
 2017: [5, {59258318, 59310505, 59624174, 59630174, 60788443}],
 2018: [7,
  {58772414, 60915254, 60937890, 61129474, 61658429, 62907340, 63918410}],
 2019: [20,
  {65367784,
   65514795,
   65948886,
   66100345,
   66189983,
   66251600,
   66277773,
   66448410,
   66630414,
   66655726,
   67300602,
   68057230,
   68100428,
   68467560,
   68501353,
   68547008,
   68616395,
   68727833,
   68781600,
   68825302}],
 2000: [3, {11556126, 17939624, 19615154}],
 2001: [1, {18614279}],
 2002: [1, {18710688}],
 2003: [2, {19040220, 27788265}],
 2004: [1, {31946364}],
 2006: [1, {35784005}],
 2008: [2, {39123527, 39590444}],
 2009: [4, {40453756, 40600666, 41315562, 41454580}],
 2010: [1, {42991482}],
 2011: [10,
  {43844582,
   43900475,
   44011496,
   44167153,
   44186663,
   44562918,
   45097881,
   45097909,
   45371217,
   45397920}],
 2

### Print counts for a defined string / string group. Use i = ... to define which one to show. Can be printed with or without the respectiev family IDs (print_family_ids = True/False)

In [45]:
i = 0

print_family_ids = False

print('String / string group: '+str(strings_array[i]))

for k, item in enumerate(['titles', 'abstracts']):
    
    print()
    print(item)

    to_print = outputs_list[i][k]

    print()
    if print_family_ids:
        print('year, number of occurrences, corresponding docdb family IDs')
    else:
        print('year, number of occurrences')
    print('---------------------------')

    for key_ in sorted(list(to_print)):
        
        if print_family_ids:
            print(str(key_)+', '+str(to_print[key_][0])+', '+str(to_print[key_][1]))
        else:
            print(str(key_)+', '+str(to_print[key_][0]))


String / string group: ['re use', 'reuse', 're using', 'reusing']

titles

year, number of occurrences
---------------------------
2004, 1
2009, 1
2010, 1
2011, 2
2013, 2
2014, 1
2016, 1
2018, 5
2019, 5

abstracts

year, number of occurrences
---------------------------
2000, 3
2001, 1
2002, 1
2003, 2
2004, 1
2006, 1
2008, 2
2009, 4
2010, 1
2011, 10
2012, 7
2013, 4
2014, 7
2015, 3
2016, 10
2017, 5
2018, 7
2019, 20


### Write counts to a .txt file

In [46]:
f1 = open('string_occ_counts.txt', 'w')
    
f2 = open('string_occ_counts_with_family_ids.txt', 'w')
    
for j, f in enumerate([f1, f2]):
    
    for i in range((len(outputs_list))):

        if j == 0:
            print_family_ids = False
        elif j == 1:
            print_family_ids = True
        else:
            print('Something is wrong.')

        f.write('String / string group: '+str(strings_array[i])+'\n')

        for k, item in enumerate(['titles', 'abstracts']):

            f.write('\n')
            f.write(item+'\n')

            to_print = outputs_list[i][k]

            f.write('')
            if print_family_ids:
                f.write('year;number of occurrences;corresponding docdb family IDs'+'\n')
            else:
                f.write('year,number of occurrences'+'\n')
            #f.write('---------------------------'+'\n')

            for key_ in sorted(list(to_print)):

                if print_family_ids:
                    f.write(str(key_)+';'+str(to_print[key_][0])+';'+str(to_print[key_][1])+'\n')
                else:
                    f.write(str(key_)+','+str(to_print[key_][0])+'\n')

        f.write('\n')
        f.write('\n')
        f.write('\n')

    f.close()


### Compute union of title family IDs and abstract family IDs


### Create labels for new table

In [47]:
one_string_list = []

for string_group in strings_array:
    
    one_string = ''
    
    for i, string_ in enumerate(string_group):
        
        if i > 0:
            one_string = one_string+'/'+string_
        else:
            one_string = one_string+string_
        
    one_string_list.append(one_string)

one_string_list

['re use/reuse/re using/reusing',
 'repair/repairing',
 'recycle/recycling',
 'recover/recovering/retrieve/retrieving']

### Alternative labels

In [48]:
one_string_list = ['recover', 'recycle', 'repair', 'reuse']

### Compute unions and create table

In [49]:
occ_array = []

for i in range((len(outputs_list))):
    
    #print(strings_array[i])
    
    occ_list = []
    
    for year in years:
        
        #print(year)
        
        # Get this years family IDs for titles ### [word group][titles / abstracts][year][count / family IDs]
        try:
            family_ids_titles = outputs_list[i][0][year][1]
        except Exception as e_titles:
            #print(type(e_titles))
            #print(e_titles)
            family_ids_titles = set()
            
        #print(type(family_ids_titles))
        #print(family_ids_titles)
        
        # Get this years family IDs for abstracts ### [word group][titles / abstracts][year][count / family IDs]
        try:
            family_ids_abstracts = outputs_list[i][1][year][1]
        except Exception as e_abstracts:
            family_ids_abstracts = set()
        
        #print(family_ids_abstracts)
        
        union_titles_abstracts = family_ids_titles.union(family_ids_abstracts)
        #print(union_titles_abstracts)
        
        occ_number = len(union_titles_abstracts)
        #print(occ_number)
        
        occ_list.append(occ_number)
        
        #print()
        
    occ_array.append(occ_list)
    
occ_df = pd.DataFrame(occ_array, index = one_string_list, columns = years)
occ_df


Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
recover,3,1,1,2,2,0,1,0,2,5,2,10,7,5,7,3,10,5,9,21
recycle,0,0,0,1,1,1,3,3,2,5,2,7,13,10,7,8,6,13,10,15
repair,5,2,3,6,3,2,4,6,3,6,4,24,16,17,13,21,29,22,25,40
reuse,5,10,3,10,12,10,6,16,12,18,14,40,38,36,44,29,35,39,34,59


### Compute yearly aggregate


In [50]:
yearly_aggregate = occ_df.sum(axis = 0)
yearly_aggregate = pd.DataFrame(yearly_aggregate)

yearly_aggregate = yearly_aggregate.reset_index()
yearly_aggregate = yearly_aggregate.rename(columns = {'index': 'year', 0: 'aggregate'})

yearly_aggregate

Unnamed: 0,year,aggregate
0,2000,13
1,2001,13
2,2002,7
3,2003,19
4,2004,18
5,2005,13
6,2006,14
7,2007,25
8,2008,19
9,2009,34


In [51]:
yearly_aggregate.to_csv(path_or_buf = 'cicular_counts', sep = ';', index = False)

### Compute each word group's total and sort from largest to smallest

In [52]:
group_totals = occ_df.sum(axis = 1)
group_totals.sort_values(ascending = False, inplace = True)
group_totals = pd.DataFrame(group_totals)
group_totals = group_totals.reset_index()
group_totals = group_totals.rename(columns = {'index': 'word group', 0: 'total occurrences'})


group_totals


Unnamed: 0,word group,total occurrences
0,reuse,470
1,repair,251
2,recycle,107
3,recover,96


### Visualise yearly aggregate on the left and each word group's total on the right


In [53]:
plot_agg = go.Scatter(
    x=yearly_aggregate['year'],
    y=yearly_aggregate['aggregate'],
    showlegend = False
)

plot_totals = go.Bar(
    x=group_totals['word group'],
    y=group_totals['total occurrences'],
    showlegend = False
)

In [None]:
#hier

In [78]:
circular_plot = pltsub.make_subplots(rows=1, cols = 2)

circular_plot.add_trace(plot_agg, row=1, col=1) 

circular_plot.add_trace(plot_totals, row=1, col=2)

##########

# Update left plot
circular_plot.update_traces(
    marker_color = colors_plotly_default[0],
    col=1)

circular_plot.update_xaxes(
    title_text="Earliest publication year",
    showgrid = False,
    gridcolor='black',
    color = 'black',
    dtick = 1,
    row = 1,
    col = 1)

circular_plot.update_yaxes(
    title_text="IPFs with keyword in title or abstract",
    showgrid = True,
    gridcolor='black',
    color = 'black',
    zerolinecolor = 'black',
    zerolinewidth = 1,
    row = 1,
    col = 1)

circular_plot.add_vline(x=2000, line_width = 1, line_color="black", row=1, col=1)

##########

# Update right plot

circular_plot.update_traces(
    marker_color = colors_plotly_default[0],
    col=2)

circular_plot.update_xaxes(
    title_text="Word group",
    showgrid = False,
    gridcolor='black',
    color = 'black',
    row = 1,
    col = 2)

circular_plot.update_yaxes(
    title_text="IPFs with this word group",
    showgrid = True,
    gridcolor='black',
    color = 'black',
    zerolinecolor = 'black',
    zerolinewidth = 1,
    row = 1,
    col = 2)

circular_plot.add_vline(x=-0.5, line_width=1, line_color="black", row=1, col=2)

margin_ = 10

circular_plot.update_layout(
    plot_bgcolor = "white",
    #margin=dict(l=margin_, r=margin_, t=margin_, b=margin_)
)

circular_plot.show()



In [79]:
image_saver(circular_plot, 'circular', True)


executing a function from helpers.py


In [80]:
path_total_yearly_counts_df_2 = main_path_mac+'/02 Code/total_yearly_counts'

total_yearly_counts_df_2 = pd.read_csv(path_total_yearly_counts_df_2, delimiter=';')
total_yearly_counts_df_2.rename(columns = {'count': 'patent families count'}, inplace = True)


In [81]:
total_yearly_counts_df_2


Unnamed: 0,year,patent families count
0,2000,979.0
1,2001,1164.0
2,2002,1132.0
3,2003,1195.0
4,2004,1288.0
5,2005,1594.0
6,2006,1998.0
7,2007,2152.0
8,2008,2484.0
9,2009,2816.0


### Compute year over year increases

In [82]:
totals_series = total_yearly_counts_df_2['patent families count']

increase = []

for i in range(1,len(totals_series)):
    
    increase_this_year = (totals_series[i] - totals_series[i-1]) / totals_series[i-1]
    print(increase_this_year)
    increase.append(increase_this_year)
    
sum(increase) / len(increase)

0.188968335035746
-0.027491408934704267
0.05565371024734696
0.07782426778242882
0.237577639751551
0.2534504391468031
0.07707707707707873
0.15427509293680347
0.13365539452495992
0.2002840909090871
0.5275147928994095
0.311253147394926
0.10265878877400393
0.06845277963831285
-0.022567703109332145
0.013853258081069455
0.07717611336032615
0.14622973925299299
0.141305461625172


0.14300794823126217

In [83]:
totals_series = yearly_aggregate['aggregate']

increase = []

for i in range(1,len(totals_series)):
    
    increase_this_year = (totals_series[i] - totals_series[i-1]) / totals_series[i-1]
    print(increase_this_year)
    increase.append(increase_this_year)
    
sum(increase) / len(increase)


0.0
-0.46153846153846156
1.7142857142857142
-0.05263157894736842
-0.2777777777777778
0.07692307692307693
0.7857142857142857
-0.24
0.7894736842105263
-0.35294117647058826
2.6818181818181817
-0.08641975308641975
-0.08108108108108109
0.04411764705882353
-0.14084507042253522
0.3114754098360656
-0.0125
-0.012658227848101266
0.7307692307692307


0.28506232123387215

### Compute share of circular IPFs out of all our IPFs

#### For each year

In [84]:
yearly_aggregate['aggregate'] / total_yearly_counts_df_2['patent families count']

0     0.013279
1     0.011168
2     0.006184
3     0.015900
4     0.013975
5     0.008156
6     0.007007
7     0.011617
8     0.007649
9     0.012074
10    0.006509
11    0.015689
12    0.010931
13    0.009109
14    0.008902
15    0.007825
16    0.010121
17    0.009279
18    0.007993
19    0.012121
dtype: float64

#### For whole time period

In [85]:
sum(yearly_aggregate['aggregate']) / sum(total_yearly_counts_df_2['patent families count'])

0.009971187154003027

## Compare IEA&EPO's li-ion and other lithium series to ours


In [86]:
ours_li_ion = np.array([51, 76, 81, 52, 74, 88, 137, 115, 100, 122, 143, 219, 248, 292, 339, 350, 317, 318, 386])
print(len(ours_li_ion))
print(ours_li_ion)
print()

ours_other_li = np.array([87, 97, 91, 95, 92, 112, 161, 191, 203, 247, 319, 528, 766, 747, 815, 783, 764, 691, 800])
print(len(ours_other_li))
print(ours_other_li)
print()

theirs = np.array([376, 454, 457, 424, 510, 553, 693, 704, 887, 928, 1097, 1556, 1933, 2223, 2373, 2428, 2392, 2374, 2547])
print(len(theirs))
print(theirs)



19
[ 51  76  81  52  74  88 137 115 100 122 143 219 248 292 339 350 317 318
 386]

19
[ 87  97  91  95  92 112 161 191 203 247 319 528 766 747 815 783 764 691
 800]

19
[ 376  454  457  424  510  553  693  704  887  928 1097 1556 1933 2223
 2373 2428 2392 2374 2547]


In [87]:
ours = ours_li_ion + ours_other_li
ours


array([ 138,  173,  172,  147,  166,  200,  298,  306,  303,  369,  462,
        747, 1014, 1039, 1154, 1133, 1081, 1009, 1186])

In [88]:
r = np.corrcoef(x = ours, y = theirs)
r


array([[1.       , 0.9939594],
       [0.9939594, 1.       ]])

In [89]:
theirs_ours = [theirs, ours]
theirs_ours


[array([ 376,  454,  457,  424,  510,  553,  693,  704,  887,  928, 1097,
        1556, 1933, 2223, 2373, 2428, 2392, 2374, 2547]),
 array([ 138,  173,  172,  147,  166,  200,  298,  306,  303,  369,  462,
         747, 1014, 1039, 1154, 1133, 1081, 1009, 1186])]

In [90]:
years_this = years[:-1]
print(years_this)


[2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]


In [91]:
for i, item in enumerate(theirs_ours):
    
    x = years_this
    y = item
    data_name = ['IEA & EPO:<br>Lithium and li-ion', 'Our paper:<br>Lithium-ion + Other lithium'][i]
    #print(country)
    
    line_width = 3
    
    if True:
        if i == 0:
            comparison_data = [dict(type = 'scatter',
                                          x = x,
                                          y = y,
                                          name = data_name,
                                          line_width = line_width
                                         )
                                    ]
        else:
            if True:
                comparison_data.append(dict(type = 'scatter',
                                                  x = x,
                                                  y = y,
                                                  name = data_name,
                                                  line_width = line_width
                                                 )
                                            )
                

In [92]:
comparison_title = "Comparison of IEA & EPO's vs. our li-ion and lithium counts, "+str(min(years_this))+'-'+str(max(years_this))

margin_ = 10

comparison_layout =dict(
    #title=dict(
    #    text = person_ctry_code_title,
    #    y = 0.9,
    #    x = 0.5,
    #    xanchor = 'center',
    #    yanchor = 'top',
    #    font = dict(color = 'black')
    #),
    yaxis = dict(
        color = 'black',
        title = 'Number of IPFs',
        showgrid = True,
        gridwidth = 1,
        gridcolor = 'black',
        zerolinecolor = 'black',
        zerolinewidth = 1
    ),
    xaxis = dict(
        color = 'black',
        title='Earliest publication year',
        dtick = 1,
        tickvals = years_this
    ),
    legend = dict(
        xanchor = "left",
        yanchor = "middle",
        y = 0.5, 
        x = 1,
        orientation = "v"
    ),
    plot_bgcolor = 'white',
    margin=dict(l=margin_, r=margin_, t=margin_, b=margin_)
)




In [93]:
comparison_plot = go.Figure(data = comparison_data, layout = comparison_layout)

#comparison_plot.update_yaxes(type="log",
#                            dtick=1)
           
comparison_plot.update_xaxes(dtick=1)

comparison_plot.show()

# Save this plot as eps

filename = 'comparison_li_and_li_ion'

image_saver(comparison_plot, filename, True)

comparison_title


executing a function from helpers.py


"Comparison of IEA & EPO's vs. our li-ion and lithium counts, 2000-2018"

In [94]:
theirs_ours[0]


array([ 376,  454,  457,  424,  510,  553,  693,  704,  887,  928, 1097,
       1556, 1933, 2223, 2373, 2428, 2392, 2374, 2547])

In [95]:
x = theirs_ours[0]
y = theirs_ours[1]
data_name = 'Our paper:<br>Lithium-ion + Other lithium'
#print(country)

line_width = 0

comparison2_data = [dict(type = 'scatter',
                         x = x,
                         y = y,
                         name = data_name,
                         line_width = line_width
                        )
                   ]


In [96]:
comparison2_title = "Plotting IEA & EPO' against our li-ion and lithium counts"

margin_ = 10

comparison2_layout =dict(
    #title=dict(
    #    text = person_ctry_code_title,
    #    y = 0.9,
    #    x = 0.5,
    #    xanchor = 'center',
    #    yanchor = 'top',
    #    font = dict(color = 'black')
    #),
    yaxis = dict(
        color = 'black',
        title = 'Our paper: Lithium-ion + Other lithium',
        showgrid = True,
        gridwidth = 1,
        gridcolor = 'black',
        zerolinecolor = 'black',
        zerolinewidth = 1
    ),
    xaxis = dict(
        color = 'black',
        title='IEA & EPO: Lithium and li-ion',
        showgrid = True,
        gridwidth = 1,
        gridcolor = 'black',
        #dtick = 1,
        #tickvals = years_this
    ),
    legend = dict(
        xanchor = "left",
        yanchor = "middle",
        y = 0.5, 
        x = 1,
        orientation = "v"
    ),
    plot_bgcolor = 'white',
    margin=dict(l=margin_, r=margin_, t=margin_, b=margin_)
)


In [97]:
comparison2_plot = go.Figure(data = comparison2_data, layout = comparison2_layout)

#comparison_plot.update_yaxes(type="log",
#                            dtick=1)
           
#comparison2_plot.update_xaxes(dtick=1)

comparison2_plot.show()

# Save this plot as eps

filename = 'linearity_check_li_and_li_ion'

image_saver(comparison2_plot, filename, True)

comparison2_title


executing a function from helpers.py


"Plotting IEA & EPO' against our li-ion and lithium counts"

# Two analytical tasks:
[1]
- we need trigrams on the "circular words", that is, every time any circular words appears: what is the top 10 or top 20 arrays with two other words next to it (these other words will be our evidence that they are really circular and not just false) positives
- the circular words are these:
"Reuse",
"Recycle",
"Reduce",
"Recover",
"Repair",
"Waste",
"Durable",
"Closed loop",
"Renew"
- the word "reduce" really looks suspect ... we are sure you will only have technical terms and dry engineering words around it ... this will mean this word is NOT a real indicator of curcularity ...


[2]
- now we must identify the words that are really circularity indicators ... imagine that "Recycle" and "Repair" have many nice associated words that we can argue are pro-circular, so we will select them (it will be a matter of judgement on our part)
- this step is quite critical ... we will have to judge which of the circular words are effectively the good circularity indicators ... we must stop here and establish which words these are before we proceed .... this will be a contribution!
- if there are a number of words that are pro-circular indicators (e.g. "Recycle", "Repair", "Reuse", etc.) them i propose the following -> identify for every single year the patents that display ANY of these circular terms
- from the number of pro-circular patents we will display the time series ... hopefully it will go up !!! ... this will be quite an important contribution!
- compare this time series with the agregate ... we would like to see circular patents rising faster than the aggregate trend (that would be quite nice, but i would be quite surprised if we can make a strong case that in fact Batteries have become very much more circular over time!!!)

In [98]:
df_bruno


Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id,most_recent_title_this_family_id
0,1574492,2015,An underwater vehicle includes an on board pow...,- Underwater vehicle comprising power storage ...
1,3511554,2000,"The method involves placing all loads (7,8,9,1...",Method of controlling emergency power supply i...
2,3613974,2002,The electrode for an electrochemical arrangeme...,Electrode for an electrochemical arrangement c...
3,3673165,2002,The invention describes a method of regulating...,Method for regulating an inverter system
4,3681483,2001,The invention relates to an essentially flat e...,MULTILAYER ELECTRODE
...,...,...,...,...
92695,73455420,2019,The present invention provides a storage syste...,AUTOMATED STORAGE SYSTEM WITH A CONTAINER VEHI...
92696,73474213,2015,PROBLEM TO BE SOLVED: To provide a method allo...,WIRELESS CHARGING UNIT AND COUPLER BASED DOCKI...
92697,74557388,2015,"A surgical instrument can comprise a handle, a...",POWER MANAGEMENT CONTROL SYSTEM FOR SURGICAL I...
92698,74844536,2004,"FIELD: electrical engineering, namely manufact...",METHOD FOR CONTINUOUSLY MAKING ELECTRIC CURREN...


### Reduce dataset to rows that contain strings we are potentially interested in

In [99]:
strings_array = [
    [
        're use',
        'reuse',
        're using',
        'reusing'
    ],
    [
        'recycle',
        'recycling',
    ],
    [
        'reduce',
        'reducing'
    ],
    [
        'recover',
        'recovering',
        'retrieve',
        'retrieving'
    ],
    [
        'repair',
        'repairing'
    ],
    [
        'waste'
    ],
    [
        'durable',
        'durability'
    ],
    [
        'closed loop'
    ],
    [
        'renew',
        'renewing'
    ]
]

proxy_list = [
    '<REUSE>',
    '<RECYCLE>',
    '<REDUCE>',
    '<RECOVER>',
    '<REPAIR>',
    '<WASTE>',
    '<DURABLE>',
    '<CLOSED-LOOP>',
    '<RENEW>',
]


In [100]:
strings_array_with_lengths = []

for string_group in strings_array:
    
    string_group_with_lengths = []
    
    for string_ in string_group:
        
        string_group_with_lengths.append((string_, len(string_)))
        
    strings_array_with_lengths.append(string_group_with_lengths)
    
strings_array_with_lengths


[[('re use', 6), ('reuse', 5), ('re using', 8), ('reusing', 7)],
 [('recycle', 7), ('recycling', 9)],
 [('reduce', 6), ('reducing', 8)],
 [('recover', 7), ('recovering', 10), ('retrieve', 8), ('retrieving', 10)],
 [('repair', 6), ('repairing', 9)],
 [('waste', 5)],
 [('durable', 7), ('durability', 10)],
 [('closed loop', 11)],
 [('renew', 5), ('renewing', 8)]]

In [101]:
strings_array_sorted = []

for string_group in strings_array_with_lengths:
    
    string_group_sorted = sorted(string_group, key = lambda y: y[1], reverse = True)
    
    string_group_sorted_just_string = []
    
    for string_ in string_group_sorted:
        
        string_group_sorted_just_string.append(string_[0])
        
    strings_array_sorted.append(string_group_sorted_just_string)
    
strings_array_sorted


[['re using', 'reusing', 're use', 'reuse'],
 ['recycling', 'recycle'],
 ['reducing', 'reduce'],
 ['recovering', 'retrieving', 'retrieve', 'recover'],
 ['repairing', 'repair'],
 ['waste'],
 ['durability', 'durable'],
 ['closed loop'],
 ['renewing', 'renew']]

In [102]:
strings_array = strings_array_sorted


### Make lower case, punctuation removal, numbers removal, deal with multiple spaces, and add beginning of string marker and end of string marker

In [103]:
df_bruno

Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id,most_recent_title_this_family_id
0,1574492,2015,An underwater vehicle includes an on board pow...,- Underwater vehicle comprising power storage ...
1,3511554,2000,"The method involves placing all loads (7,8,9,1...",Method of controlling emergency power supply i...
2,3613974,2002,The electrode for an electrochemical arrangeme...,Electrode for an electrochemical arrangement c...
3,3673165,2002,The invention describes a method of regulating...,Method for regulating an inverter system
4,3681483,2001,The invention relates to an essentially flat e...,MULTILAYER ELECTRODE
...,...,...,...,...
92695,73455420,2019,The present invention provides a storage syste...,AUTOMATED STORAGE SYSTEM WITH A CONTAINER VEHI...
92696,73474213,2015,PROBLEM TO BE SOLVED: To provide a method allo...,WIRELESS CHARGING UNIT AND COUPLER BASED DOCKI...
92697,74557388,2015,"A surgical instrument can comprise a handle, a...",POWER MANAGEMENT CONTROL SYSTEM FOR SURGICAL I...
92698,74844536,2004,"FIELD: electrical engineering, namely manufact...",METHOD FOR CONTINUOUSLY MAKING ELECTRIC CURREN...


In [104]:
df_bruno_lower_punc_removed = df_bruno.copy()

# make lower case
df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'] = df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'].str.lower()

# remove punctuation
df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'] = df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'].str.replace(pat = '[^\w\s]', repl = ' ', regex = True)

# remove numbers
df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'] = df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'].str.replace(pat = '\d', repl = ' ', regex = True)

# Add <beg> and <end> markers
df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'] = '<beg> '+df_bruno_lower_punc_removed['most_recent_abstract_this_family_id']
df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'] = df_bruno_lower_punc_removed['most_recent_abstract_this_family_id']+' <end>'

# replace more-than-one spaces with one space
df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'] = df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'].str.replace(pat = '\s{2,}', repl = ' ', regex = True)


In [105]:
df_bruno_lower_punc_removed


Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id,most_recent_title_this_family_id
0,1574492,2015,<beg> an underwater vehicle includes an on boa...,- Underwater vehicle comprising power storage ...
1,3511554,2000,<beg> the method involves placing all loads in...,Method of controlling emergency power supply i...
2,3613974,2002,<beg> the electrode for an electrochemical arr...,Electrode for an electrochemical arrangement c...
3,3673165,2002,<beg> the invention describes a method of regu...,Method for regulating an inverter system
4,3681483,2001,<beg> the invention relates to an essentially ...,MULTILAYER ELECTRODE
...,...,...,...,...
92695,73455420,2019,<beg> the present invention provides a storage...,AUTOMATED STORAGE SYSTEM WITH A CONTAINER VEHI...
92696,73474213,2015,<beg> problem to be solved to provide a method...,WIRELESS CHARGING UNIT AND COUPLER BASED DOCKI...
92697,74557388,2015,<beg> a surgical instrument can comprise a han...,POWER MANAGEMENT CONTROL SYSTEM FOR SURGICAL I...
92698,74844536,2004,<beg> field electrical engineering namely manu...,METHOD FOR CONTINUOUSLY MAKING ELECTRIC CURREN...


In [106]:
df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'][0]

'<beg> an underwater vehicle includes an on board power grid the power grid including powerlines a plurality of dc energy storage sources based on lithium ion batteries each source having a positive terminal and a negative terminal and being connected on the one hand to a charger and on the other hand to energy consumers of the vehicle one of the terminals of each source being connected to a first powerline of the power grid the other terminal of each source being connected on the one hand by a second powerline of the power grid to the charger through one way semiconductor conducting unit and on the other hand by a third powerline of the electric grid to consumers through one way semiconductor conducting unit <end>'

### Get one dataframe for each string group with rows we are interested in only

In [107]:
df_bruno_lower_punc_removed


Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id,most_recent_title_this_family_id
0,1574492,2015,<beg> an underwater vehicle includes an on boa...,- Underwater vehicle comprising power storage ...
1,3511554,2000,<beg> the method involves placing all loads in...,Method of controlling emergency power supply i...
2,3613974,2002,<beg> the electrode for an electrochemical arr...,Electrode for an electrochemical arrangement c...
3,3673165,2002,<beg> the invention describes a method of regu...,Method for regulating an inverter system
4,3681483,2001,<beg> the invention relates to an essentially ...,MULTILAYER ELECTRODE
...,...,...,...,...
92695,73455420,2019,<beg> the present invention provides a storage...,AUTOMATED STORAGE SYSTEM WITH A CONTAINER VEHI...
92696,73474213,2015,<beg> problem to be solved to provide a method...,WIRELESS CHARGING UNIT AND COUPLER BASED DOCKI...
92697,74557388,2015,<beg> a surgical instrument can comprise a han...,POWER MANAGEMENT CONTROL SYSTEM FOR SURGICAL I...
92698,74844536,2004,<beg> field electrical engineering namely manu...,METHOD FOR CONTINUOUSLY MAKING ELECTRIC CURREN...


In [108]:
df_strings_list = []

for string_list in tqdm(strings_array):
    
    df_this_group = pd.DataFrame()
    
    for string_ in string_list:
    
        pattern = ' '+string_+' '
        print(pattern)
        
        bool_ = df_bruno_lower_punc_removed['most_recent_abstract_this_family_id'].str.contains(pattern, case = False, regex = True)

        df_this_string = df_bruno_lower_punc_removed[bool_]

        df_this_group = pd.concat([df_this_group, df_this_string])
        
    df_this_group.drop_duplicates(inplace = True)
    
    df_strings_list.append(df_this_group)


  0%|          | 0/9 [00:00<?, ?it/s]

 re using 
 reusing 
 re use 
 reuse 
 recycling 
 recycle 
 reducing 
 reduce 
 recovering 
 retrieving 
 retrieve 
 recover 
 repairing 
 repair 
 waste 
 durability 
 durable 
 closed loop 
 renewing 
 renew 


### Replace string with string group proxy

In [109]:
df_strings_list_replaced = []

for i, df_this_group in enumerate(df_strings_list):
    
    #print(i)
    #print(proxy_list[i])
    
    df_this_group_proxy_replacement = df_this_group.copy()
    
    for string_ in strings_array[i]:
        
        #print(string_)
        
        df_this_group_proxy_replacement['most_recent_abstract_this_family_id'] = df_this_group_proxy_replacement['most_recent_abstract_this_family_id'].str.replace(pat = string_, repl = proxy_list[i], regex = False)
        
    df_strings_list_replaced.append(df_this_group_proxy_replacement)


In [110]:
df_strings_list_replaced[8]

Unnamed: 0,family_id,earliest_publn_year_this_family_id,most_recent_abstract_this_family_id,most_recent_title_this_family_id
1040,16240516,2001,<beg> a battery charge control device is capab...,Battery charge control device having function ...
4580,31986790,2004,<beg> p problem to be solved to apply a suitab...,CONTROL DEVICE FOR ON-VEHICLE STORAGE BATTERY
12768,39580401,2008,<beg> in the method for estimating battery res...,Method and device for estimating battery resid...
13703,39995613,2008,<beg> p problem to be solved to provide a meth...,METHOD AND DEVICE FOR ESTIMATING BATTERY RESID...
24809,44773990,2012,<beg> this invention provides a residual batte...,Method for determining a power level of a batt...
40894,49915906,2014,<beg> problem to be solved to provide an on de...,ON-DEMAND TYPE MULTIPLE POWER SUPPLY MANAGEMEN...
36326,48521113,2014,<beg> a method for charging a flow battery com...,METHOD FOR CHARGING A ZINC/LEAD FLOW BATTERY A...


In [111]:
df_strings_list_replaced[8]['most_recent_abstract_this_family_id'][1040]

'<beg> a battery charge control device is capable of detecting a gassing state in a battery without using a temperature sensor under condition of charging the battery with a charging current when a terminal voltage vi of the battery monitored by the device exceeds a threshold value a temporary gassing detecting unit informs a voltage regulation calculating unit that the battery is in course of reaching its gassing state the voltage regulation calculating unit calculates a changing rate of terminal voltage vi of the battery while it is in course of reaching the gassing state when the calculated changing rate exceeds a preset decision value a gassing detecting unit judges that the battery has just reached the gassing state a decision value <RENEW> unit changes the decision value corresponding to the charging current <end>'

### For each string, get trigrams

In [112]:
stopwords_basic = stopwords.words('english')
#stopwords_basic


In [113]:
trigram_counters_list = []

# Initialise lemmatizer
lem = WordNetLemmatizer()

for i,df_ in enumerate(df_strings_list_replaced):

    most_recent_abstract_this_family_id_list = list(df_['most_recent_abstract_this_family_id'])
    earliest_publn_year_this_family_id_list = list(df_['earliest_publn_year_this_family_id'])
    family_id_list = list(df_['family_id'])
    
    #print(len(most_recent_abstract_this_family_id_list))
    #print(len(earliest_publn_year_this_family_id_list))
    #print(len(family_id_list))
    #print()
    
    trigrams_list_this_string = []

    for j in tqdm(range(len(most_recent_abstract_this_family_id_list))):

        item = most_recent_abstract_this_family_id_list[j]

        # Tokenise
        item = item.split()
        
        ####################
        
        if True:
            
            # Replace certain words with others (according to replace_words dictionary defined above)
            item_replaced = []

            for word in item:

                if word in list(replace_words):

                    item_replaced.extend(replace_words[word].split())

                else:

                    item_replaced.append(word)

            item = item_replaced
            
        ####################
            
        if True:
            
            # Stopword removal
            
            #stopwords_this = stopwords_basic
            stopwords_this = stopwords_
            
            item_stopwords_removed = []
            
            for word in item:
                
                if not word in stopwords_this:
                    
                    item_stopwords_removed.append(word)
                    
            item = item_stopwords_removed
            
        ####################
            
        if True:
            
            # Delete words that are a repetition of the word before
            
            # Always add the first word to next stage
            # If this list of words is empty, do nothing
            try:
                #print(item[0])
                item_without_repetitions = [item[0]]
            except Exception as e:
                pass
                                    
            # Loop over the other words (the second and following) and add them to the next stage, if they are not 
            # a repetition of the word before
            for k in range(1, len(item)):
                
                if (item[k-1] != item[k]):
                    
                    item_without_repetitions.append(item[k])
                    
            item = item_without_repetitions
            
        ####################
        
        if True:
            
            # Lemmatisation
            item_lemmatised = [lem.lemmatize(word) for word in item]
            
            item = item_lemmatised
                    
        ####################
        
        # Get trigrams
        
        # Get this proxy
        proxy_this = proxy_list[i]
        
        # Use numpy to get indices of where our proxy is located
        item_np_array = np.array(item)
        indices = list(np.where(item_np_array == proxy_this)[0])

        #print(indices)
        
        for occ_index in indices:
                    
            before = item[occ_index - 1]
            after = item[occ_index + 1]
                
            trigram = (before, item[occ_index], after)
                
            #print(trigram)
            
            # Only add this trigram if its first element is not the same as its third
            if not trigram[0] == trigram[2]:
            
                trigrams_list_this_string.append(trigram)
            
    counter = dict(Counter(trigrams_list_this_string).most_common())
    trigram_counters_list.append(counter)
    

  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/4068 [00:00<?, ?it/s]

  0%|          | 0/312 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/265 [00:00<?, ?it/s]

  0%|          | 0/607 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [114]:
trigram_counters_list[8]


{('progressed', '<RENEW>', 'value'): 2,
 ('value', '<RENEW>', 'unit'): 1,
 ('process', '<RENEW>', 'timer'): 1,
 ('voltage', '<RENEW>', 'unit'): 1,
 ('unit', '<RENEW>', 'data'): 1,
 ('amount', '<RENEW>', 'mean'): 1,
 ('continuously', '<RENEW>', 'electrolyte'): 1}

In [115]:
trigram_counters_list_first_ones = []

for trigram_counter in trigram_counters_list:
    
    trigram_counter_first_ones = []
    
    for i, count in enumerate(list(trigram_counter)):
        
        if i < 20:
            
            trigram_counter_first_ones.append((count, trigram_counter[count]))
            
    
    trigram_counters_list_first_ones.append(trigram_counter_first_ones)     
            

In [116]:
print("'<beg>' = beginning of abstract")
print("'<end>' = end of abstract")
print()

print()
print()
print()

for i, trigram_counter_first_ones in enumerate(trigram_counters_list_first_ones):
    
    print('Strings: '+str(strings_array[i]))
    print('Proxy: '+proxy_list[i])
    print()
    
    print('trigram,count')
    print('----------------------------')
    
    for item in trigram_counter_first_ones:
        
        to_print = item[0][0]+' '+item[0][1]+' '+item[0][2]+','+str(item[1])
        print(to_print)

    print()
    print()
    print()


'<beg>' = beginning of abstract
'<end>' = end of abstract




Strings: ['re using', 'reusing', 're use', 'reuse']
Proxy: <REUSE>

trigram,count
----------------------------
step <REUSE> battery,2
diagnosis <REUSE> electric,2
determine <REUSE> battery,2
point <REUSE> start,2
element <REUSE> residual,1
power <REUSE> current,1
efficiency <REUSE> light,1
primary <REUSE> storage,1
battery <REUSE> side,1
equipment <REUSE> storage,1
server <REUSE> side,1
<beg> <REUSE> cell,1
solution <REUSE> cell,1
state <REUSE> object,1
battery <REUSE> object,1
recycling <REUSE> spent,1
recycle <REUSE> various,1
efficiently <REUSE> reflected,1
<beg> <REUSE> battery,1
station <REUSE> battery,1



Strings: ['recycling', 'recycle']
Proxy: <RECYCLE>

trigram,count
----------------------------
<beg> <RECYCLE> lithium,4
recovery <RECYCLE> raw,3
battery <RECYCLE> apparatus,3
process <RECYCLE> electrode,3
battery <RECYCLE> process,3
present <RECYCLE> lithium,3
<beg> <RECYCLE> lead,3
battery <RECYCLE> system,2
concer