In [1]:
import json

import pandas as pd
import numpy as np

import chart_studio
import chart_studio.plotly as cs_py

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from plotly.colors import n_colors
import colorcet as cc

import scipy.stats as stats


from sklearn.decomposition import NMF, PCA, KernelPCA
#from sklearn.cluster import KMeans
from sklearn.cluster  import OPTICS 



In [2]:

def load_json(json_file):
    with open(json_file) as f:
        data = json.load(f)
    return data

# load chart_studio credentials from json
chart_studio.tools.set_credentials_file(**load_json('auth/.chart_studio.json'))

In [3]:
# Read patents data
patents = pd.read_excel('out\inst_patents.xlsx')
insts = pd.read_excel('out\polon_discp.xlsx')

## Typy sektorów wg. klasyfikacji NACE

In [4]:
nace_df = pd.read_excel('concordance\\nace.xlsx')

nace_code_labels  = nace_df[(nace_df['Code'].str.len() <=2) ].to_dict(orient='records')


code_label_mapping={}

for d in nace_code_labels:
    if d['Code'] == d['Sector']:
        code_label_mapping[d['Code']] = d['Code'] + ' - ' +d['Label']
    else:
        code_label_mapping[d['Code']] =  d['Sector'] +d['Code'] + ' - ' +d['Label']

In [5]:
patents_nace = pd.read_excel('out\patents_nace.xlsx', header=[0,1], index_col=[0])

In [6]:
# Join patents and insts on institution name
patents_inst = patents.merge(insts, left_on='institution_id', right_on='uid', how='left')

In [7]:
# Filter institutions by the type of institution (leave only PRI)
patents_inst = patents_inst[patents_inst['kind'].isin(['PUBLIC_UNIVERSITY','SCIENTIFIC_INSTITUTION'])]

In [8]:
decision_types_pl_eng = {
    'Brak informacji o decyzji': 'No information on decision',
    'Patent udzielony': 'Patent granted',
    'Patent wygasł po okresie, na jaki został udzielony': 'Patent expired after the period for which it was granted',
    'Decyzja o wygaśnięciu decyzji warunkowej': 'Decision on the expiration of a conditional decision',
    'Konwersja zgłoszenia wynalazku na zgłoszenie wzoru użytkowego': 'Conversion of a invention application to a utility model application',
    'Decyzja o unieważnieniu patentu /wygaśnięciu patentu (materiał biologiczny)': 'Decision on the invalidation/expiration of a patent (biological material)',
    'Decyzja o wygaśnięciu patentu (rezygnacja uprawnionego z ochrony)': 'Decision on patent expiration (waiver of protection by the entitled party)',
    'Decyzja o wygaśnięciu patentu (brak opłaty za ochronę)': 'Decision on patent expiration (no protection fee)',
    'Decyzja o umorzeniu postępowania': 'Decision to discontinue proceedings',
    'Decyzja o odmowie udzielenia patentu': 'Decision to refuse to grant a patent'
}


decision_types_not_influencing =[
    'No information on decision',
    'Decision to discontinue proceedings',
    'Decision to refuse to grant a patent',
    'Decision on the expiration of a conditional decision'
]

institution_kind_mapping = {
    'PUBLIC_UNIVERSITY': 'Public university',
    'SCIENTIFIC_INSTITUTION': 'Scientific institution'
}

# The rest is valid for decision types influencing
decision_types_influencing = [ v for k,v in decision_types_pl_eng.items() if v not in decision_types_not_influencing]


# Replace nan in decision
patents_inst['decision'] = patents_inst['decision'].fillna('Brak informacji o decyzji')
# Translate decision types
patents_inst['decision'] = patents_inst['decision'].map(decision_types_pl_eng)
patents_inst['kind'] = patents_inst['kind'].map(institution_kind_mapping)

In [9]:
def grouper(x):
    if x < 1990:
        return '-1989'
    elif x < 1995:
        return '1990-1994'
    elif x < 2000:
        return '1995-1999'
    elif x < 2005:
        return '2000-2004'
    elif x < 2010:
        return '2005-2009'
    elif x < 2015:
        return '2010-2014'
    elif x < 2020:
        return '2015-2019'
    else:
        return '2020-'



patents_inst['patent_application_year'] = patents_inst['application_date'].str.slice(stop=4)
patents_inst = patents_inst[~patents_inst['patent_application_year'].isnull()]
patents_inst['patent_application_year'] = patents_inst['patent_application_year'].astype(int)
patents_inst['year_group'] = patents_inst['patent_application_year'].apply(grouper)

In [10]:
patents_n = patents_inst.shape[0]

year_min = patents_inst['patent_application_year'].min()
year_max = patents_inst['patent_application_year'].max()

institutions_n = len(patents_inst['institution_id'].unique()) 
subtitle = f'Patent applications: {year_min}-{year_max}, Patents: N={patents_n}, Institutions: N={institutions_n}'

In [11]:
patents_inst.shape

(35592, 107)

## Patents database

In [12]:

# Plot multiple line chart using plotly express showing number of patents by year and decision and kind
patents_by_year = patents_inst.groupby(['patent_application_year','decision', 'kind'])['patent_id'].count().reset_index().rename(columns={'patent_id':'count'})

# Order by year and decision
patents_by_year_k = patents_by_year.groupby(['patent_application_year', 'kind'])['count'].sum().reset_index()


fig = px.line(patents_by_year_k, x="patent_application_year", y="count", 
              #width=800, height=600, 
              color="kind", title=f'<b>Number of patent submissions by the PRIs</b><br>{subtitle}',  labels={'year_group':'Years', 'count':'Patent submissions', 'kind':'PRI kind'})
cs_py.plot(fig, filename = 'patents_kind_years', auto_open=False)
fig.show()

In [13]:
fig.write_html("docs/_includes/patents_kind_years.html")

In [14]:
# Plot multiple line chart using plotly express showing number of patents by year and decision and kind
patents_by_year = patents_inst.groupby(['year_group','decision', 'kind'])['patent_id'].count().reset_index().rename(columns={'patent_id':'count'})
intrv_patents = patents_by_year.groupby(['year_group', 'kind'])['count'].sum().reset_index()

fig = px.bar(intrv_patents, x="year_group", y="count", color="kind", title=f'<b>Number of patent submissions by the PRIs</b><br>{subtitle}', barmode='stack', 
             #width=800, height=600, 
             color_discrete_sequence=px.colors.qualitative.Dark24, labels={'year_group':'Years', 'count':'Patent submissions', 'kind':'PRI kind'})
cs_py.plot(fig, filename = 'patents_kind', auto_open=False)
fig.show()

In [15]:
fig.write_html("docs/_includes/patents_kind_years_groupped.html")

In [16]:
# Plot multiple line chart using plotly express showing number of patents by year and decision and kind
patents_by_year = patents_inst.groupby(['year_group','decision'])['patent_id'].count().reset_index().rename(columns={'patent_id':'count'})
intrv_patents = patents_by_year.groupby(['year_group', 'decision'])['count'].sum().reset_index()


fig = px.bar(intrv_patents, x="year_group", y="count", color="decision", title=f'<b>Status of patent submission decisions in relation to the patent submission date</b><br>{subtitle}', barmode='stack', 
             #width=1400, height=600, 
             color_discrete_sequence=px.colors.qualitative.Dark24, labels={'year_group':'Years', 'count':'Patents', 'decision':'Decision'})


cs_py.plot(fig, filename = 'patents_decisions', auto_open=False)

fig.show()

In [17]:
fig.write_html("docs/_includes/patents_decisions.html")

## Patents influence

### Filter the patenting activity based on decision types and submission year

In [18]:
# Filter patents by year and decision
patents_inst = patents_inst[(patents_inst['patent_application_year'] >= 2010) & (patents_inst['patent_application_year'] <= 2019)]
patents_inst = patents_inst[patents_inst['decision'].isin(decision_types_influencing)]

patents_n = patents_inst.shape[0]

year_min = patents_inst['patent_application_year'].min()
year_max = patents_inst['patent_application_year'].max()

institutions_n = len(patents_inst['institution_id'].unique()) 
subtitle = f'Patent applications: {year_min}-{year_max}, Patents: N={patents_n}, Institutions: N={institutions_n}'

### Visualizations

In [19]:
df_nace_kind = patents_nace.copy()#patents_nace.sum(level=0, axis=1).copy()
df_nace_kind['kind'] = patents_inst['kind']
#df_nace_kind['patent_application_year'] = patents_inst['patent_application_year']
df_nace_kind['year_group'] = patents_inst['patent_application_year'].apply(grouper)
df_nace_kind['institution'] = patents_inst['institution']



In [20]:
cl_data = patents_nace.copy()
cl_data['institution'] = patents_inst['institution']
cl_data['kind'] = patents_inst['kind']
cl_data = cl_data.groupby(['kind','institution']).sum().reset_index()
cl_data['size'] = cl_data.sum(axis=1)


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



In [21]:
# Perform dimensionality reduction using NMF and clustering using kmeans numerical columns and graph the clusters

# Use DBSCAN  as clustering method with cosine similarity as distance

def cluster_and_plot(df, n_components=2, random_state=42):


    def get_color_palette(clusters):
        base_palette = cc.circle_mgbm_67_c31
        color_i = (np.array(clusters/clusters.max())*(len(base_palette)-1)).astype(int)
        # Get the colors from the colorcet palette based on color indexes
        selected_colors = [base_palette[i] for i in color_i]
        selected_colors_out = {}
        
        for i in range(len(clusters)):
               r,g,b = selected_colors[i]
               selected_colors_out[str(clusters[i])] = '#%02x%02x%02x' % (int(r*255), int(g*255), int(b*255))
             
        return selected_colors_out

    

    # Perform dimensionality reduction using NMF
    nmf = KernelPCA(n_components=n_components, random_state=random_state, kernel='cosine')
    nmf.fit(df.drop(['institution', 'size'], axis=1))
    nmf_features = nmf.transform(df.drop(['institution', 'size'], axis=1))
    nmf_features = pd.DataFrame(nmf_features, columns=['nmf_feature_'+str(i) for i in range(n_components)])
    nmf_features['institution'] = df['institution']
    nmf_features['size'] = df['size'].astype(int)
    
    nmf_features['disp_size'] = nmf_features['size'] 
    # Perform clustering using kmeans
    #clusters = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = OPTICS(metric='cosine', eps=1, min_samples = 3)
    
    clusters.fit(df.drop(['institution', 'size'], axis=1))
    nmf_features['cluster'] = clusters.labels_
    nmf_features['cluster']= nmf_features['cluster'] - nmf_features['cluster'].min()

    color_palette = get_color_palette(nmf_features['cluster'].unique())
    nmf_features['cluster']= nmf_features['cluster'].astype(str)
    nmf_features['cluster_color'] = nmf_features['cluster'].map(lambda x: color_palette[x])
    
    
    fig = px.scatter(nmf_features, x="nmf_feature_0", y="nmf_feature_1", 
                     color='cluster', 
                     color_discrete_map= color_palette, 
                     size='size',
                     hover_name="institution", 
                     title=f'<b>Projection of PRIs based on the impact of patenting activity</b><br>{subtitle}', 
                     size_max=80,
                    
                     height=800,
                     #width=800, height=600,  
                     labels={'nmf_feature_0':'C1','nmf_feature_1':'C2',  'cluster':'Cluster', 'size':'Submitted patents'})
    
    # Set the labels for the sunburst segments
    #fig.update_traces(hovertemplate='<b>%{text}</b><br>Liczba patentów: %{size}', textfont=dict(size=20, color='#000000'))
    
    # Hide legend
    fig.update_layout(showlegend=False)
    
    return fig,nmf_features



#
cl_data_pu = cl_data[cl_data['size'] > 0]


del cl_data_pu['kind']
cp_fig,nmf_features = cluster_and_plot(cl_data_pu, n_components=2, random_state=42)


cp_fig.write_html("docs/_includes/institute_nace_impact_clusters.html")

cp_fig


dropping on a non-lexsorted multi-index without a level parameter may impact performance.


Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.


dropping on a non-lexsorted multi-index without a level parameter may impact performance.


Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.


dropping on a non-lexsorted multi-index without a level parameter may impact performance.


Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.


divide by zero encountered in true_divide



In [22]:
# Assign clusters to institutes
patents_inst_clustered = patents_inst.merge(nmf_features[['institution','cluster', 'cluster_color']], left_on='name', right_on='institution', how='left')

# Find three most active applicants for each cluster 
def get_top_applicants(df, top_n=3):
    top_applicants = df.groupby(['cluster', 'institution_y']).size().reset_index(name='count').sort_values(['cluster', 'count'], ascending=False).groupby('cluster').head(top_n)
    # Assign rank 
    top_applicants['rank'] = top_applicants.groupby('cluster')['count'].rank(ascending=False)
    return top_applicants

top_applicants_cluster = get_top_applicants(patents_inst_clustered, top_n=3)[['cluster','rank','institution_y','count']]
top_applicants_cluster['cluster'] = top_applicants_cluster['cluster'].astype(int)
top_applicants_cluster['rank'] = top_applicants_cluster['rank'].astype(int)
top_applicants_cluster.sort_values(['cluster','rank'], inplace=True)
top_applicants_cluster.columns = ['Cluster','Rank','Institution','Patent Applications']

top_applicants_cluster.to_html('docs/_includes/cluster_leaders.html', index=False)


In [23]:
sector_codes = [c1 for c1,c2 in df_nace_kind.columns.values if len(c1) == 1]

#df_nace_kind.T.groupby(level=0).sum().T

In [24]:
df_nace_kind.columns = [c1+c2 for c1,c2 in df_nace_kind.columns.values]

In [25]:
# Group df_nace_kind by fist level of columns and sum
df_nace_kind_sum = df_nace_kind.groupby(['year_group', 'kind']).sum().reset_index()


In [26]:
patent_sector = df_nace_kind.melt(id_vars=['kind', 'year_group', 'institution'], value_vars=df_nace_kind.columns[:-3].values.tolist(), var_name='nace', value_name='percent')


In [27]:
patent_sector['nace_code'] = patent_sector['nace'].str[:1]
patent_sector['nace'] = patent_sector['nace'].str[1:]


In [28]:

# Define top n instutions and other group by the sum of percent in given nace category 
def top_n_institutions(plot_data, n=10):
        
    l_df = plot_data.groupby(['nace_code', 'nace'])['percent'].nlargest(n).reset_index()
    l_df['institution'] = l_df['level_2'].apply(lambda i: plot_data.iloc[i]['institution'])
    del l_df['level_2']

    l_df_all = plot_data.groupby(['nace_code', 'nace'])['percent'].sum().reset_index()
    l_df_all['institution'] = 'Other'

    l_df_all['percent'] =  l_df_all['percent']  - l_df.groupby(['nace_code', 'nace']).sum().reset_index()['percent']

    return pd.concat([l_df, l_df_all], axis=0)


# Create plotly sunburst chart with nace_code as first level of the pie  and nace as outer layer
def plot_pie(plot_data, title):

    parent_df = plot_data.groupby('nace_code').sum().reset_index()
    parent_df['percent'] = parent_df['percent'] / parent_df['percent'].sum() * 100
    
    parent_ids = parent_df['nace_code'].tolist()
    parent_labels = parent_df['nace_code'].tolist()
    parent_values = parent_df['percent'].tolist()
    
    child_df = plot_data.groupby(['nace_code','nace']).sum().reset_index()
    child_df['percent'] = child_df['percent'] / child_df['percent'].sum() * 100
    
    child_ids = child_df['nace'].tolist()
    child_labels = child_df['nace'].tolist()
    child_parents = child_df['nace_code'].tolist()
    child_values = child_df['percent'].tolist()
    
    
    
    child_child_df = plot_data.groupby(['nace_code', 'nace', 'institution']).sum().reset_index()
    
    child_child_df['percent'] = child_child_df['percent'] / child_child_df['percent'].sum() * 100
    
    child_child_ids = [ n+'/'+i for n,i in zip( child_child_df['nace'].tolist(), child_child_df['institution'].tolist() ) ]
    child_child_labels = child_child_df['institution'].tolist()
    child_child_parents = child_child_df['nace'].tolist()
    child_child_values = child_child_df['percent'].tolist()
    
    
    
    # Get ids
    ids = parent_ids + child_ids + child_child_ids
    values = parent_values + child_values  + child_child_values
    
    # Get parents
    parents = [''] * len(parent_ids) + child_parents + child_child_parents
    
    # ids 
    labels = parent_labels + child_labels  + child_child_labels
    
    
    def do_mapping(id):
        if id in code_label_mapping:
            return code_label_mapping[id]
        return id 
    
    
    
    customdata = [do_mapping(id) for id in ids ] 
    
    
    
    sunburst = go.Sunburst(
                      labels=labels,
                      values=values,
                      branchvalues='total',
                      customdata=customdata,
                      parents=parents,
                      ids=ids,
                      marker={'colors': ['nace_code']})

    # Create a Figure object and add the sunburst to it
    fig = go.Figure(sunburst)

    # Set the size of the figure
    fig.update_layout(title=title, height=800)

    # Set the labels for the sunburst segments
    fig.update_traces(hovertemplate='<b>%{customdata}</b><br>Percent: %{value:.2f}%', textfont=dict(size=20, color='#000000'))

    # Set the colors for the sunburst segments
    #fig.marker.colors = px.colors.qualitative.Dark24

    return fig



plot_data = patent_sector.groupby(['kind', 'nace_code', 'nace', 'institution'])['percent'].sum().reset_index()

plot_data = top_n_institutions(plot_data, n=15)

pie_f = plot_pie(plot_data, f'<b>The influence of individual PRIs on the specific sectors of the economy</b> <br>{subtitle}')

#cs_py.plot(pie_f, filename = 'institute_nace_impact_pie', auto_open=False)


pie_f

In [29]:
pie_f.write_html("docs/_includes/institute_nace_impact_pie.html")

In [30]:
patent_sector

Unnamed: 0,kind,year_group,institution,nace,percent,nace_code
0,,,,01,0.003045,A
1,,,,01,0.001852,A
2,,,,01,0.001269,A
3,,,,01,0.008689,A
4,,,,01,0.003045,A
...,...,...,...,...,...,...
3134467,Scientific institution,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134468,Scientific institution,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134469,Scientific institution,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134470,Scientific institution,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U


In [31]:
# Plot top sectors by percentage weight
def plot_top_sectors(top_sectors):
    
    fig = px.bar(top_sectors, y="nace", x="percent", color="nace", title=f'<b>Sectors with the biggest patenting activity</b> <br>{subtitle}', labels={'nace':'Sector', 'percent':'Expected patents'}, orientation='h',   
                 height=800, #width=1400
                 )
    fig.update_xaxes(type='log')
    fig.update_layout(showlegend=False)
    # Set the labels for the sunburst segments
    
    return fig 
    
top_sectors = patent_sector.groupby(['nace'])['percent'].sum().reset_index().sort_values(['percent','nace'], ascending=[False, True]).groupby(['nace', 'percent']).head(5)
top_sectors['nace'] = top_sectors['nace'].map(code_label_mapping)

fig_top_sectors = plot_top_sectors(top_sectors.head(25))

#cs_py.plot(fig_top_sectors, filename = 'fig_top_sectors', auto_open=False)

#fig_top_sectors

In [32]:
fig_top_sectors

In [33]:
fig_top_sectors.write_html("docs/_includes/top_sectors_activity.html")

In [34]:
# Get top instiuions by each nace sector based on sum of percentage weight
def top_inst_by_sector(patent_sector, n=3):

    pds = []
    for nace in patent_sector['nace'].unique():
        top_nace = patent_sector[patent_sector['nace'] == nace].groupby(['institution'])['percent'].sum().reset_index().sort_values(['percent'], ascending=False).head(n)
        top_nace['rank'] = range(n)
        top_nace['rank'] = top_nace['rank']+1
        top_nace['nace'] = nace
        
        pds.append(top_nace)
        
    return pd.concat(pds)
        
        
    
psl_df =patent_sector#[patent_sector['year_group']=='2015-2019']    
pu_sector_leaders = top_inst_by_sector(psl_df[psl_df['kind']==institution_kind_mapping['PUBLIC_UNIVERSITY']], n=3)
si_sector_leaders = top_inst_by_sector(psl_df[psl_df['kind']==institution_kind_mapping['SCIENTIFIC_INSTITUTION']], n=3)

psl_merge_df = pd.merge(pu_sector_leaders, on=['nace', 'rank'], right=si_sector_leaders, how='left', suffixes=('_pu', '_si'))
psl_merge_df['nace'] = psl_merge_df['nace'].map(code_label_mapping)
psl_merge_df.index = pd.MultiIndex.from_frame(psl_merge_df[['nace', 'rank']])
del psl_merge_df['rank']
del psl_merge_df['nace']


# Round to 4 decimal places
psl_merge_df = psl_merge_df.round(3)



psl_merge_df.rename(columns={'institution_pu':'Public university', 'percent_pu':'Expected patents', 'institution_si':'Scientific institution', 'percent_si':'Expected patents'}, inplace=True)

psl_merge_df.to_html('docs/_includes/sector_leaders.html')

In [35]:
psl_merge_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Public university,Expected patents,Scientific institution,Expected patents
nace,rank,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"A01 - Crop and animal production, hunting and related service activities",1,Politechnika Lubelska,13.782,Sieć Badawcza Łukasiewicz - Instytut Metali Ni...,4.490
"A01 - Crop and animal production, hunting and related service activities",2,Akademia Górniczo-Hutnicza im. Stanisława Stas...,6.027,Sieć Badawcza Łukasiewicz - Instytut Metalurgi...,1.532
"A01 - Crop and animal production, hunting and related service activities",3,Politechnika Łódzka,3.780,Sieć Badawcza Łukasiewicz - Przemysłowy Instyt...,0.764
A02 - Forestry and logging,1,Politechnika Lubelska,0.131,Centralny Instytut Ochrony Pracy - Państwowy I...,0.024
A02 - Forestry and logging,2,Zachodniopomorski Uniwersytet Technologiczny w...,0.045,Instytut Technologii Bezpieczeństwa MORATEX,0.020
...,...,...,...,...,...
T98 - Undifferentiated goods- and services-producing activities of private households for own use,2,Politechnika Poznańska,0.005,Instytut Ochrony Roślin - Państwowy Instytut B...,0.006
T98 - Undifferentiated goods- and services-producing activities of private households for own use,3,Szkoła Główna Gospodarstwa Wiejskiego w Warszawie,0.005,Instytut Zootechniki - Państwowy Instytut Bada...,0.004
U99 - Activities of extraterritorial organisations and bodies,1,Uniwersytet Przyrodniczy we Wrocławiu,0.043,Instytut Technologiczno-Przyrodniczy - Państwo...,0.006
U99 - Activities of extraterritorial organisations and bodies,2,Politechnika Wrocławska,0.028,Instytut Ochrony Roślin - Państwowy Instytut B...,0.004


In [36]:
# Create  stacked area chart showing percent of patents by nace through years using plotly go library
def plot_stacked_area(dfs, x, y, color, title, stitles):

    #fig = go.Figure()
    fig = make_subplots(rows=1, cols=len(dfs),subplot_titles=stitles)
    
    
    for i in range(len(dfs)):
        df = dfs[i]
        showlegend = i == 0
        unique_colors = df[color].unique()
        palette = n_colors('rgb(0, 0, 255)', 'rgb(255, 0, 0)', len(unique_colors), colortype = 'rgb')
        
        for c_i in range(len(unique_colors)):
            color_val = unique_colors[c_i]
            df_c = df[df[color]==color_val]
            line_color = palette[c_i]
                
            fig.add_trace(go.Scatter(x=df_c[x], y=df_c[y], name=color_val, mode='lines', stackgroup='one', legendgroup=color_val,line=dict(color=line_color), groupnorm='percent',  showlegend=showlegend), col=i+1, row=1)
    #fig.update_layout(title=title)
            
    fig.update_layout(
        height=1200,
        showlegend=True,
        legend_orientation="h",
        legend={'traceorder':'normal'},
        title=title)
    
    
    fig.update_xaxes(type='category')
    fig.update_yaxes(type='linear',
            range=[1, 100],
            ticksuffix='%')
    
    # Order legend by value 

    return fig
    
    
    
    
pu_si_sector = patent_sector.groupby(['year_group', 'nace_code'])['percent'].sum().reset_index()
pu_si_sector['nace_code'] = pu_si_sector['nace_code'].map(code_label_mapping)

prct_imp_years = plot_stacked_area([pu_si_sector], 'year_group', 'percent', 'nace_code', f'<b>Trend in the patenting activity and its relative impact on the NACE sectors</b><br>{subtitle}', stitles=['Influenced NACE sectors'])

#cs_py.plot(prct_imp_years, filename = 'prct_impact_years', auto_open=False)


In [37]:
prct_imp_years

In [38]:
prct_imp_years.write_html("docs/_includes/prct_impact_years.html")

In [39]:
ps_gd = patent_sector.groupby(['kind','institution','nace','nace_code'])['percent'].sum().reset_index()

ps_gd2 = ps_gd.groupby(['kind','institution','nace_code'])['percent'].sum().reset_index().sort_values(['percent'], ascending=False)
ps_gd2['nace'] = ps_gd2['nace_code']


ps_gd3 = ps_gd.groupby(['kind','institution'])['percent'].sum().reset_index().sort_values(['percent'], ascending=False)
ps_gd3['nace'] = 'Total'
ps_gd3['nace_code'] = 'Total'

patent_sector_all = pd.concat([ps_gd,ps_gd2,ps_gd3])



In [40]:
patents_sector_influence = patent_sector_all.pivot_table(index='institution', columns='nace', values='percent').fillna(0)

In [41]:
patents_sector_influence

nace,01,02,03,05,06,07,08,09,10,11,...,M,N,O,P,Q,R,S,T,Total,U
institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie,6.026797,0.033635,0.001753,0.121217,4.777648,0.300472,1.462834,23.753465,2.048378,0.862307,...,57.693332,11.994557,1.444748,14.120479,1.209519,0.248316,2.324854,0.023371,810.0,0.007890
Akademia Kaliska im. Prezydenta Stanisława Wojciechowskiego,0.001852,0.000093,0.000000,0.000000,0.006343,0.001127,0.000309,0.041330,0.081311,0.000648,...,1.014051,0.189344,0.003858,0.061033,0.004753,0.013542,0.177415,0.000000,10.0,0.000000
Akademia Marynarki Wojennej im. Bohaterów Westerplatte,0.015312,0.000307,0.000000,0.000094,0.040073,0.002090,0.001391,0.150538,0.012623,0.003110,...,0.542500,0.126991,0.018431,0.115382,0.016011,0.001283,0.029519,0.000040,8.0,0.000000
Akademia Sztuk Pięknych im. Władysława Strzemińskiego w Łodzi,0.036909,0.004614,0.000000,0.000000,0.001265,0.001265,0.000000,0.008852,0.004559,0.000000,...,0.233204,0.054819,0.002529,0.032995,0.015105,0.000760,0.092103,0.000000,4.0,0.000000
Akademia Sztuk Pięknych w Gdańsku,0.008033,0.000871,0.000000,0.000030,0.000401,0.000156,0.010707,0.004758,0.003384,0.000156,...,0.116660,0.040613,0.000844,0.006433,0.000015,0.000871,0.007443,0.000000,2.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wojskowy Instytut Techniki Inżynieryjnej im. profesora Józefa Kosackiego,0.073015,0.000457,0.000000,0.004424,0.228805,0.053943,0.023314,0.441115,0.275520,0.083405,...,6.415070,0.880546,0.103259,0.719594,0.078868,0.033212,0.489864,0.000161,61.0,0.000632
Wojskowy Instytut Techniki Pancernej i Samochodowej,0.001790,0.000000,0.000000,0.000135,0.000596,0.000199,0.000135,0.000802,0.009214,0.001022,...,0.163374,0.035609,0.000288,0.004534,0.000032,0.001520,0.020268,0.000000,2.0,0.000000
Zachodniopomorski Uniwersytet Technologiczny w Szczecinie,2.447629,0.044942,0.005584,0.125801,6.233069,0.344071,3.019904,7.826034,10.048709,3.470519,...,64.385701,12.513201,2.812551,22.201493,2.813160,0.390135,3.725601,0.002940,787.0,0.014595
Śląski Uniwersytet Medyczny w Katowicach,0.115945,0.001462,0.001098,0.001678,0.092973,0.002925,0.045367,0.116445,0.404297,0.149044,...,3.127741,0.468076,0.221260,1.319743,0.452033,0.025320,0.286772,0.000627,38.0,0.000534


In [42]:
patent_sector

Unnamed: 0,kind,year_group,institution,nace,percent,nace_code
0,,,,01,0.003045,A
1,,,,01,0.001852,A
2,,,,01,0.001269,A
3,,,,01,0.008689,A
4,,,,01,0.003045,A
...,...,...,...,...,...,...
3134467,Scientific institution,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134468,Scientific institution,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134469,Scientific institution,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134470,Scientific institution,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U


In [43]:
# Group patents_inst by institution id 


patents_inst_clustered['lat'] = patents_inst_clustered['point'].map(lambda x: str(x[1:-6]).split(', ')[0]).astype(float)
patents_inst_clustered['lon'] = patents_inst_clustered['point'].map(lambda x: str(x[1:-6]).split(', ')[1]).astype(float)

plot_data = patents_inst_clustered.groupby(['cluster','cluster_color','institution_id','name','lat','lon'])['id'].count().reset_index()
plot_data.rename(columns={'id':'count'}, inplace=True)


plot_data = plot_data.merge(patents_sector_influence, left_on='name', right_index=True, how='left', suffixes=('', ''))

for c in plot_data.columns:
    if c not in ['cluster', 'cluster_color', 'institution_id','name','lat','lon','count']:
        # Calculate decil 
        #plot_data[c]= plot_data[c].rank(pct=True) * 100
        #plot_data[c]= np.round(plot_data[c].rank(pct=True), 1) * 100
        
        
        # Perform the Box-Cox transformation
        #transformed_data, _ = stats.boxcox(plot_data[c])
        # Box-Cox transformation
        #plot_data[c]= transformed_data
        
        #plot_data[c] =  plot_data[c]  -  plot_data[c].min()
        
        plot_data[c] = plot_data[c].astype(float) / plot_data[c].max() * 100 +1
        if c in code_label_mapping:
            plot_data.rename(columns={c:code_label_mapping[c]}, inplace=True)
        




In [44]:
plot_data[['name',code_label_mapping['B']]].sort_values(code_label_mapping['B'], ascending=False)

Unnamed: 0,name,B - MINING AND QUARRYING
92,Akademia Górniczo-Hutnicza im. Stanisława Stas...,101.000000
58,Politechnika Śląska,98.187590
28,Instytut Techniki Górniczej KOMAG,90.132015
21,Główny Instytut Górnictwa,69.013046
135,Zachodniopomorski Uniwersytet Technologiczny w...,58.696901
...,...,...
76,Lotnicza Akademia Wojskowa,1.010783
145,Uniwersytet Szczeciński,1.009895
143,Państwowa Uczelnia Zawodowa im. Ignacego Mości...,1.009895
50,Instytut Fizjologii i Patologii Słuchu,1.007017


In [45]:
# Plot points from df on map of Poland using graph objects showing location of institutions with size of point based on number of patents
def plot_patent_map_go(df, lat, lon, color, title, columns=None):
    fig = go.Figure()
    
    
    if columns is None:
        columns =  [ col for col in df.columns if col not in ['cluster','cluster_color','institution_id','name','lat','lon', 'count', 'Total']]
        columns = [ 'Total', *sorted(columns)]
    
    
    def create_layout_button(column):
        vis = [c == column for c in columns]
        return dict(label = column,
                    method = 'update',
                    args = [{'visible': vis,
                             'title': column,
                             'showlegend': True}])

    def create_trace(column):
        return go.Scattermapbox(
                    lat=df[lat],
                    lon=df[lon],
                    mode='markers',
                    marker=go.scattermapbox.Marker(
                        size=df[column],
                        autocolorscale=False,
                        #colorscale='Phase',
                        allowoverlap=True,
                        color=df[color]
                    ),

                    text=df['name'],
                    name = column
                    
                )


    buttons = []
    
    for column in columns:
        fig.add_trace(create_trace(column))
        buttons.append(create_layout_button(column))
            

    #fig.update_traces(cluster=dict(enabled=True))
    fig.update_layout(
        title=title,
        hovermode='closest',
        mapbox=go.layout.Mapbox(
            accesstoken=open("auth/.mapbox_token").read(),
            bearing=0,
            center=go.layout.mapbox.Center(
                lat=52.3,
                lon=19
            ),
            pitch=0,
            zoom=5.5
        ),
        autosize = True,
        height=800,
        #width=800
    )
    
    # Hide legend
    fig.update_layout(showlegend=False)
    
    # Hide all traces
    fig.update_traces(visible=False)
    
    # Show trace for Total
    fig.data[0].visible = True
    
    fig.update_layout(
        updatemenus=[
            go.layout.Updatemenu(
            active=0,
            buttons=buttons,
            #yanchor="bottom",
            name='NACE',
            xanchor="right",
            x=1
            )
            
        ])
   

    return fig


In [46]:
plot_data['cluster_color'].unique()

array(['#d99084', '#d29479', '#44b2b9', '#43b0c7', '#54add0', '#6ba9d6',
       '#83a4d8', '#999fd7', '#af99d0', '#c094c4', '#cc91b5', '#d58fa6',
       '#c79a71', '#da8e96', '#d99085', '#ba9f6d', '#aba46c', '#9aa970',
       '#88ad7a', '#76af89', '#65b199', '#53b2a9'], dtype=object)

In [47]:

inst_influence_map = plot_patent_map_go(plot_data, 'lat', 'lon',  'cluster_color', f'<b>Map of influence of PRIs on the Polish economy based on the patenting activity<b> <br>{subtitle}  <br><sup><b>Select the proper NACE sector using the dropdown menu on the top right</b></sup>')

In [48]:
inst_influence_map

In [49]:
inst_influence_map.write_html("docs/_includes/inst_influence_map.html")