In [1]:
import pandas as pd
import chart_studio
import chart_studio.plotly as cs_py
import json

def load_json(json_file):
    with open(json_file) as f:
        data = json.load(f)
    return data

# load chart_studio credentials from json
chart_studio.tools.set_credentials_file(**load_json('auth/.chart_studio.json'))

In [2]:
# Read patents data
patents = pd.read_excel('out\inst_patents.xlsx')
insts = pd.read_excel('out\polon_discp.xlsx')

## Typy sektorów wg. klasyfikacji NACE

In [3]:
nace_df = pd.read_excel('concordance\\nace.xlsx')

nace_code_labels  = nace_df[(nace_df['Code'].str.len() <=2) ].to_dict(orient='records')


code_label_mapping={}

for d in nace_code_labels:
    if d['Code'] == d['Sector']:
        code_label_mapping[d['Code']] = d['Code'] + ' - ' +d['Label']
    else:
        code_label_mapping[d['Code']] =  d['Sector'] +d['Code'] + ' - ' +d['Label']

In [4]:
patents_nace = pd.read_excel('out\patents_nace.xlsx', header=[0,1], index_col=[0])

In [5]:
# Join patents and insts on institution name
patents_inst = patents.merge(insts, left_on='institution_id', right_on='uid', how='left')

In [6]:
patents_inst.shape

(35619, 105)

In [7]:
def grouper(x):
    if x < 1990:
        return '-1989'
    elif x < 1995:
        return '1990-1994'
    elif x < 2000:
        return '1995-1999'
    elif x < 2005:
        return '2000-2004'
    elif x < 2010:
        return '2005-2009'
    elif x < 2015:
        return '2010-2014'
    elif x < 2020:
        return '2015-2019'
    else:
        return '2020-'



patents_inst['patent_application_year'] = patents_inst['application_date'].str.slice(stop=4)
patents_inst = patents_inst[~patents_inst['patent_application_year'].isnull()]
patents_inst['patent_application_year'] = patents_inst['patent_application_year'].astype(int)
patents_inst['year_group'] = patents_inst['patent_application_year'].apply(grouper)

# Replace nan in decision
patents_inst['decision'] = patents_inst['decision'].fillna('Brak informacji o decyzji')

In [8]:
import plotly.express as px

# Plot multiple line chart using plotly express showing number of patents by year and decision and kind
patents_by_year = patents_inst.groupby(['patent_application_year','decision', 'kind'])['patent_id'].count().reset_index().rename(columns={'patent_id':'count'})

# Order by year and decision
patents_by_year_k = patents_by_year.groupby(['patent_application_year', 'kind'])['count'].sum().reset_index()


fig = px.line(patents_by_year_k, x="patent_application_year", y="count", 
              #width=800, height=600, 
              color="kind", title='Liczba patentów wg typu placówki naukowej')
cs_py.plot(fig, filename = 'patents_kind_years', auto_open=False)
fig.show()

In [9]:
fig.write_html("docs/patents_kind_years.html")

In [10]:
# Plot multiple line chart using plotly express showing number of patents by year and decision and kind
patents_by_year = patents_inst.groupby(['year_group','decision', 'kind'])['patent_id'].count().reset_index().rename(columns={'patent_id':'count'})
intrv_patents = patents_by_year.groupby(['year_group', 'kind'])['count'].sum().reset_index()

fig = px.bar(intrv_patents, x="year_group", y="count", color="kind", title='Liczba patentów wg typu placówki naukowej', barmode='stack', 
             #width=800, height=600, 
             color_discrete_sequence=px.colors.qualitative.Dark24, labels={'year_group':'Grupa lat', 'count':'Liczba patentów', 'kind':'Rodzaj'})
cs_py.plot(fig, filename = 'patents_kind', auto_open=False)
fig.show()

In [11]:
fig.write_html("docs/patents_kind.html")

In [12]:
# Plot multiple line chart using plotly express showing number of patents by year and decision and kind
patents_by_year = patents_inst.groupby(['year_group','decision'])['patent_id'].count().reset_index().rename(columns={'patent_id':'count'})
intrv_patents = patents_by_year.groupby(['year_group', 'decision'])['count'].sum().reset_index()


fig = px.bar(intrv_patents, x="year_group", y="count", color="decision", title='Liczba patentów wg decyzji w latach', barmode='stack', 
             #width=1400, height=600, 
             color_discrete_sequence=px.colors.qualitative.Dark24, labels={'year_group':'Grupa lat', 'count':'Liczba patentów', 'decision':'Decyzja'})


cs_py.plot(fig, filename = 'patents_decisions', auto_open=False)

fig.show()

In [13]:
fig.write_html("docs/patents_decisions.html")

In [14]:
df_nace_kind = patents_nace.copy()#patents_nace.sum(level=0, axis=1).copy()
df_nace_kind['kind'] = patents_inst['kind']
#df_nace_kind['patent_application_year'] = patents_inst['patent_application_year']
df_nace_kind['year_group'] = patents_inst['patent_application_year'].apply(grouper)
df_nace_kind['institution'] = patents_inst['institution']



In [15]:
cl_data = patents_nace.copy()
cl_data['institution'] = patents_inst['institution']
cl_data['kind'] = patents_inst['kind']
cl_data = cl_data.groupby(['kind','institution']).sum().reset_index()
cl_data['size'] = cl_data.sum(axis=1)


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



In [16]:
# Perform dimensionality reduction using NMF and clustering using kmeans numerical columns and graph the clusters

# Use DBSCAN  as clustering method with cosine similarity as distance

def cluster_and_plot(df, n_components=2, random_state=42):
    from sklearn.decomposition import NMF, PCA, KernelPCA
    #from sklearn.cluster import KMeans
    from sklearn.cluster  import OPTICS 
    import plotly.express as px
    import plotly.graph_objects as go
    import numpy as np
    import pandas as pd
    
    

    # Perform dimensionality reduction using NMF
    nmf = KernelPCA(n_components=n_components, random_state=random_state, kernel='cosine')
    nmf.fit(df.drop(['institution', 'size'], axis=1))
    nmf_features = nmf.transform(df.drop(['institution', 'size'], axis=1))
    nmf_features = pd.DataFrame(nmf_features, columns=['nmf_feature_'+str(i) for i in range(n_components)])
    nmf_features['institution'] = df['institution']
    nmf_features['size'] = df['size'].astype(int)

    # Perform clustering using kmeans
    #clusters = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = OPTICS(metric='cosine', min_samples = 3)
    
    clusters.fit(df.drop(['institution', 'size'], axis=1))
    nmf_features['cluster'] = clusters.labels_

    
    fig = px.scatter(nmf_features, x="nmf_feature_0", y="nmf_feature_1", color="cluster", size='size', hover_name="institution", title='Mapa placówek badawczych wg wpływu na sektory gospodarki', 
                     #width=800, height=600,  
                     labels={'nmf_feature_0':'C1','nmf_feature_1':'C2',  'cluster':'Klaster', 'size':'Liczba patentów'})

    # Set the labels for the sunburst segments
    #fig.update_traces(hovertemplate='<b>%{text}</b><br>Liczba patentów: %{size}', textfont=dict(size=20, color='#000000'))

    return fig,nmf_features



#
cl_data_pu = cl_data[cl_data['size'] > 0]
#cl_data_pu = cl_data_pu[cl_data_pu['kind']=='PUBLIC_UNIVERSITY']


del cl_data_pu['kind']
cp_fig,nmf_features = cluster_and_plot(cl_data_pu, n_components=2, random_state=42)
cs_py.plot(cp_fig, filename = 'institute_nace_impact_clusters', auto_open=False)


cp_fig



dropping on a non-lexsorted multi-index without a level parameter may impact performance.


Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.


dropping on a non-lexsorted multi-index without a level parameter may impact performance.


Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.


dropping on a non-lexsorted multi-index without a level parameter may impact performance.


Feature names only support names that are all strings. Got feature names with dtypes: ['tuple']. An error will be raised in 1.2.


divide by zero encountered in true_divide



In [17]:
cp_fig.write_html("docs/institute_nace_impact_clusters.html")

In [18]:
nmf_features

Unnamed: 0,nmf_feature_0,nmf_feature_1,institution,size,cluster
0,0.046720,0.558417,Polsko-Japońska Akademia Technik Komputerowych,3,-1
1,-0.434817,0.118639,Wyższa Szkoła Gospodarki w Bydgoszczy,21,-1
2,0.382872,0.385286,Wyższa Szkoła Zawodowa Kosmetyki i Pielęgnacji...,1,-1
3,-0.390650,0.010813,Akademia Górniczo-Hutnicza im. Stanisława Stas...,1129,6
4,-0.150205,0.083406,Akademia Kaliska im. Prezydenta Stanisława Woj...,23,-1
...,...,...,...,...,...
210,0.492614,0.162855,Wojskowy Instytut Medyczny - Państwowy Instytu...,4,-1
211,-0.247800,0.231626,Wojskowy Instytut Techniczny Uzbrojenia,183,-1
212,-0.238013,0.076180,Wojskowy Instytut Techniki Inżynieryjnej im. p...,84,-1
213,-0.482910,-0.031081,Wojskowy Instytut Techniki Pancernej i Samocho...,11,-1


In [19]:
# Assign clusters to institutes
patents_inst_clustered = patents_inst.merge(nmf_features[['institution','cluster']], left_on='name', right_on='institution', how='left')

In [20]:
patents_inst

Unnamed: 0.1,Unnamed: 0,id,patent_id,title,decision,applicant,application_date,ipc_classification,ipc_classification_secondary,date_from,...,fos_dziedzina nauk medycznych i nauk o zdrowiu,fos_dziedzina nauk rolniczych,fos_dziedzina nauk społecznych,fos_dziedzina nauk teologicznych,fos_dziedzina nauk ścisłych i przyrodniczych,fos_dziedzina sztuki,location,point,patent_application_year,year_group
0,0,P.428896,,Ekran dźwiękochłonny,Brak informacji o decyzji,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2019-03-06,E01F 8/00,E04B 1/86,2020-10-12,...,1.0,0.0,1.0,0.0,0.0,0.0,"2, Garbary, Okole, Bydgoszcz, województwo kuja...","(53.1272129, 17.9937357, 0.0)",2019,2015-2019
1,1,P.428889,,Sterowanie i konstrukcja stanowiska do badań t...,Brak informacji o decyzji,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2019-02-12,G01N 3/56,G01N 19/00,2021-02-04,...,1.0,0.0,1.0,0.0,0.0,0.0,"2, Garbary, Okole, Bydgoszcz, województwo kuja...","(53.1272129, 17.9937357, 0.0)",2019,2015-2019
2,2,P.428899,,Regulator przekształtnika energoelektroniczneg...,Decyzja o odmowie udzielenia patentu,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2019-02-12,G05F 1/67,H02S 40/30,2022-03-21,...,1.0,0.0,1.0,0.0,0.0,0.0,"2, Garbary, Okole, Bydgoszcz, województwo kuja...","(53.1272129, 17.9937357, 0.0)",2019,2015-2019
3,3,P.433021,,Zintegrowany zespół narzędziowy do obróbki mat...,Brak informacji o decyzji,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2020-02-24,B23Q 15/00,,2021-08-30,...,1.0,0.0,1.0,0.0,0.0,0.0,"2, Garbary, Okole, Bydgoszcz, województwo kuja...","(53.1272129, 17.9937357, 0.0)",2020,2020-
4,4,P.433019,,Modułowy zestaw do ochrony przed hałasem,Brak informacji o decyzji,WYŻSZA SZKOŁA GOSPODARKI W BYDGOSZCZY,2020-02-24,E01F 8/00,E04B 1/86,2021-08-30,...,1.0,0.0,1.0,0.0,0.0,0.0,"2, Garbary, Okole, Bydgoszcz, województwo kuja...","(53.1272129, 17.9937357, 0.0)",2020,2020-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35614,7,P.431099,Pat.241603,Sposób wytwarzania aglomeratu z odpadowych str...,Patent udzielony,"POLITECHNIKA ŁÓDZKA, SIEĆ BADAWCZA ŁUKASIEWICZ...",2019-09-10,B09B 3/27,"['C08L 89/06', 'B01J 2/14']",2022-11-07,...,,,,,,,"Łódź, województwo łódzkie, Polska","(51.7687323, 19.4569911, 0.0)",2019,2015-2019
35615,8,P.431101,Pat.241605,Sposób wytwarzania aglomeratu z odpadowych str...,Patent udzielony,"POLITECHNIKA ŁÓDZKA, SIEĆ BADAWCZA ŁUKASIEWICZ...",2019-09-10,B09B 3/27,"['C08L 89/06', 'B01J 2/14']",2022-11-07,...,,,,,,,"Łódź, województwo łódzkie, Polska","(51.7687323, 19.4569911, 0.0)",2019,2015-2019
35616,9,P.431102,Pat.241606,Sposób wytwarzania aglomeratu z odpadowych str...,Patent udzielony,"POLITECHNIKA ŁÓDZKA, SIEĆ BADAWCZA ŁUKASIEWICZ...",2019-09-10,B09B 3/27,"['C08L 89/06', 'B01J 2/14']",2022-11-07,...,,,,,,,"Łódź, województwo łódzkie, Polska","(51.7687323, 19.4569911, 0.0)",2019,2015-2019
35617,10,P.431100,Pat.241604,Sposób wytwarzania aglomeratu z odpadowych str...,Patent udzielony,"POLITECHNIKA ŁÓDZKA, SIEĆ BADAWCZA ŁUKASIEWICZ...",2019-09-10,B09B 3/27,"['C08L 89/06', 'B01J 2/14']",2022-11-07,...,,,,,,,"Łódź, województwo łódzkie, Polska","(51.7687323, 19.4569911, 0.0)",2019,2015-2019


In [21]:
sector_codes = [c1 for c1,c2 in df_nace_kind.columns.values if len(c1) == 1]

#df_nace_kind.T.groupby(level=0).sum().T

In [22]:
df_nace_kind.columns = [c1+c2 for c1,c2 in df_nace_kind.columns.values]

In [23]:
# Group df_nace_kind by fist level of columns and sum
df_nace_kind_sum = df_nace_kind.groupby(['year_group', 'kind']).sum().reset_index()


In [24]:
patent_sector = df_nace_kind.melt(id_vars=['kind', 'year_group', 'institution'], value_vars=df_nace_kind.columns[:-3].values.tolist(), var_name='nace', value_name='percent')


In [25]:
patent_sector['nace_code'] = patent_sector['nace'].str[:1]
patent_sector['nace'] = patent_sector['nace'].str[1:]


In [26]:
import plotly.graph_objects as go

# Define top n instutions and other group by the sum of percent in given nace category 
def top_n_institutions(plot_data, n=10):
        
    l_df = plot_data.groupby(['nace_code', 'nace'])['percent'].nlargest(n).reset_index()
    l_df['institution'] = l_df['level_2'].apply(lambda i: plot_data.iloc[i]['institution'])
    del l_df['level_2']

    l_df_all = plot_data.groupby(['nace_code', 'nace'])['percent'].sum().reset_index()
    l_df_all['institution'] = 'Other'

    l_df_all['percent'] =  l_df_all['percent']  - l_df.groupby(['nace_code', 'nace']).sum().reset_index()['percent']

    return pd.concat([l_df, l_df_all], axis=0)


# Create plotly sunburst chart with nace_code as first level of the pie  and nace as outer layer
def plot_pie(plot_data, title):

    parent_df = plot_data.groupby('nace_code').sum().reset_index()
    parent_df['percent'] = parent_df['percent'] / parent_df['percent'].sum() * 100
    
    parent_ids = parent_df['nace_code'].tolist()
    parent_labels = parent_df['nace_code'].tolist()
    parent_values = parent_df['percent'].tolist()
    
    child_df = plot_data.groupby(['nace_code','nace']).sum().reset_index()
    child_df['percent'] = child_df['percent'] / child_df['percent'].sum() * 100
    
    child_ids = child_df['nace'].tolist()
    child_labels = child_df['nace'].tolist()
    child_parents = child_df['nace_code'].tolist()
    child_values = child_df['percent'].tolist()
    
    
    
    child_child_df = plot_data.groupby(['nace_code', 'nace', 'institution']).sum().reset_index()
    
    child_child_df['percent'] = child_child_df['percent'] / child_child_df['percent'].sum() * 100
    
    child_child_ids = [ n+'/'+i for n,i in zip( child_child_df['nace'].tolist(), child_child_df['institution'].tolist() ) ]
    child_child_labels = child_child_df['institution'].tolist()
    child_child_parents = child_child_df['nace'].tolist()
    child_child_values = child_child_df['percent'].tolist()
    
    
    
    # Get ids
    ids = parent_ids + child_ids + child_child_ids
    values = parent_values + child_values  + child_child_values
    
    # Get parents
    parents = [''] * len(parent_ids) + child_parents + child_child_parents
    
    # ids 
    labels = parent_labels + child_labels  + child_child_labels
    
    
    def do_mapping(id):
        if id in code_label_mapping:
            return code_label_mapping[id]
        return id 
    
    
    
    customdata = [do_mapping(id) for id in ids ] 
    
    
    
    sunburst = go.Sunburst(
                      labels=labels,
                      values=values,
                      branchvalues='total',
                      customdata=customdata,
                      parents=parents,
                      ids=ids,
                      marker={'colors': ['nace_code']})

    # Create a Figure object and add the sunburst to it
    fig = go.Figure(sunburst)

    # Set the size of the figure
    #fig.update_layout(width=800, height=600, title=title)

    # Set the labels for the sunburst segments
    fig.update_traces(hovertemplate='<b>%{customdata}</b><br>Percent: %{value:.2f}%', textfont=dict(size=20, color='#000000'))

    # Set the colors for the sunburst segments
    #fig.marker.colors = px.colors.qualitative.Dark24

    return fig



plot_data = patent_sector.groupby(['kind', 'nace_code', 'nace', 'institution'])['percent'].sum().reset_index()

plot_data = top_n_institutions(plot_data, n=15)

pie_f = plot_pie(plot_data, 'Wpływ poszczególnych instytucji na dane sektory gospodarki')

cs_py.plot(pie_f, filename = 'institute_nace_impact', auto_open=False)


pie_f

In [27]:
pie_f.write_html("docs/institute_nace_impact.html")

In [28]:
patent_sector

Unnamed: 0,kind,year_group,institution,nace,percent,nace_code
0,NONPUBLIC_UNIVERSITY,2015-2019,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.003045,A
1,NONPUBLIC_UNIVERSITY,2015-2019,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.001852,A
2,NONPUBLIC_UNIVERSITY,2015-2019,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.001269,A
3,NONPUBLIC_UNIVERSITY,2020-,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.008689,A
4,NONPUBLIC_UNIVERSITY,2020-,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.003045,A
...,...,...,...,...,...,...
3134467,SCIENTIFIC_INSTITUTION,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134468,SCIENTIFIC_INSTITUTION,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134469,SCIENTIFIC_INSTITUTION,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134470,SCIENTIFIC_INSTITUTION,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U


In [29]:
# Plot top sectors by percentage weight
def plot_top_sectors(top_sectors):
    fig = px.bar(top_sectors, y="nace", x="percent", color="nace", title='Największe sektory wg udziału patentów', labels={'nace':'Sektor', 'percent':'Sum of percentage weights (log scale)'}, orientation='h',  
                 #height=700, width=1400
                 )
    fig.update_xaxes(type='log')
    fig.update_layout(showlegend=False)
    return fig 
    
top_sectors = patent_sector.groupby(['nace'])['percent'].sum().reset_index().sort_values(['percent','nace'], ascending=[False, True]).groupby(['nace', 'percent']).head(5)
top_sectors['nace'] = top_sectors['nace'].map(code_label_mapping)

fig_top_sectors = plot_top_sectors(top_sectors)

cs_py.plot(fig_top_sectors, filename = 'fig_top_sectors', auto_open=False)

#fig_top_sectors

'https://plotly.com/~piotrsobecki/52/'

In [30]:
# Get top instiuions by each nace sector based on sum of percentage weight
def top_inst_by_sector(patent_sector, n=3):

    pds = []
    for nace in patent_sector['nace'].unique():
        top_nace = patent_sector[patent_sector['nace'] == nace].groupby(['institution'])['percent'].sum().reset_index().sort_values(['percent'], ascending=False).head(n)
        top_nace['rank'] = range(3)
        top_nace['rank'] = top_nace['rank']+1
        top_nace['nace'] = nace
        
        pds.append(top_nace)
        
    return pd.concat(pds)
        
        
    
psl_df =patent_sector[patent_sector['year_group']=='2015-2019']    
    
pu_sector_leaders = top_inst_by_sector(psl_df[psl_df['kind']=='PUBLIC_UNIVERSITY'], n=3)
si_sector_leaders = top_inst_by_sector(psl_df[psl_df['kind']=='SCIENTIFIC_INSTITUTION'], n=3)

psl_merge_df = pd.merge(pu_sector_leaders, on=['nace', 'rank'], right=si_sector_leaders, how='left', suffixes=('_pu', '_si'))
psl_merge_df['nace'] = psl_merge_df['nace'].map(code_label_mapping)
psl_merge_df.index = pd.MultiIndex.from_frame(psl_merge_df[['nace', 'rank']])
del psl_merge_df['rank']
del psl_merge_df['nace']
psl_merge_df.to_excel('out/sector_leaders.xlsx')

In [31]:
# Create  stacked area chart showing percent of patents by nace through years using plotly go library
def plot_stacked_area(dfs, x, y, color, title, stitles):
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    from plotly.colors import n_colors
    
    #fig = go.Figure()
    fig = make_subplots(rows=1, cols=len(dfs),subplot_titles=stitles)
    
    
    for i in range(len(dfs)):
        df = dfs[i]
        showlegend = i == 0
        unique_colors = df[color].unique()
        palette = n_colors('rgb(0, 0, 255)', 'rgb(255, 0, 0)', len(unique_colors), colortype = 'rgb')
        
        for c_i in range(len(unique_colors)):
            color_val = unique_colors[c_i]
            df_c = df[df[color]==color_val]
            line_color = palette[c_i]
                
            fig.add_trace(go.Scatter(x=df_c[x], y=df_c[y], name=color_val, mode='lines', stackgroup='one', legendgroup=color_val,line=dict(color=line_color), groupnorm='percent',  showlegend=showlegend), col=i+1, row=1)
    #fig.update_layout(title=title)
            
    fig.update_layout(
        showlegend=True,
        legend={'traceorder':'normal'},
        title=title)
    
    
    fig.update_xaxes(type='category')
    fig.update_yaxes(type='linear',
            range=[1, 100],
            ticksuffix='%')
    
    # Order legend by value 

    return fig
    
# get top 5 sectors
pu_si_sector = patent_sector[patent_sector['kind'].isin(['PUBLIC_UNIVERSITY','SCIENTIFIC_INSTITUTION'])]
pu_si_sector = pu_si_sector.groupby(['year_group', 'kind', 'nace_code'])['percent'].sum().reset_index()
pu_si_sector_top = pu_si_sector.groupby('nace_code')['percent'].sum().nlargest(15).index

gd = pu_si_sector[pu_si_sector['nace_code'].isin(pu_si_sector_top)]
gd['nace_code'] = gd['nace_code'].map(code_label_mapping)

pu = gd[gd['kind']=='PUBLIC_UNIVERSITY']
si = gd[gd['kind']=='SCIENTIFIC_INSTITUTION']

prct_imp_years = plot_stacked_area([pu, si], 'year_group', 'percent', 'nace_code', 'Wpływ patentów na sektory gospodarki na przestrzeni lat', stitles=['Uniwersytety publiczne', 'Instytuty naukowe'])

cs_py.plot(prct_imp_years, filename = 'prct_impact_years', auto_open=False)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



'https://plotly.com/~piotrsobecki/55/'

In [32]:
prct_imp_years.write_html("docs/prct_impact_years.html")

In [33]:
ps_gd = patent_sector.groupby(['kind','institution','nace','nace_code'])['percent'].sum().reset_index()

ps_gd2 = ps_gd.groupby(['kind','institution','nace_code'])['percent'].sum().reset_index().sort_values(['percent'], ascending=False)
ps_gd2['nace'] = ps_gd2['nace_code']


ps_gd3 = ps_gd.groupby(['kind','institution'])['percent'].sum().reset_index().sort_values(['percent'], ascending=False)
ps_gd3['nace'] = 'Total'
ps_gd3['nace_code'] = 'Total'

patent_sector_all = pd.concat([ps_gd,ps_gd2,ps_gd3])



In [34]:
patents_sector_influence = patent_sector_all.pivot_table(index='institution', columns='nace', values='percent').fillna(0)

In [35]:
patents_sector_influence

nace,01,02,03,05,06,07,08,09,10,11,...,M,N,O,P,Q,R,S,T,Total,U
institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Akademia Górniczo-Hutnicza im. Stanisława Staszica w Krakowie,8.093215,0.044758,0.002771,0.156278,6.932532,0.453912,2.029773,35.546001,2.787400,1.100290,...,79.165765,17.205899,2.114956,20.663622,1.754697,0.411416,3.330637,0.028227,1129.0,0.010203
Akademia Kaliska im. Prezydenta Stanisława Wojciechowskiego,0.009574,0.000093,0.000059,0.001660,0.016334,0.003224,0.027833,0.054724,1.570042,0.247684,...,2.236534,0.594543,0.018249,0.274537,0.038334,0.024587,0.289257,0.000000,24.0,0.000016
Akademia Marynarki Wojennej im. Bohaterów Westerplatte,0.261498,0.001800,0.000024,0.002044,0.498771,0.015216,0.020047,1.200414,0.068980,0.016605,...,4.026826,0.844368,0.100795,0.643740,0.081100,0.013103,0.240483,0.000201,50.0,0.000000
Akademia Sztuk Pięknych im. Władysława Strzemińskiego w Łodzi,0.038176,0.004614,0.000000,0.000000,0.002170,0.001265,0.000181,0.009214,0.011981,0.005069,...,0.298917,0.066405,0.003615,0.033357,0.015829,0.001122,0.094276,0.000000,5.0,0.000000
Akademia Sztuk Pięknych w Gdańsku,0.019622,0.000871,0.000000,0.000133,0.008471,0.000156,0.012777,0.011489,0.087940,0.049203,...,0.682673,0.157753,0.009636,0.010676,0.007466,0.004182,0.026065,0.000104,11.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyższa Szkoła Gospodarki w Bydgoszczy,0.052602,0.003352,0.000047,0.004287,0.155191,0.003448,0.037215,0.201763,0.062126,0.010039,...,1.533321,0.461543,0.036290,0.211892,0.032926,0.014752,0.084979,0.000052,22.0,0.000000
Wyższa Szkoła Zawodowa Kosmetyki i Pielęgnacji Zdrowia w Warszawie,0.004173,0.000000,0.000095,0.000142,0.000142,0.000190,0.002324,0.001470,0.020109,0.003794,...,0.157691,0.021437,0.012805,0.071566,0.028550,0.001233,0.016552,0.000000,2.0,0.000000
Zachodniopomorski Uniwersytet Technologiczny w Szczecinie,4.143641,0.069150,0.007841,0.190834,9.828948,0.593443,4.747241,11.894031,13.362332,4.761539,...,99.136862,19.677500,4.192498,33.227938,4.072993,0.645591,5.680443,0.005686,1228.0,0.021643
Śląski Uniwersytet Medyczny w Katowicach,0.188244,0.001555,0.001883,0.005003,0.218510,0.012146,0.079880,0.207087,0.757428,0.256034,...,5.670271,0.864119,0.388332,2.440184,0.782333,0.043164,0.521479,0.000627,67.0,0.000984


In [36]:
patent_sector

Unnamed: 0,kind,year_group,institution,nace,percent,nace_code
0,NONPUBLIC_UNIVERSITY,2015-2019,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.003045,A
1,NONPUBLIC_UNIVERSITY,2015-2019,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.001852,A
2,NONPUBLIC_UNIVERSITY,2015-2019,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.001269,A
3,NONPUBLIC_UNIVERSITY,2020-,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.008689,A
4,NONPUBLIC_UNIVERSITY,2020-,Wyższa Szkoła Gospodarki w Bydgoszczy,01,0.003045,A
...,...,...,...,...,...,...
3134467,SCIENTIFIC_INSTITUTION,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134468,SCIENTIFIC_INSTITUTION,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134469,SCIENTIFIC_INSTITUTION,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U
3134470,SCIENTIFIC_INSTITUTION,2015-2019,Sieć Badawcza Łukasiewicz - Łódzki Instytut Te...,99,0.000000,U


In [37]:
# Group patents_inst by institution id 
import scipy.stats as stats



patents_inst_clustered['lat'] = patents_inst_clustered['point'].map(lambda x: str(x[1:-6]).split(', ')[0]).astype(float)
patents_inst_clustered['lon'] = patents_inst_clustered['point'].map(lambda x: str(x[1:-6]).split(', ')[1]).astype(float)

plot_data = patents_inst_clustered.groupby(['cluster','institution_id','name','lat','lon'])['id'].count().reset_index()
plot_data.rename(columns={'id':'count'}, inplace=True)


plot_data = plot_data.merge(patents_sector_influence, left_on='name', right_index=True, how='left', suffixes=('', ''))

for c in plot_data.columns:
    if c not in ['cluster','institution_id','name','lat','lon','count']:
        # Calculate decil 
        #plot_data[c]= plot_data[c].rank(pct=True) * 100
        #plot_data[c]= np.round(plot_data[c].rank(pct=True), 1) * 100
        
        
        # Perform the Box-Cox transformation
        #transformed_data, _ = stats.boxcox(plot_data[c])
        # Box-Cox transformation
        #plot_data[c]= transformed_data
        
        #plot_data[c] =  plot_data[c]  -  plot_data[c].min()
        
        plot_data[c] = plot_data[c].astype(float) / plot_data[c].max() * 100 +1
        if c in code_label_mapping:
            plot_data.rename(columns={c:code_label_mapping[c]}, inplace=True)
        




In [38]:
plot_data[['name',code_label_mapping['B']]].sort_values(code_label_mapping['B'], ascending=False)

Unnamed: 0,name,B - MINING AND QUARRYING
212,Główny Instytut Górnictwa,101.000000
69,Politechnika Śląska,62.967986
213,Instytut Techniki Górniczej KOMAG,62.771287
57,Politechnika Wrocławska,41.534993
148,Akademia Górniczo-Hutnicza im. Stanisława Stas...,36.572470
...,...,...
59,Instytut Botaniki im. Władysława Szafera Polsk...,1.005923
93,Narodowy Instytut Kardiologii Stefana Kardynał...,1.004983
97,Akademia Wychowania Fizycznego i Sportu im. Ję...,1.004372
116,Wyższa Szkoła Zawodowa Kosmetyki i Pielęgnacji...,1.003365


In [43]:
# Plot points from df on map of Poland using graph objects showing location of institutions with size of point based on number of patents
def plot_patent_map_go(df, lat, lon, color, title, columns=None):
    import plotly.graph_objects as go
    fig = go.Figure()
    
    
    if columns is None:
        columns =  [ col for col in df.columns if col not in ['cluster','institution_id','name','lat','lon', 'count']]
        columns = [ 'Total', *sorted(columns)]
    
    
    def create_layout_button(column):
        vis = [c == column for c in columns]
        return dict(label = column,
                    method = 'update',
                    args = [{'visible': vis,
                             'title': column,
                             'showlegend': True}])

    def create_trace(column):
        return go.Scattermapbox(
                    lat=df[lat],
                    lon=df[lon],
                    mode='markers',
                    marker=go.scattermapbox.Marker(
                        size=df[column],
                        allowoverlap=True,
                        color=df[color]
                    ),

                    text=df['name'],
                    name = column
                    
                )


    buttons = []
    
    for column in columns:
        fig.add_trace(create_trace(column))
        buttons.append(create_layout_button(column))
            

    #fig.update_traces(cluster=dict(enabled=True))
    fig.update_layout(
        title=title,
        hovermode='closest',
        mapbox=go.layout.Mapbox(
            accesstoken=open("auth/.mapbox_token").read(),
            bearing=0,
            center=go.layout.mapbox.Center(
                lat=52.3,
                lon=19
            ),
            pitch=0,
            zoom=5.5
        ),
        #height=900,
        #width=900
    )
    
    # Hide legend
    fig.update_layout(showlegend=False)
    
    # Hide all traces
    fig.update_traces(visible=False)
    
    # Show trace for Total
    fig.data[0].visible = True
    
    fig.update_layout(
        updatemenus=[
            go.layout.Updatemenu(
            active=0,
            buttons=buttons,
            #yanchor="bottom",
            name='NACE',
            xanchor="right",
            x=1
            )
            
        ])
   

    return fig


In [48]:

inst_influence_map = plot_patent_map_go(plot_data, 'lat', 'lon',  'cluster', 'Map of influence of Public Research Institutions on Polish economy based on patenting activity  <br><sup><b>Select the proper NACE sector using the dropdown menu on the top right</b></sup>')

In [49]:
inst_influence_map

In [55]:
inst_influence_map.write_html("docs/inst_influence_map.html")