# Imports and Data

In [None]:
import pandas as pd
import altair as alt

from scipy.stats import zscore
from IPython.display import IFrame

In [4]:
# Load the example dataset; the first CSV column becomes the row index.
merged = pd.read_csv('../data/anon_example.csv', index_col=0)

In [5]:
# Preview the first rows of the loaded table.
merged.head()

Unnamed: 0,auth_key,tag,gender_1,prob_1,h_index,n_citations,n_items,hits
0,K0,sociology,male,0.98,6,203,59,20.0
1,K1,sociology,female,0.99,7,277,22,13.0
2,K2,sociology,male,1.0,1,5,3,2.0
3,K3,sociology,male,1.0,7,114,21,2.0
4,K4,sociology,male,1.0,2,12,5,5.0


# Minor cleaning

In [9]:
# Keep only the rows that have a recorded 'hits' value.
merged = merged.loc[merged['hits'].notna()]

In [10]:
def clean_IV(value):
    """Normalise one raw cell of an independent-variable column.

    The literal string 'False' is returned as the boolean ``False``; every
    other value is converted with ``int()`` (which raises for values that
    cannot be interpreted as an integer).
    """
    return False if value == 'False' else int(value)

In [11]:
# Run the raw-value cleaner over each of the three count columns in turn.
for col in ('h_index', 'n_citations', 'n_items'):
    merged[col] = merged[col].apply(clean_IV)

In [12]:
# Standardise each metric into its own z-score column
# (source column -> new column), in the same order as before.
for source, target in (
    ('hits', 'z_hits'),
    ('h_index', 'z_h'),
    ('n_citations', 'z_cites'),
    ('n_items', 'z_items'),
):
    merged[target] = zscore(merged[source])

In [13]:
# Flag rows with at least one hit. The vectorised comparison yields the same
# boolean column as the former per-row lambda (missing values compare False).
merged['has_hits'] = merged['hits'] > 0

In [14]:
# Independent copy of just the z-scored columns, used for the correlation heatmap.
cor_data = merged.loc[:, ['z_hits', 'z_h', 'z_cites', 'z_items']].copy()

# Correlation Heatmap

In [15]:
def heatmap_correlations(df):
    """Build a layered Altair heatmap of the pairwise correlations in ``df``.

    The matrix from ``df.corr()`` is reshaped to long form (one row per pair
    of variables), drawn as coloured rectangles, and overlaid with the
    correlation value formatted to two decimals.

    Returns the layered chart (rectangles with the text labels on top).
    """
    # Long form: columns 'variable', 'variable2' and 'correlation'.
    pairs = df.corr().stack().reset_index()
    pairs = pairs.rename(
        columns={0: 'correlation', 'level_0': 'variable', 'level_1': 'variable2'},
    )
    # Pre-format the label shown inside each cell (two decimals).
    pairs['correlation_label'] = pairs['correlation'].map('{:.2f}'.format)

    # Shared grid: both layers place the variables on ordinal axes.
    grid = alt.Chart(pairs).encode(x='variable2:O', y='variable:O')

    # Heatmap layer: the cell colour encodes the correlation.
    rects = grid.mark_rect().encode(color='correlation:Q')

    # Label layer: white text where correlation > 0.5, black otherwise,
    # purely for readability.
    label_color = alt.condition(
        alt.datum.correlation > 0.5,
        alt.value('white'),
        alt.value('black'),
    )
    labels = grid.mark_text().encode(text='correlation_label', color=label_color)

    # '+' overlays the two layers: rectangles first, text on top.
    return rects + labels

## Overall heatmap
- expected correlations between typical impact/performance scores
- negligible correlations with hits

In [25]:
# Overall correlation heatmap, sized 500 x 500.
heatmap = (
    heatmap_correlations(cor_data)
    .properties(height=500, width=500)
)

In [26]:
# Write the overall heatmap to an HTML file.
heatmap.save('../local/heatmap.html')

  for col_name, dtype in df.dtypes.iteritems():


In [33]:
# Embed the saved overall heatmap in the notebook output.
IFrame(src='../local/heatmap.html', width=750, height=650)

In [46]:
# Show the z-scored columns for the rows tagged 'sociology'.
merged.loc[merged['tag'] == 'sociology', ['z_hits', 'z_h', 'z_cites', 'z_items']]

Unnamed: 0,z_hits,z_h,z_cites,z_items
0,-0.092588,-0.232306,-0.217772,0.998186
1,-0.165971,-0.106051,-0.158799,-0.085342
2,-0.281288,-0.863581,-0.375567,-0.641749
3,-0.281288,-0.106051,-0.288700,-0.114627
4,-0.249838,-0.737326,-0.369988,-0.583180
...,...,...,...,...
1523,-0.270804,-0.989836,-0.379552,-0.700318
1530,-0.239354,0.272714,-0.089465,-0.261050
1531,-0.260321,-0.232306,-0.288700,0.148934
1532,-0.270804,-0.737326,-0.370785,-0.407472


## Heatmaps per subject
- mostly similar to overall heatmap
- low to moderate correlation between hits and citations/h-index for "social science" and "communication science"

In [53]:
# Build and save one correlation heatmap per subject tag.
# unique() walks the tags in order of first appearance, so the loop order is
# reproducible between runs (iterating a set of strings is not).
for t in merged['tag'].unique():
    t_corr = merged.loc[merged['tag'] == t, ['z_hits', 'z_h', 'z_cites', 'z_items']].copy()
    heatmap = heatmap_correlations(t_corr).properties(height=500, width=500)
    # .properties() returns a new chart rather than modifying `heatmap`, so the
    # title is applied directly on the chart that gets saved.
    heatmap.properties(title=t).save(f'../local/heatmap_{t}.html')

  for col_name, dtype in df.dtypes.iteritems():
  for col_name, dtype in df.dtypes.iteritems():
  for col_name, dtype in df.dtypes.iteritems():
  for col_name, dtype in df.dtypes.iteritems():
  for col_name, dtype in df.dtypes.iteritems():


In [54]:
# Embed the saved heatmap for the 'communication science' tag.
IFrame(src='../local/heatmap_communication science.html', width=750, height=650)

In [55]:
# Embed the saved heatmap for the 'social science' tag.
IFrame(src='../local/heatmap_social science.html', width=750, height=650)

In [81]:
# Bar chart of row counts per h_index value, one facet per tag.
tag_h = (
    alt.Chart(merged)
    .mark_bar()
    .encode(x=alt.X('h_index'), y=alt.Y('count()'), facet='tag')
)

In [82]:
# Write the faceted h_index chart to HTML, then embed that file in the output.
tag_h.save('../local/tag_h.html')
IFrame(src='../local/tag_h.html', width=1200, height=450)

# Scatter plots differentiated by gender and subject
- much lower spread for female scientists
- visible differences between subjects

In [72]:
# Scatter of n_citations against hits, limited to rows with a non-null
# gender_1, faceted into one row per tag and one column per gender_1 value.
gender_tag = (
    alt.Chart(merged.loc[merged['gender_1'].notna()])
    .mark_circle()
    .encode(x='n_citations', y='hits')
    .facet(row='tag', column='gender_1')
)

In [80]:
# Write the faceted scatter chart to HTML, then embed that file in the output.
gender_tag.save('../local/gender_tag.html')
IFrame(src='../local/gender_tag.html', width=1000, height=1800)

In [87]:
# Bar chart of row counts per n_citations value, faceted by tag (rows) and
# by the has_hits flag (columns).
# NOTE(review): this rebinds `gender_tag` although the facets here are
# tag x has_hits, not gender — consider a dedicated name.
gender_tag = (
    alt.Chart(merged)
    .mark_bar()
    .encode(x=alt.X('n_citations'), y=alt.Y('count()'))
    .facet(row='tag', column='has_hits')
)

In [88]:
# Render the chart currently bound to `gender_tag` inline.
gender_tag