In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


### Paths

In [None]:
# Location of the survey export (CSV). Set the OPENPUB_SURVEY_CSV environment
# variable to run the notebook on another machine; the default below is the
# original author's local path.
survey_response = os.environ.get(
    "OPENPUB_SURVEY_CSV",
    "/home/nikhil/projects/neuroinformatics_tools/ecr-fair/survey/openpub_survey_response.csv",
)

### Load data

In [None]:
# Read the survey export into a DataFrame; the row count is reported as the
# number of respondents (i.e. one row per respondent is assumed).
survey_df = pd.read_csv(survey_response)
n_respondents = len(survey_df)
print(f"Number of respondents: {n_respondents}")

# Preview the first rows.
survey_df.head()

### Palettes

In [None]:
# Four-step palette running from cool (blue/teal) to warm (peach/coral).
coolwarm_4 = ["#0081a7", "#00afb9", "#fed9b7", "#f07167"]
# Preview the palette as a strip of colour swatches.
sns.palplot(coolwarm_4)

In [None]:
# Seven-step palette (dark blue -> cream -> dark brown); used as the heatmap
# colormap further down.
coolwarm_7 = ["#001524", "#15616d", "#8aa79f", "#ffecd1", "#ff7d00", "#bc5308", "#78290f"]
# Preview the palette as a strip of colour swatches.
sns.palplot(coolwarm_7)

In [None]:
# Five-colour palette for categorical data.
cat_1 = ["#335c67", "#fff3b0", "#e09f3e", "#9e2a2b", "#540b0e"]
# Preview the palette as a strip of colour swatches.
sns.palplot(cat_1)

In [None]:
# Four-colour palette for categorical data (the first four colours of cat_1);
# used for the stacked FAIR-experience bars.
cat_2 = ["#335c67", "#fff3b0", "#e09f3e", "#9e2a2b"]
# Preview the palette as a strip of colour swatches.
sns.palplot(cat_2)

In [None]:
# Nine-step sequential red palette, ordered from lightest to darkest.
reds = ["#ffccd5", "#ffb3c1", "#ff8fa3", "#ff758f", "#ff4d6d", "#c9184a", "#a4133c", "#800f2f", "#590d22"]
# Preview the palette as a strip of colour swatches.
sns.palplot(reds)

### Daily difficulties

In [None]:
# Survey columns 3-9 (positional) are the seven data-task questions; give them
# short names for plotting.
daily_tasks_cols = survey_df.columns[3:10]
task_names = ["capture", "organization", "processing", "maintenance",
              "QC", "annotation", "publication"]

plot_df = survey_df[daily_tasks_cols].rename(columns=dict(zip(daily_tasks_cols, task_names)))

# Tally how often each response value occurs per task (0 where a value never
# occurs for a task). Series.value_counts is used rather than the deprecated
# top-level pd.value_counts.
# sort_index() pins the rows to ascending response value, so the positional
# labels below cannot end up attached to the wrong row.
plot_counts = plot_df.apply(pd.Series.value_counts).fillna(0).sort_index()

# Labels are attached by position: this assumes exactly seven distinct response
# values, sorted from "most" to "least" burden -- TODO confirm against the
# survey coding. A different number of distinct values raises a ValueError.
burden_labels = ["most", "high", "above-avg", "average", "below-avg", "low", "least"]
plot_counts.index = pd.Index(burden_labels, name="time_burden")
plot_counts

In [None]:
# Annotated heatmap of the counts table, transposed so that each row is a
# data task and each column a time-burden level.
g = sns.heatmap(
    plot_counts.transpose(),
    cmap=coolwarm_7,
    annot=True,
    cbar=True,
)
g.set_title("Time-burden of data-tasks (counts)")
# Re-apply the current x tick labels, rotated by 45 degrees.
g.set_xticklabels(g.get_xticklabels(), rotation=45)


### FAIR experience

In [None]:
# Survey columns 12-15 (positional) are renamed to the four FAIR principles,
# in order.
fair_cols = survey_df.columns[12:16]
fair_names = ["Findability", "Accessibility", "Interoperability", "Reusability"]

plot_df = survey_df[fair_cols].copy()
plot_df = plot_df.rename(columns=dict(zip(fair_cols, fair_names)))

# Preview the renamed columns.
plot_df.head()


In [None]:
def _lump_rare_options(counts, min_count, lumped_label, n_total):
    """Keep options chosen more than `min_count` times; fold the rest into one bucket.

    Parameters
    ----------
    counts : pd.DataFrame
        Single-row frame with one column per answer option.
    min_count : int
        Options with a count <= `min_count` are dropped.
    lumped_label : str
        Column that receives everything not covered by the kept options
        (dropped options and missing answers alike).
    n_total : int
        Total that the returned row adds up to.

    Returns
    -------
    pd.DataFrame
        Single-row frame whose values sum to `n_total`.
    """
    kept = counts[counts > min_count].dropna(axis=1)
    remainder = n_total - kept.sum(axis=1).values[0]
    # If the bucket label is itself a surviving option, add to its count
    # instead of overwriting it.
    existing = kept[lumped_label].iloc[0] if lumped_label in kept.columns else 0
    kept[lumped_label] = existing + remainder
    return kept


# Per-principle lumping rule: (minimum count to keep an option, bucket label).
# Findability has no rule: all of its options are shown.
lumping_rules = {
    "Accessibility": (5, "Received incomplete data and needed back & forth communication with the authors"),
    "Interoperability": (1, "Short description (with errors) was available"),
    "Reusability": (1, "The dataset was too messy for reuse in reasonable timeframe"),
}

# Long answer texts shortened for the legend (applied after lumping).
short_labels = {
    "Findability": {
        "The dataset’s existence could only be known through personal contact with the dataset owner(s) (i.e. “insider knowledge”)":
            "The dataset’s existence could only be known through personal contact with the dataset owner(s)",
    },
    "Reusability": {
        "There was enough information about how the data was collected so that an independent group could reproduce the original experiment":
            "There was enough information for an independent group to reproduce the original experiment",
    },
}

# One stacked horizontal bar per FAIR principle, in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(16, 8), sharex=False)

for ax, col in zip(axes.flatten(), plot_df.columns):
    if col not in lumping_rules and col not in short_labels:
        print(f"unknown column: {col}")

    col_df = plot_df[col]
    if col == "Reusability":
        # Answers are comma-separated: split them and count every selection.
        # This is done on a local Series. Writing the exploded values back
        # into plot_df would realign them on the original row index and
        # silently drop every selection beyond the first len(plot_df) entries.
        col_df = col_df.str.split(",").explode().str.strip()

    # Denominator for the bar: number of rows being tallied. Equals
    # n_respondents for the single-answer columns and the number of
    # selections (plus unanswered rows) for Reusability.
    n_answers = len(col_df)

    plot_counts = col_df.value_counts().to_frame().T.copy()

    if col in lumping_rules:
        min_count, lumped_label = lumping_rules[col]
        plot_counts = _lump_rare_options(plot_counts, min_count, lumped_label, n_answers)

    plot_counts = plot_counts.rename(columns=short_labels.get(col, {}))

    plot_counts_percent = plot_counts / n_answers * 100
    plot_counts_percent.plot(kind="barh", stacked=True, color=cat_2, legend=True,
                             ax=ax, width=0.25, align="center")
    ax.legend(fontsize=9, loc="upper center")
    ax.set_title(col)
    # There is a single bar per panel, so the y tick label is hidden.
    ax.set_yticklabels("")
    ax.set_xlabel("Counts (%)")

# Remove the left/top/right spines on every panel of the figure.
sns.despine(fig=fig, left=True, bottom=False)
plt.tight_layout()

### FAIRness of my data

In [None]:
# Survey columns 16-23 (positional) are the eight self-assessment questions,
# renamed to two short codes per FAIR letter (F1, F2, A1, ...).
self_fair_cols = survey_df.columns[16:24]
self_fair_names = ["F1", "F2", "A1", "A2", "I1", "I2", "R1", "R2"]

plot_df = survey_df[self_fair_cols].rename(columns=dict(zip(self_fair_cols, self_fair_names)))

# Tally how often each response value occurs per question (0 where a value
# never occurs). Series.value_counts is used rather than the deprecated
# top-level pd.value_counts.
# sort_index() pins the rows to ascending response value, so the positional
# labels below cannot end up attached to the wrong row.
plot_counts = plot_df.apply(pd.Series.value_counts).fillna(0).sort_index()

# Labels are attached by position: this assumes exactly seven distinct response
# values whose sorted order runs from "very poor" to "excellent" with
# "not sure" last -- TODO confirm against the survey coding. A different
# number of distinct values raises a ValueError.
fairness_labels = ["very poor", "poor", "below avg", "above avg", "good", "excellent", "not sure"]
plot_counts.index = pd.Index(fairness_labels, name="FAIRness")
plot_counts

In [None]:
# Annotated heatmap of the counts table, transposed so that each row is a
# self-assessment question and each column a FAIRness rating.
g = sns.heatmap(
    plot_counts.transpose(),
    cmap=coolwarm_7,
    annot=True,
    cbar=True,
)
g.set_title("FAIRness of my data (counts)")
# Re-apply the current x tick labels, rotated by 45 degrees.
g.set_xticklabels(g.get_xticklabels(), rotation=45)


### Reproducibility timeframe

In [None]:
# Survey columns 24-26 (positional) hold the reproducibility time-frame answers.
repro_cols = survey_df.columns[24:27]

# Map each time-frame answer to an ordinal code, 1 (shortest) to 4 (longest).
time_labels = ["< 1 week", "1 week - 1 month", "1 month - 6 months", ">  6 months"]
time_dict = {label: code for code, label in enumerate(time_labels, start=1)}

# Replace the answer strings with their codes; other values are left untouched.
plot_df = survey_df[repro_cols].replace(time_dict)
plot_df.head()

In [None]:
# Joint 2-D histogram (with marginals) of the two coded time-frame answers:
# x = repro_cols[1], y = repro_cols[2]. repro_cols[0] is not plotted here.
# The bar colour is the warm end of coolwarm_4, a palette defined in this
# notebook.
g = sns.jointplot(x=plot_df[repro_cols[1]], y=plot_df[repro_cols[2]],
                  kind="hist", color=coolwarm_4[-1], space=1,
                  height=6, xlim=(1, 4), ylim=(1, 4))
g.set_axis_labels("You to reproduce someone else's work", "Someone else to reproduce your work", fontsize=12)

# Put one tick on each ordinal code and label it with the original answer text.
tick_codes = list(time_dict.values())
tick_names = list(time_dict.keys())
g.ax_joint.set_xticks(tick_codes)
g.ax_joint.set_xticklabels(tick_names, rotation=45)
g.ax_joint.set_yticks(tick_codes)
g.ax_joint.set_yticklabels(tick_names, rotation=45)


### Challenges in reproducibility

In [None]:
# Positional slice of survey columns 28-32: the five questions that the next
# cell renames to reproducibility challenges. Displayed to check the selection.
challenge_cols = survey_df.columns[28:33]
challenge_cols

In [None]:
# Short display names for the five challenge questions, in column order.
challenge_names = ["Finding data details",
                   "Accessing data",
                   "Interoperability of metadata",
                   "Reusability issues (e.g. privacy/provenance)",
                   "Lack of academic incentives"]

plot_df = survey_df[challenge_cols].rename(columns=dict(zip(challenge_cols, challenge_names)))

# Tally how often each response value occurs per challenge (0 where a value
# never occurs). Series.value_counts is used rather than the deprecated
# top-level pd.value_counts.
# sort_index() pins the rows to ascending response value, so the positional
# labels below cannot end up attached to the wrong row.
plot_counts = plot_df.apply(pd.Series.value_counts).fillna(0).sort_index()

# Labels are attached by position: this assumes exactly five distinct response
# values, sorted from "very high" to "very low" -- TODO confirm against the
# survey coding. A different number of distinct values raises a ValueError.
difficulty_labels = ["very high", "high", "average", "low", "very low"]
plot_counts.index = pd.Index(difficulty_labels, name="difficulty/time burden")
plot_counts

In [None]:
# Annotated heatmap of the counts table, transposed so that each row is a
# challenge and each column a difficulty level.
g = sns.heatmap(
    plot_counts.transpose(),
    cmap=coolwarm_7,
    annot=True,
    cbar=True,
)
g.set_title("Challenges in reproducing work (rank-counts)")
# Re-apply the current x tick labels, rotated by 45 degrees.
g.set_xticklabels(g.get_xticklabels(), rotation=45)
