In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import plotly.graph_objects as go

In [None]:
# Load the dataset that every later cell reads from.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle('data/processed.pickle')
df.head()

# Meta-Data
### Years Published

In [None]:
# Count the number of entries per publication year.
# The columns are named explicitly because the labels produced by
# value_counts().reset_index() differ between pandas versions, and the
# following cells rely on the columns 'year_published' and 'count'.
count_years = df.year_published.value_counts().reset_index()
count_years.columns = ['year_published', 'count']
count_years.head()

In [None]:
# Sanity check: total of the per-year counts.
count_years['count'].sum()

In [None]:
# Bar chart of the number of publications per year, written to a PDF.
years_plt = sns.catplot(
    count_years,
    kind='bar',
    x='year_published',
    y='count',
    color='darkgrey',
)
years_plt.set(xlabel='Year Published', ylabel='Number Published')
years_plt.figure.savefig('plots/years_published.pdf')

# List formatter

In [None]:
def list_formatter(items):
    """Render a sequence of strings as a title-cased, human-readable phrase.

    Parameters
    ----------
    items : sequence of str
        The labels to combine (a list or tuple).

    Returns
    -------
    str
        ``""`` for an empty sequence, ``"Only X"`` for a single item, and
        ``"A, B And C"`` for several items (the result is passed through
        ``str.title``, so every word is capitalised).
    """
    # An empty sequence used to raise IndexError; return an empty label instead.
    if len(items) == 0:
        return ""
    if len(items) == 1:
        return ("Only " + items[0]).title()
    # Comma-separate everything but the last item, then attach the last with "and".
    return (', '.join(items[:-1]) + " and " + items[-1]).title()

## Paper Types

In [None]:
# Tally each combination of paper types and express the tallies as percentages.
paper_types = pd.Series(list(df.types)).value_counts().reset_index()
paper_types.columns = ['paper_type', 'no_type']
paper_types['proportion'] = paper_types['no_type'].div(paper_types['no_type'].sum()).mul(100)
# Turn each combination into a readable label.
paper_types['paper_type'] = paper_types['paper_type'].apply(list_formatter)
paper_types

In [None]:
# Sanity check: total of the per-type counts that the proportions are computed from.
paper_types.no_type.sum()

In [None]:
# Horizontal bar chart of paper-type proportions, written to a PDF.
ax_type = sns.barplot(paper_types, x='proportion', y='paper_type', color='darkgrey')
ax_type.set_xlabel('Proportion (%)')
ax_type.set_ylabel('Paper Type')
ax_type.set_xlim([0, 101])
ax_type.figure.savefig('plots/paper_types.pdf', bbox_inches='tight')

## Degree of Automation

In [None]:
# Collect every tag that starts with 'approach', strip the 'approach:' marker,
# and count how often each remaining value occurs.
auto = pd.Series([
    label.replace('approach:', '')
    for paper_tags in df.tags
    for label in paper_tags
    if label.startswith('approach')
]).value_counts().reset_index()
auto.columns = ['degree_of_automation', 'number']
auto['proportion'] = auto['number'].div(auto['number'].sum()).mul(100)
auto.head()

In [None]:
# Sanity check: total number of 'approach' tags counted.
auto.number.sum()

In [None]:
# Show each title next to its tags containing the substring 'approach'
# (a substring test, i.e. looser than the startswith test used for the counts).
pd.concat([df.title, df.tags.apply(lambda labels: [label for label in labels if 'approach' in label])], axis=1)

In [None]:
# Readable labels: underscores become spaces and each word is capitalised.
auto['degree_of_automation'] = (
    auto['degree_of_automation']
    .str.replace('_', ' ')
    .str.title()
)
auto

In [None]:
# Horizontal bar chart of degree-of-automation proportions, written to a PDF.
ax_doa = sns.barplot(auto, x='proportion', y='degree_of_automation', color='darkgrey')
ax_doa.set_xlabel('Proportion (%)')
ax_doa.set_ylabel('Degree of Automation')
ax_doa.set_xlim([0, 101])
ax_doa.figure.savefig('plots/auto_degree.pdf', bbox_inches='tight')

## Interaction type

In [None]:
# Collect every tag that starts with 'interaction', strip the 'interaction:'
# marker, and count how often each remaining value occurs.
inter = pd.Series([
    label.replace('interaction:', '')
    for paper_tags in df.tags
    for label in paper_tags
    if label.startswith('interaction')
]).value_counts().reset_index()
inter.columns = ['interaction', 'number']
inter['proportion'] = inter['number'].div(inter['number'].sum()).mul(100)
inter.head()

In [None]:
# Sanity check: total number of 'interaction' tags counted.
inter.number.sum()

In [None]:
# Show each title next to its tags containing the substring 'interaction'
# (a substring test, i.e. looser than the startswith test used for the counts).
pd.concat([df.title, df.tags.apply(lambda labels: [label for label in labels if 'interaction' in label])], axis=1)

In [None]:
# Capitalise each word of the interaction labels for display.
inter['interaction'] = inter['interaction'].str.title()
inter

In [None]:
# Horizontal bar chart of interaction-type proportions, written to a PDF.
ax_int = sns.barplot(inter, x='proportion', y='interaction', color='darkgrey')
ax_int.set_xlabel('Proportion (%)')
ax_int.set_ylabel('Interaction Type')
ax_int.set_xlim([0, 101])
ax_int.figure.savefig('plots/interaction.pdf', bbox_inches='tight')

# Skills

In [None]:
# Tally each combination of graded skills and express the tallies as percentages.
skills = pd.Series(list(df.skills)).value_counts().reset_index()
skills.columns = ['skills', 'number']

skills['proportion'] = skills['number'].div(skills['number'].sum()).mul(100)
# Turn each combination into a readable label.
skills['skills'] = skills['skills'].apply(list_formatter)
skills

In [None]:
# Sanity check: total of the per-combination skill counts.
skills.number.sum()

In [None]:
# Horizontal bar chart of graded-skill proportions, written to a PDF.
ax_skill = sns.barplot(skills, x='proportion', y='skills', color='darkgrey')
ax_skill.set_xlabel('Proportion (%)')
ax_skill.set_ylabel('Skills Graded')
ax_skill.set_xlim([0, 101])
ax_skill.figure.savefig('plots/skills.pdf', bbox_inches='tight')

# Skill-Category Complete Mapping
## 1D Plots

In [None]:
def skill_cat_formatter(sk):
    """Turn a sequence of ``'<skill>_<category>'`` strings into a readable label.

    Parameters
    ----------
    sk : sequence of str
        Entries that are split on ``'_'``; the first part is treated as the
        skill and the second as the category.

    Returns
    -------
    str
        Always a title-cased string:

        * no entries   -> ``""``
        * one entry    -> ``"Skill Using Category"``
        * two entries  -> shared skill or shared category is mentioned once,
          otherwise both phrases are joined with "and"
        * three or more -> every ``"skill using category"`` phrase, comma
          separated with "and" before the last one

        Previously three or more (or zero) entries returned the raw list of
        split parts, which is not a string and could not be used as a label.
    """
    pairs = [entry.split('_') for entry in sk]

    if len(pairs) == 0:
        return ""

    if len(pairs) == 1:
        return " using ".join(pairs[0]).title()

    if len(pairs) == 2:
        first, second = pairs
        # Same skill graded with two categories.
        if first[0] == second[0]:
            return (first[0] + ' using ' + first[1] + ' and ' + second[1]).title()
        # Two skills graded with the same category.
        if first[1] == second[1]:
            return (first[0] + ' and ' + second[0] + ' using ' + first[1]).title()
        return " and ".join(' using '.join(pair) for pair in pairs).title()

    # Three or more entries: list every phrase.
    phrases = [' using '.join(pair) for pair in pairs]
    return (', '.join(phrases[:-1]) + ' and ' + phrases[-1]).title()

In [None]:
# Tally each skill/category combination and express the tallies as percentages.
skill_cat_count = df['skill_cat'].value_counts().reset_index()
skill_cat_count.columns = ['skill_cat', 'number']
skill_cat_count['proportion'] = skill_cat_count['number'].div(skill_cat_count['number'].sum()).mul(100)
# Build readable labels, then restore the capitalisation of "ML" that title-casing changed.
skill_cat_count['skill_cat'] = (
    skill_cat_count['skill_cat']
    .apply(skill_cat_formatter)
    .str.replace('Ml', 'ML')
)
skill_cat_count

In [None]:
# Sanity check: total of the per-combination counts.
skill_cat_count.number.sum()

In [None]:
# Horizontal bar chart of the combinations that occur more than twice, written to a PDF.
ax_skill_cat = sns.barplot(
    skill_cat_count.loc[skill_cat_count['number'] > 2],
    x='proportion',
    y='skill_cat',
    color='darkgrey',
)
ax_skill_cat.set_xlabel('proportion (%)')
ax_skill_cat.set_xlim([0, 101])
ax_skill_cat.figure.savefig('plots/skill_cat.pdf', bbox_inches='tight')

In [None]:
# Count individual skill/category entries (one row per entry after exploding the lists).
skill_cat_exp = df['skill_cat'].explode().value_counts().reset_index()
skill_cat_exp.columns = ['skill_cat', 'number']
skill_cat_exp['proportion'] = skill_cat_exp['number'].div(skill_cat_exp['number'].sum()).mul(100)
# Readable labels: underscores become spaces and each word is capitalised.
skill_cat_exp['skill_cat'] = (
    skill_cat_exp['skill_cat']
    .str.replace('_', ' ')
    .str.title()
)
skill_cat_exp.head()

In [None]:
# Sanity check: total number of exploded skill/category entries.
skill_cat_exp.number.sum()

In [None]:
# Horizontal bar chart of per-entry skill/category proportions, written to a PDF.
ax_skill_cat_exp = sns.barplot(skill_cat_exp, x='proportion', y='skill_cat', color='darkgrey')
ax_skill_cat_exp.set_xlabel('Proportion (%)')
ax_skill_cat_exp.set_ylabel('Skills Graded by Approach')
ax_skill_cat_exp.set_xlim([0, 101])
ax_skill_cat_exp.figure.savefig('plots/skill_cat_exp.pdf', bbox_inches='tight')

## Skill Category 2D plots

In [None]:
# One row per (title, skill/category entry).
skill_cat_2d = df.loc[:, ['title', 'skill_cat']].explode(column='skill_cat').copy()
skill_cat_2d.head()

In [None]:
# Split each 'skill_category' string on '_' into separate columns
# and attach them to the title in place of the combined column.
split_df = skill_cat_2d['skill_cat'].str.split('_', expand=True)
skill_cat_2d_split = pd.concat(
    [skill_cat_2d.drop(columns='skill_cat'), split_df],
    axis=1,
)
skill_cat_2d_split.columns = ['title', 'skill', 'category']
skill_cat_2d_split.head()

In [None]:
# Number of rows for every (skill, category) pair, most frequent first.
skill_cat_2d_grp = (
    skill_cat_2d_split
    .groupby(['skill', 'category'])
    .count()
    .reset_index()
)
skill_cat_2d_grp.columns = ['skill', 'category', 'number']
skill_cat_2d_grp = skill_cat_2d_grp.sort_values('number', ascending=False)

# Persist the table first; the preview goes last because a notebook cell only
# displays the value of its final expression (previously head() was discarded).
skill_cat_2d_grp.to_csv('data/skill_cat.csv')
skill_cat_2d_grp.head()

In [None]:
# Bubble chart: one bubble per (category, skill) pair, sized by its count.
sns.scatterplot(
    data=skill_cat_2d_grp,
    x='category',
    y='skill',
    size='number',
    sizes=(10, 500),
    legend=False,
)
plt.gcf().savefig('plots/skill_cat_2d_bubble.pdf', bbox_inches='tight')

### Bipartite

In [None]:
# Work on a copy so the grouped table itself is left untouched.
skill_cat_bp = skill_cat_2d_grp.copy()

# Shorten both labels to their first four characters.
skill_cat_bp['skill'] = skill_cat_bp['skill'].map(lambda name: name[0:4])
skill_cat_bp['category'] = skill_cat_bp['category'].map(lambda name: name[0:4])

# Share of all pairs, scaled by 50.
skill_cat_bp['proportion'] = skill_cat_bp['number'].div(skill_cat_bp['number'].sum()).mul(50)

skill_cat_bp

In [None]:
# Adapted from documentation
# Bipartite drawing of skills (left column) against categories (right column).
# Edge width is the scaled proportion and the edge label is the raw count.
color_map_skill = {'corr':'b', 'read':'g', 'main':'r', 'docu':'c'}

# Node names are the four-character abbreviations used in skill_cat_bp.
skill_nodes = ['corr', 'read', 'main', 'docu']
category_nodes = ['dyna', 'stat', 'ml']

B = nx.Graph()
B.add_nodes_from(skill_nodes, bipartite=0)
B.add_nodes_from(category_nodes, bipartite=1)

# Select the columns by name so the tuple layout does not depend on column order.
bp_raw = list(
    skill_cat_bp[['skill', 'category', 'number', 'proportion']]
    .itertuples(index=False, name=None)
)

for skill, cat, count, width in bp_raw:
    B.add_edge(skill, cat, count=count, width=width, color=color_map_skill[skill])


# Lay the two groups out from the fixed node lists rather than from
# nx.bipartite.sets(B): that call raises AmbiguousSolution when the graph is
# disconnected, and enumerating the sets it returns gives a node order (and a
# left/right assignment) that is not reproducible between runs.
pos = {}
pos.update((node, (1, index)) for index, node in enumerate(skill_nodes))
pos.update((node, (2, index)) for index, node in enumerate(category_nodes))

nodes = B.nodes()
# Skills get their own colour; every other node (the categories) is magenta.
node_colors = [color_map_skill.get(n, 'm') for n in nodes]

edges = B.edges()
edge_colors = [B[u][v]['color'] for u, v in edges]
weights = [B[u][v]['width'] for u, v in edges]

nx.draw(B, pos, edge_color=edge_colors, width=weights, node_size=1500, node_color=node_colors)

# node labels
nx.draw_networkx_labels(B, pos, font_size=14, font_family="sans-serif")

# edge weight labels
edge_labels = nx.get_edge_attributes(B, "count")

nx.draw_networkx_edge_labels(B, pos, edge_labels, label_pos=0.8)
plt.savefig('plots/skill_cat_bipartie.png')

### Sankey Diagram

In [None]:
# Order the rows by skill and then category. The sorted frame is assigned back
# (previously it was only displayed and then thrown away), so cells that read
# skill_cat_sankey see exactly the order shown here.
skill_cat_sankey = (
    skill_cat_2d_grp
    .sort_values(by=['skill', 'category'])
    .reset_index(drop=True)
)

skill_cat_sankey

In [None]:
CORRECTNESS_COLOR = '#E6798A'
MAINTAINABILITY_COLOR = '#59A1E6'
READABILITY_COLOR = '#6AAF46'
DOCUMENTATION_COLOR = '#B59945'
CATEGORY_COLOR = 'grey'

# Colour for every skill node and for the links that leave it.
SKILL_COLORS = {
    'correctness': CORRECTNESS_COLOR,
    'maintainability': MAINTAINABILITY_COLOR,
    'readability': READABILITY_COLOR,
    'documentation': DOCUMENTATION_COLOR,
}

# Sort locally so the diagram does not depend on the row order of skill_cat_sankey.
sankey_links = skill_cat_sankey.sort_values(by=['skill', 'category']).reset_index(drop=True)

# Node list: all skills first, then all categories.
skill_labels = sankey_links.skill.unique().tolist()
category_labels = sankey_links.category.unique().tolist()

# Node index of each label. Sources, targets and colours are derived from the
# data instead of hard-coded lists, so every link value stays attached to the
# pair it was counted for.
skill_index = {name: position for position, name in enumerate(skill_labels)}
category_index = {name: len(skill_labels) + position for position, name in enumerate(category_labels)}

fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color="black", width = 0.5),
        label = skill_labels + category_labels,
        color = [SKILL_COLORS.get(name, CATEGORY_COLOR) for name in skill_labels]
                + [CATEGORY_COLOR] * len(category_labels)
    ),
    link = dict(
        source = [skill_index[name] for name in sankey_links.skill],
        target = [category_index[name] for name in sankey_links.category],
        value = sankey_links.number.tolist(),
        color = [SKILL_COLORS.get(name, CATEGORY_COLOR) for name in sankey_links.skill]
    )
)])

fig.update_layout(
    hovermode = 'x',
    font=dict(size = 10, color = 'black'),
    plot_bgcolor='white',
    paper_bgcolor='white'
)

fig.show()
fig.write_image("plots/skill_cat_sankey.pdf")

## Skill Category Over Time
### Skill Category Combined

In [None]:
# One row per skill/category entry, keeping the publication year alongside it.
skill_cat_time = df[['skill_cat', 'year_published']].explode('skill_cat')
skill_cat_time.head()

In [None]:
# Running total of entries per skill/category over the publication years.
skill_cat_time_grp = skill_cat_time.copy()
skill_cat_time_grp['skill_cat'] = (
    skill_cat_time_grp['skill_cat']
    .str.replace('_', ' ')
    .str.title()
)
skill_cat_time_grp['year_published'] = skill_cat_time_grp['year_published'].astype(str)
# Placeholder column: count() below fills it with the number of rows per group.
skill_cat_time_grp['cumulative_sum'] = 0
skill_cat_time_grp = (
    skill_cat_time_grp
    .groupby(['skill_cat', 'year_published'])
    .count()
    .groupby(level=0)   # accumulate within each skill_cat
    .cumsum()
    .reset_index()
)

skill_cat_time_grp.head()

In [None]:
# One cumulative line per skill/category, written to a PDF.
sns.lineplot(
    data=skill_cat_time_grp,
    x='year_published',
    y='cumulative_sum',
    hue='skill_cat',
)
plt.gca().legend(loc='upper left')
plt.gcf().savefig('plots/skill_cat_time.pdf')

### Skill Category Split

In [None]:
# One row per skill/category entry with the skill and the category in separate columns.
skill_cat_time_s = df[['skill_cat', 'year_published']].explode('skill_cat')
split = skill_cat_time_s['skill_cat'].str.split('_', expand=True)
# Keep only the year from the exploded frame and append the split parts.
skill_cat_time_s = pd.concat([skill_cat_time_s[['year_published']], split], axis=1)
skill_cat_time_s.columns = ['year_published', 'skill', 'category']
skill_cat_time_s.head()

In [None]:
# Placeholder column: count() below fills it with the number of rows per group.
skill_cat_time_s['total'] = 0
skill_cat_s_g = (
    skill_cat_time_s
    .groupby(['year_published', 'skill', 'category'])
    .count()
    .reset_index()
    .sort_values(['skill', 'category'])
)
# Spell out the abbreviated category name.
skill_cat_s_g['category'] = skill_cat_s_g['category'].replace({'ml': 'machine learning'})
skill_cat_s_g

In [None]:
# Per-year colour codes. NOTE(review): not referenced by any other cell visible in this notebook.
color_map_year = {2017:'b', 2018:'g', 2019:'r', 2020:'c', 2021:'m'}
# Per-year hatch pattern used to tell the years apart in the grey markers and the legend.
hatch_map_year = {2017:'//', 2018:'+', 2019:'\\', 2020:'-', 2021:'o'}
# Multiplied by a row's `total` to obtain the scatter marker size.
MARKER_SCALE = 100

def plot_years(sub_ax, skill, cat):
    """Draw one hatched marker per publication year for a skill/category pair.

    Reads the notebook-level ``skill_cat_s_g`` table, ``hatch_map_year`` and
    ``MARKER_SCALE``.

    Parameters
    ----------
    sub_ax : matplotlib Axes
        Panel to draw into.
    skill : str
        Value to match against ``skill_cat_s_g.skill``.
    cat : str
        Value to match against ``skill_cat_s_g.category``.

    Raises
    ------
    KeyError
        If a matching row has a year that is missing from ``hatch_map_year``.
    """
    matches = skill_cat_s_g[
        (skill_cat_s_g.skill == skill) & (skill_cat_s_g.category == cat)
    ]

    # Set the x-range once, before plotting, so that panels without any
    # matching rows share the same axis as the others (it used to be set only
    # inside the loop and was therefore skipped for empty panels).
    sub_ax.set(xlim=[2016, 2022])

    for _, row in matches.iterrows():
        # One marker per year, sized by the yearly total. Previously `total`
        # identical markers were stacked on the same point.
        sub_ax.scatter(
            [row.year_published],
            [1],
            c='darkgrey',
            hatch=(3 * hatch_map_year[row.year_published]),
            label=row.year_published,
            s=(row.total * MARKER_SCALE),
        )



In [None]:

import numpy as np
import matplotlib.patches as mpatches

# Row and column labels of the grid. They get their own names so that this
# cell no longer overwrites the `skills` DataFrame built earlier in the notebook.
skill_names = ['correctness', 'maintainability', 'readability', 'documentation']
category_names = ['dynamic', 'static', 'machine learning']

# One panel per (skill, category) pair, with no gaps between panels.
fig = plt.figure()
grid = fig.add_gridspec(len(skill_names), len(category_names), wspace=0, hspace=0)

axs = grid.subplots()

for (s, c), ax in np.ndenumerate(axs):
    plot_years(ax, skill_names[s], category_names[c])
    ax.set(xticks=[], yticks=[], xlabel=category_names[c].title())

    # Label the rows on the first column only.
    if c == 0:
        ax.set(ylabel=skill_names[s].title())

    # The year legend goes into the bottom-left panel.
    if c == 0 and s == len(skill_names) - 1:
        # Same face colour as the markers so the legend matches what is drawn.
        patches = [
            mpatches.Patch(label=year, hatch=(3 * hatch), facecolor='darkgrey')
            for year, hatch in hatch_map_year.items()
        ]
        ax.legend(handles=patches, loc='lower left')

fig.supxlabel('Approach Implemented')
fig.supylabel('Skill Graded')
plt.savefig('plots/skill_cat_time_sep.pdf')

# Grading/Feedback Techniques

In [None]:
# Collect every tag that starts with 'technique', strip the 'technique:' marker,
# and count how often each remaining value occurs.
tech_df = pd.Series([
    label.replace('technique:', '')
    for paper_tags in df.tags
    for label in paper_tags
    if label.startswith('technique')
]).value_counts().reset_index()
tech_df.columns = ['technique', 'number']
tech_df['proportion'] = tech_df['number'].div(tech_df['number'].sum()).mul(100)
tech_df

In [None]:
def add_cats(tech):
    """Return the broad category of a technique name.

    Gives ``'dynamic'``, ``'static'`` or ``'other'`` for the names listed
    below, and ``None`` for any name that is not listed.
    """
    groups = (
        ('dynamic', ('unit_testing', 'property_based_testing', 'ci', 'output_matching')),
        ('static', ('pattern_matching', 'static_analysis', 'code_metrics', 'cluster',
                    'style_check', 'program_repair', 'rule_based', 'dsl_rules',
                    'model_solution_req', 'model_solution_closeness',
                    'code_repair_for_feedback')),
        ('other', ('machine_learning', 'misc')),
    )
    for label, members in groups:
        if tech in members:
            return label
    return None

In [None]:
# Attach the broad category of every technique.
tech_df['category'] = tech_df['technique'].apply(add_cats)
tech_df

In [None]:
# Number of techniques that did not receive a category.
tech_df.category.isna().sum()

In [None]:
# List the techniques without a category, if any.
tech_df[tech_df.category.isna()]

In [None]:
# Order by category (descending) and, within a category, by proportion (ascending).
tech_df = tech_df.sort_values(
    by=['category', 'proportion'],
    ascending=[False, True],
)
# Readable labels: underscores become spaces and each word is capitalised.
tech_df['technique'] = (
    tech_df['technique']
    .str.replace('_', ' ')
    .str.title()
)
tech_df.head()

In [None]:
# Horizontal bar chart of the techniques with number > 5, coloured by category.
ax_tech_cat = sns.FacetGrid(tech_df[tech_df.number > 5], hue='category', aspect=3)
ax_tech_cat.map_dataframe(plt.barh, y='technique', width='proportion')

ax_tech_cat.set(xlim=[0, 101], xlabel='Proportion (%)', ylabel='Technique Used To Grade')

hatches = ['//', 'x', '\\']

# Give each bar container its own hatch pattern; zip stops at the shorter sequence.
# NOTE(review): presumably there is one container per category (hue level) — confirm.
for hatch_pattern, these_bars in zip(hatches, ax_tech_cat.ax.containers):
    for this_bar in these_bars:
        # The pattern string is repeated three times before being applied.
        this_bar.set_hatch(3 * hatch_pattern)


ax_tech_cat.add_legend()
plt.savefig('plots/technique_cat.pdf')

# Language

In [None]:
# Tally each combination of language families and express the tallies as percentages.
families = pd.Series(list(df.lang_family)).value_counts().reset_index()
families.columns = ['language_family', 'number']
families['proportion'] = families['number'].div(families['number'].sum()).mul(100)
# Build readable labels, then restore the capitalisation of "OOP" that title-casing changed.
families['language_family'] = families['language_family'].apply(list_formatter)
families['language_family'] = families['language_family'].str.replace('Oop', 'OOP')
families

In [None]:
# Sanity check: total of the per-combination language-family counts.
families.number.sum()

In [None]:
# Horizontal bar chart of language-family proportions, written to a PDF.
ax_lang = sns.barplot(families, x='proportion', y='language_family', color='darkgrey')
ax_lang.set_xlabel('proportion (%)')
ax_lang.set_ylabel('Language Paradigm')
ax_lang.set_xlim([0, 101])
ax_lang.figure.savefig('plots/lang_family.pdf', bbox_inches='tight')

## Evaluation

In [None]:
# Collect every tag that starts with 'evaluation', strip the 'evaluation:'
# marker, and count how often each remaining value occurs.
evaluation = pd.Series([
    label.replace('evaluation:', '')
    for paper_tags in df.tags
    for label in paper_tags
    if label.startswith('evaluation')
]).value_counts().reset_index()
evaluation.columns = ['evaluation', 'number']
evaluation['proportion'] = evaluation['number'].div(evaluation['number'].sum()).mul(100)
# Readable labels: underscores become spaces and each word is capitalised.
evaluation['evaluation'] = (
    evaluation['evaluation']
    .str.replace('_', ' ')
    .str.title()
)
evaluation

In [None]:
## TRUNCATED UNDER 5 RESULTS
ax_eval = sns.barplot(evaluation[evaluation['number'] > 4], y='evaluation', x='proportion', color='darkgrey')
ax_eval.set(xlabel = 'Proportion (%)', ylabel='Evaluation Technique')
plt.xlim([0, 101])
plt.savefig('plots/evaluation.pdf', bbox_inches='tight')

## Data Availability

In [None]:
# Collect every tag that starts with 'data_available', strip the
# 'data_available:' marker, and count how often each remaining value occurs.
data_availability = pd.Series([
    label.replace('data_available:', '')
    for paper_tags in df.tags
    for label in paper_tags
    if label.startswith('data_available')
]).value_counts().reset_index()
data_availability.columns = ['data_available', 'number']
data_availability['proportion'] = (
    data_availability['number']
    .div(data_availability['number'].sum())
    .mul(100)
)
data_availability

In [None]:
# Sanity check: total number of 'data_available' tags counted.
data_availability.number.sum()

In [None]:
# Horizontal bar chart of data-availability proportions, written to a PDF.
ax_da = sns.barplot(data_availability, x='proportion', y='data_available', color='darkgrey')
ax_da.set_xlim([0, 101])
ax_da.set_xlabel('proportion (%)')
ax_da.figure.savefig('plots/data_availability.pdf', bbox_inches='tight')