In [None]:
import pandas as pd

In [None]:
# Load the pre-processed table that every later cell reads as `df`.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle("data/processed.pickle")
df.head(5)

# Discussion Stats
## Grading
### Correctness

In [None]:
# Select the rows whose stringified `skills` is exactly "['correctness']".
correct_df = df.copy(deep=True)
correct_df.skills = correct_df.skills.astype(str)
# Raw string: "\[" and "\]" are regex escapes, not valid Python string escapes.
# The trailing .copy() makes correct_df an independent frame, so the column
# assignments below (and in later cells) do not write into a slice of another
# frame and raise SettingWithCopyWarning.
correct_df = correct_df[correct_df.skills.str.contains(r"^\['correctness'\]$")].copy()
# Keep only the tags that start with 'technique'.
correct_df['tech'] = correct_df.tags.apply(lambda labels: [label for label in labels if label.startswith('technique')])
correct_df[['key', 'title', 'skill_cat', 'tech']]

In [None]:
# Percentage of all rows in `df` that ended up in `correct_df`.
len(correct_df) / len(df) * 100

In [None]:
# Row count per distinct `skill_cat` value within `correct_df`.
correct_df['skill_cat'].value_counts()

In [None]:
# Percentage of `correct_df` rows per distinct `category` value.
correct_df['category'].value_counts() / len(correct_df) * 100

In [None]:
# Flag rows whose stringified category contains 'dynamic' but neither 'static' nor 'ml'.
# The mask is computed on the full `df`; assignment aligns it to correct_df by index.
# NOTE(review): these are substring tests on str(category) -- confirm no category
# text contains 'ml' by accident.
correct_df['only_dynamic'] = df['category'].apply(
    lambda cat: 'dynamic' in str(cat) and not ('static' in str(cat) or 'ml' in str(cat))
)
# Percentage of the flagged rows that carry each technique tag.
(
    correct_df.loc[correct_df['only_dynamic']].explode('tech')['tech'].value_counts()
    / len(correct_df.loc[correct_df['only_dynamic']])
) * 100

In [None]:
# Flag rows whose stringified category contains 'static' but neither 'dynamic' nor 'ml'.
# The mask is computed on the full `df`; assignment aligns it to correct_df by index.
correct_df['only_static'] = df['category'].apply(
    lambda cat: 'static' in str(cat) and not ('dynamic' in str(cat) or 'ml' in str(cat))
)
# Percentage of the flagged rows that carry each technique tag.
(
    correct_df.loc[correct_df['only_static']].explode('tech')['tech'].value_counts()
    / len(correct_df.loc[correct_df['only_static']])
) * 100

In [None]:
# Flag rows whose stringified category contains both 'dynamic' and 'static' but not 'ml'.
# The mask is computed on the full `df`; assignment aligns it to correct_df by index.
correct_df['dynamic_static'] = df['category'].apply(
    lambda cat: all(word in str(cat) for word in ('dynamic', 'static')) and 'ml' not in str(cat)
)
# Percentage of the flagged rows that carry each technique tag.
(
    correct_df.loc[correct_df['dynamic_static']].explode('tech')['tech'].value_counts()
    / len(correct_df.loc[correct_df['dynamic_static']])
) * 100

In [None]:
# Flag rows whose stringified category contains the substring 'ml'.
# NOTE(review): substring test -- any category text containing 'ml' matches; confirm.
correct_df['ml'] = df['category'].apply(lambda cat: 'ml' in str(cat))
# Keep the flagged rows and show their identifying columns.
ml_df = correct_df[correct_df['ml']]
ml_df[['key', 'title', 'category']]

### Maintainability

In [None]:
# Select the rows whose stringified `skills` is exactly "['maintainability']".
maint_df = df.copy(deep=True)
maint_df.skills = maint_df.skills.astype(str)
# Raw string: "\[" and "\]" are regex escapes, not valid Python string escapes.
# .copy() keeps maint_df independent so later column assignments are safe.
maint_df = maint_df[maint_df.skills.str.contains(r"^\['maintainability'\]$")].copy()
maint_df

In [None]:
# Collect, per row, the tags that start with 'technique'.
maint_df['tech'] = maint_df['tags'].apply(
    lambda tags: [tag for tag in tags if tag.startswith('technique')]
)
# Percentage of maint_df rows that carry each technique tag.
maint_df.explode('tech')['tech'].value_counts() / len(maint_df) * 100

### Readability

In [None]:
# Author's note: "Analysing hint based problem solving strategy among novice
# programmers through gamification technique" - might be removed.
# Select the rows whose stringified `skills` is exactly "['readability']".
read_df = df.copy(deep=True)
read_df.skills = read_df.skills.astype(str)
# Raw string: "\[" and "\]" are regex escapes, not valid Python string escapes.
read_df = read_df[read_df.skills.str.contains(r"^\['readability'\]$")]
read_df[['key', 'title', 'category']]

In [None]:
# Percentage of all rows in `df` that ended up in `read_df`.
len(read_df) / len(df) * 100

In [None]:
# Row count per distinct `category` value within `read_df`.
read_df['category'].value_counts()

### Correctness, Documentation

In [None]:
# Show the rows whose stringified `skills` is exactly "['correctness', 'documentation']".
doc_df = df.copy(deep=True)
doc_df.skills = doc_df.skills.astype(str)
# Raw string: "\[" and "\]" are regex escapes, not valid Python string escapes.
doc_df[doc_df.skills.str.contains(r"^\['correctness', 'documentation'\]$")]

### Combination Graders

In [None]:
# Percentage of rows per distinct `skills` value.
df['skills'].value_counts() / len(df) * 100

In [None]:
# Percentage of rows per distinct `skill_cat` value.
df['skill_cat'].value_counts() / len(df) * 100

In [None]:
# Show the rows whose stringified `skills` is exactly
# "['correctness', 'readability', 'maintainability']".
combo_df = df.copy(deep=True)
combo_df.skills = combo_df.skills.astype(str)
# Raw string with both brackets escaped, matching the other skill patterns in
# this notebook; "\[" / "\]" are regex escapes, not Python string escapes.
combo_df[combo_df.skills.str.contains(r"^\['correctness', 'readability', 'maintainability'\]$")]

### Code Repair papers

In [None]:
# Per row, keep the tags mentioning 'code_repair' or 'program_repair';
# rows without any such tag get None instead of an empty list.
cr_df = df.copy()
cr_df['tech'] = cr_df['tags'].apply(
    lambda tags: [tag for tag in tags if 'code_repair' in tag or 'program_repair' in tag] or None
)
# Show only the rows that have at least one repair-related tag.
cr_df[cr_df['tech'].notna()]

### Data for supplementary material

In [None]:
# One row per (key, skill_cat) pair, ordered by skill_cat.
df.loc[:, ['key', 'skill_cat']].explode('skill_cat').sort_values(by='skill_cat')

## Techniques
### Data for supplementary material

In [None]:
def add_cats(tech):
    """Map a technique name to its category label.

    Returns 'dynamic' or 'static' when `tech` equals one of the names listed
    for that category, and 'other' for any other value.
    """
    groups = (
        ('dynamic', ['unit_testing', 'property_based_testing', 'ci', 'output_matching']),
        ('static', ['pattern_matching', 'static_analysis', 'code_metrics', 'cluster',
                    'style_check', 'program_repair', 'rule_based', 'dsl_rules',
                    'model_solution_req', 'model_solution_closeness',
                    'code_repair_for_feedback']),
    )
    # Lists (not sets) so that unhashable or NaN inputs still fall through to 'other'.
    for category, members in groups:
        if tech in members:
            return category
    return 'other'

In [None]:
# Build one row per (paper, technique tag) and bucket each technique via add_cats.
tech_df = df.copy()
# Keep only the tags that start with 'technique', as strings.
tech_df.tags = tech_df.tags.apply(lambda labels: [str(label) for label in labels if label.startswith('technique')])
tech_df = tech_df.explode('tags')
# regex=False: 'technique:' is a literal, and the default of `regex` differs
# between pandas versions, so state it explicitly.
tech_df.tags = tech_df.tags.str.replace('technique:', '', regex=False)
# Overwrites the `category` column copied from df with the technique bucket.
tech_df['category'] = tech_df.tags.apply(add_cats)
tech_df[["key", 'category', "tags"]].sort_values(by=['category', 'tags'])

## Degree of Automation

In [None]:
# One row per (paper, approach tag): keep the tags starting with 'approach'
# and explode them into separate rows.
doa_df = df.copy(deep=True)
doa_df['approach'] = doa_df['tags'].apply(
    lambda tags: [str(tag) for tag in tags if tag.startswith('approach')]
)
doa_df = doa_df.explode(column='approach')
# Row count per approach tag.
doa_df['approach'].value_counts()

In [None]:
# Percentage of doa_df rows per approach tag.
# NOTE(review): doa_df is already exploded, so the denominator counts
# (paper, approach) rows rather than papers -- confirm this is intended.
doa_df['approach'].value_counts() / len(doa_df) * 100

In [None]:
# Percentage of doa_df rows per `skills` value among rows whose approach contains 'fully'.
# na=False: exploding an empty approach list yields NaN, and a mask containing
# NaN cannot be used for boolean indexing.
(doa_df[doa_df.approach.str.contains('fully', na=False)].skills.value_counts() / doa_df.shape[0]) * 100

#### CI/CD

In [None]:
# Rows of the tag-exploded frame whose tag contains 'ci_cd'.
# The frame is exploded once and filtered through a callable, instead of
# exploding `df` twice. na=False treats missing tags (from empty tag lists) as
# non-matches so the mask stays boolean.
df.explode('tags').loc[lambda exploded: exploded['tags'].str.contains('ci_cd', na=False)]

#### Semi

In [None]:
# Tag frequencies among rows whose approach contains 'semi'.
# na=False: rows with no approach tag hold NaN after the explode, and a mask
# containing NaN cannot be used for boolean indexing.
doa_df[doa_df.approach.str.contains('semi', na=False)].explode('tags').tags.value_counts()

### Data for supplementary material

In [None]:
# (key, approach) pairs ordered by approach.
doa_df.loc[:, ['key', 'approach']].sort_values('approach')

## Language Paradigms

In [None]:
# Work on a separate copy; count rows per distinct `lang_family` value.
lp_df = df.copy(deep=True)
lp_df['lang_family'].value_counts()

In [None]:
# Percentage of rows per distinct `lang_family` value.
lp_df['lang_family'].value_counts() / len(lp_df) * 100

In [None]:
# Stringify `lang_family` on a copy so a substring search can be run on it,
# then show the rows that mention 'Query'.
lp_df_query = lp_df.copy(deep=True)
lp_df_query['lang_family'] = lp_df_query['lang_family'].astype(str)
lp_df_query.loc[lp_df_query['lang_family'].str.contains('Query')]

### Data for supplementary material

In [None]:
# One row per (key, lang_family) pair, ordered by lang_family.
lp_df[['key', 'lang_family']].explode('lang_family').sort_values('lang_family')

## Evaluation

In [None]:
# Reduce each row's tags to those starting with 'evaluation', then report the
# percentage of rows per distinct tag combination.
eval_df = df.copy(deep=True)
eval_df['tags'] = eval_df['tags'].apply(
    lambda tags: [str(tag) for tag in tags if tag.startswith('evaluation')]
)
eval_df['tags'].value_counts() / len(eval_df) * 100

In [None]:
# From here on eval_df holds one row per (paper, evaluation tag).
eval_df = eval_df.explode(column='tags')
# Percentage of the exploded rows per individual evaluation tag.
# NOTE(review): the denominator is the exploded row count, not the number of
# papers -- confirm this is intended.
eval_df['tags'].value_counts() / len(eval_df) * 100

In [None]:
# Rows whose evaluation tag contains 'manual_grading'.
# na=False: rows without an evaluation tag hold NaN after the explode, and a
# mask containing NaN cannot be used for boolean indexing.
eval_df[eval_df.tags.str.contains('manual_grading', na=False)]

In [None]:
# Rows whose evaluation tag contains 'none'.
# na=False: rows without an evaluation tag hold NaN after the explode, and a
# mask containing NaN cannot be used for boolean indexing.
eval_df[eval_df.tags.str.contains('none', na=False)]

### Evaluation Only

In [None]:
# Rows whose `types` holds exactly one entry and that entry is 'evaluation'.
# Written as a single predicate rather than `len_mask == 1 & other_mask`:
# `&` binds tighter than `==`, so that form is parsed as
# `len_mask == (1 & other_mask)`. The length check also short-circuits, so an
# empty `types` value is never indexed.
eval_df[eval_df.types.map(lambda t: len(t) == 1 and t[0] == 'evaluation').astype(bool)]

### Data for supplementary material

In [None]:
# (key, evaluation tag) pairs ordered by tag.
eval_df.loc[:, ['key', 'tags']].sort_values('tags')

## Data Availability
### Data for supplementary material

In [None]:
# Reduce each row's tags to those starting with 'data_available' and list
# them per key, ordered by the resulting tag list.
data_df = df.copy(deep=True)
data_df['tags'] = data_df['tags'].apply(
    lambda tags: [str(tag) for tag in tags if tag.startswith('data_available')]
)
data_df.loc[:, ['key', 'tags']].sort_values('tags')