# Template errors search analysis

## Settings

In [None]:
import ast
from random import sample, seed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Paths to the two input CSV files; edit them to point at your own data
# (or keep the preset defaults).

# Issues ranking table, read into `df_ranking`.
ranking_file_path = '~/Desktop/data/output/java/ranking_char_by_char.csv'
# Steps table, read into `df_steps`.
steps_file_path = '~/Desktop/data/output/java/steps_with_groups_cnt.csv'

In [None]:
# Reading and preparing data

# Load both tables from the configured CSV paths.
df_ranking = pd.read_csv(ranking_file_path)
df_steps = pd.read_csv(steps_file_path)

# The grouping is taken from the frame as read, i.e. before the sort below,
# so rows inside each per-step group keep the order they have in the CSV.
df_ranking_by_step = df_ranking.groupby('step_id')
# Rebind `df_ranking` to a copy sorted by issue class; the grouping above
# still refers to the unsorted frame.
df_ranking = df_ranking.sort_values(by='origin_class')

In [None]:
# Fix the seed of Python's `random` module so that `sample` draws are
# reproducible between runs.
seed(24)


def count_commits(df_issues, df_steps):
    """Compute the commit count of every issue class found in ``df_issues``.

    For each row the value ``round(groups_cnt * frequency)`` is computed, where
    ``groups_cnt`` is looked up in ``df_steps`` by the row's ``step_id``; the
    values are then summed per ``origin_class``.

    :param df_issues: frame with ``origin_class``, ``step_id`` and ``frequency`` columns
    :param df_steps: frame with ``id`` and ``groups_cnt`` columns
    :return: dict mapping each distinct ``origin_class`` to its summed count
    """
    # Lookup table: step id -> number of submission groups of that step.
    step_groups = pd.Series(df_steps['groups_cnt'].values, index=df_steps['id'])

    def groups_with_issue(row):
        return round(step_groups[row['step_id']] * row['frequency'])

    issue_classes = df_issues['origin_class']
    return {
        issue_class: df_issues[issue_classes == issue_class].apply(groups_with_issue, axis=1).sum()
        for issue_class in issue_classes.unique()
    }


def get_commit_cnt(issue, df_issues, df_steps):
    """Return the commit count of the single issue class ``issue``.

    The counts of all classes in ``df_issues`` are recomputed through
    ``count_commits`` on every call. Raises ``KeyError`` when ``issue`` is not
    among the classes of ``df_issues``.
    """
    return count_commits(df_issues, df_steps)[issue]


def get_step_difficulty(step_id, df_steps):
    """Return the ``difficulty`` of the first row in ``df_steps`` whose ``id`` equals ``step_id``.

    Raises ``IndexError`` when no row matches.
    """
    matching_rows = df_steps.loc[df_steps['id'] == step_id]
    first_match = matching_rows.iloc[0]
    return first_match['difficulty']

In [None]:
# Configuring plots

# One fixed colour per difficulty level, shared by the "by difficulty" plots.
# `zip` stops after the three level names, so only the leading palette colours are used.
diff_palette = {
    level: colour
    for level, colour in zip(['hard', 'medium', 'easy'], sns.color_palette("nipy_spectral_r").as_hex())
}


def draw_occurrence_cnt(df_issues, title):
    """Show a log-scale count plot of issue classes, most frequent class first.

    :param df_issues: frame with an ``origin_class`` column; every row counts once
    :param title: title of the plot
    """
    plt.figure(figsize=(30, 7))
    issue_classes = df_issues['origin_class']
    # `value_counts` sorts by descending count, which fixes the bar order.
    classes_by_count = issue_classes.value_counts().index
    axes = sns.countplot(x=issue_classes, order=classes_by_count)
    axes.set_yscale("log")
    plt.xticks(rotation=45, ha="right", rotation_mode='anchor')
    plt.ylabel('occurrence count')
    plt.title(title)
    plt.show()


def draw_commit_cnt(df_issues, df_issues_steps, title):
    """Show a log-scale bar plot of commit counts per issue class, largest first.

    :param df_issues: frame with ``origin_class``, ``step_id`` and ``frequency`` columns
    :param df_issues_steps: steps frame passed on to ``count_commits``
    :param title: title of the plot
    """
    commits_by_class = count_commits(df_issues, df_issues_steps)

    # One entry per row of `df_issues`, so a class repeats as often as it occurs.
    class_per_row = np.array(df_issues['origin_class'].values)
    commits_per_row = np.array([commits_by_class[issue_class] for issue_class in class_per_row])
    descending = np.argsort(commits_per_row)[::-1]

    plt.figure(figsize=(30, 7))
    axes = sns.barplot(x=class_per_row[descending], y=commits_per_row[descending])
    axes.set_yscale("log")
    plt.xticks(rotation=45, ha="right", rotation_mode='anchor')
    plt.xlabel('origin_class')
    plt.ylabel('commit count')
    plt.title(title)
    plt.show()


def draw_step_stats(step, df_issues, df_issues_steps):
    """Show occurrence and commit counts of one step's issue classes as two line plots.

    :param step: step id, used for the title and the difficulty lookup
    :param df_issues: ranking rows of that step
    :param df_issues_steps: steps frame (``id``, ``groups_cnt``, ``difficulty`` columns are read)
    """
    commits_by_class = count_commits(df_issues, df_issues_steps)
    class_per_row = df_issues['origin_class'].values
    occurrences_by_class = df_issues['origin_class'].value_counts()

    series_to_plot = (
        ([occurrences_by_class[issue_class] for issue_class in class_per_row], 'occurrence count'),
        ([commits_by_class[issue_class] for issue_class in class_per_row], 'commit count'),
    )

    plt.figure(figsize=(20, 7))
    for values, label in series_to_plot:
        sns.lineplot(x=class_per_row, y=values, marker='o', label=label)
    plt.xticks(rotation=45, ha="right", rotation_mode='anchor')
    difficulty = get_step_difficulty(step, df_issues_steps)
    plt.title(f'Occurrence and commit counts for step {step} (difficulty: {difficulty})')
    plt.legend()
    plt.show()


def draw_issues_disribution(step, df_issues, df_issues_steps):
    """Show a scatter plot of issue frequencies for one step, highest frequency first.

    Rows with a ``pos_in_template`` value are all kept; rows without one are
    de-duplicated on (``origin_class``, ``frequency``, ``pos_in_template``).
    Colour encodes the issue class, marker style encodes whether
    ``pos_in_template`` is set.

    :param step: step id, used for the title and the difficulty lookup
    :param df_issues: ranking rows of that step
    :param df_issues_steps: steps frame used for the difficulty lookup
    """
    has_position = df_issues['pos_in_template'].notna()
    template_rows = df_issues[has_position]
    other_rows = df_issues[~has_position].drop_duplicates(
        subset=['origin_class', 'frequency', 'pos_in_template'])

    points = pd.concat([other_rows, template_rows], ignore_index=True)
    points = points.sort_values(by=['frequency'], ascending=False)

    plt.figure(figsize=(20, 7))
    sns.scatterplot(x=range(len(points)), y=points['frequency'], hue=points['origin_class'],
                    style=points['pos_in_template'].notna(), s=100, edgecolor='black')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    difficulty = get_step_difficulty(step, df_issues_steps)
    plt.title(f'Issues distribution for step {step} (difficulty: {difficulty})')
    plt.show()


def draw_occurence_cnt_by_difficulty(df_issues, df_steps, title):
    """Show a log-scale count plot of issue classes, split by step difficulty.

    :param df_issues: frame with ``origin_class`` and ``step_id`` columns
    :param df_steps: steps frame used to look up each row's difficulty
    :param title: title of the plot
    """
    plt.figure(figsize=(30, 7))
    issue_classes = df_issues['origin_class']
    # Difficulty of the step each row belongs to; drives the bar colour.
    difficulty_per_row = df_issues['step_id'].map(lambda step_id: get_step_difficulty(step_id, df_steps))
    axes = sns.countplot(
        x=issue_classes,
        hue=difficulty_per_row,
        hue_order=['easy', 'medium', 'hard'],
        palette=diff_palette,
        order=issue_classes.value_counts().index,
    )
    axes.set_yscale("log")
    plt.xticks(rotation=45, ha="right", rotation_mode='anchor')
    plt.ylabel('occurrence count')
    plt.title(title)
    plt.legend(loc='upper right')
    plt.show()


def draw_commit_cnt_by_difficulty(df_issues, df_issues_steps, title):
    """Show a log-scale bar plot of commit counts per issue class, split by step difficulty.

    For every row of ``df_issues`` the plotted value is the commit count of the
    row's issue class, computed only over steps that share the difficulty of
    the row's step.

    :param df_issues: frame with ``origin_class``, ``step_id`` and ``frequency`` columns
    :param df_issues_steps: steps frame with ``id``, ``groups_cnt`` and ``difficulty`` columns
    :param title: title of the plot
    """
    issues = np.array(df_issues['origin_class'].values)
    # Difficulty of the step each row belongs to (same row order as `issues`).
    difficulties = df_issues['step_id'].map(lambda x: get_step_difficulty(x, df_issues_steps)).values

    # Commit counts restricted to the steps of one difficulty level. They are
    # computed once per level instead of once per row of `df_issues`.
    cnts_by_difficulty = {}
    for difficulty in set(difficulties):
        step_ids = df_issues_steps[df_issues_steps['difficulty'] == difficulty]['id']
        cnts_by_difficulty[difficulty] = count_commits(
            df_issues[df_issues['step_id'].isin(step_ids)], df_issues_steps)

    cnts = np.array([cnts_by_difficulty[difficulty][issue]
                     for issue, difficulty in zip(issues, difficulties)])

    # Sort by descending commit count. The same permutation must be applied to
    # x, y AND hue, otherwise bars are coloured with another row's difficulty.
    idx = np.argsort(cnts)[::-1]
    issues = issues[idx]
    cnts = cnts[idx]
    hue = difficulties[idx]

    plt.figure(figsize=(30, 7))
    # NOTE(review): newer seaborn releases deprecate `ci` in favour of
    # `errorbar` — confirm against the installed version before changing it.
    g = sns.barplot(x=issues, y=cnts, hue=hue, hue_order=['easy', 'medium', 'hard'], palette=diff_palette, ci=None)
    g.set_yscale("log")
    plt.xticks(rotation=45, ha="right", rotation_mode='anchor')
    plt.xlabel('origin_class')
    plt.ylabel('commit count')
    plt.title(title)
    plt.show()


class bcolors:
    """ANSI escape sequences used to colour and style terminal output."""

    # Text styles
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

    # Resets all styles and colours set by the sequences above
    ENDC = '\033[0m'

## Steps dataset description

A step is a task in which the user is asked to write some code.

Attributes considered in the analysis:
* `id` - *identifier of the step*
* `difficulty` - [`easy`, `medium`, `hard`] *according to success_rate [sr<=0.33, 0.33<sr<0.66, sr>=0.66]*
* `groups_cnt` - *number of submission series (group of submissions by one user) used in errors search*

## Ranking dataset description

Issues ranking is the result of running template errors search algorithm on submissions.

Ranking consists of blocks, each containing a sequence of issues for a single step sorted by frequency of being uncorrected.

An issue is considered uncorrected in a submission group if it appears in every submission of the group.
The frequency of being uncorrected is then the number of submission groups in which the issue was not corrected divided by the total number of groups for the step (`groups_cnt` in terms of step attributes).

Attributes considered in the analysis:
* `step_id` - *identifier of corresponding step*
* `origin_class` - *class of the issue*
* `frequency` - *fraction of submission groups in which the issue was not corrected (from 0 to 1)*
* `pos_in_template` - *position in corresponding template (`None` if no template line matched the issue line)*

## Generalized statistics

**Occurrence count** for a specific issue class in a ranking block is the number of different issues with this issue class within the block.

**Commit count** for a specific issue class in a ranking block is the number of times an issue with this issue class appeared among submissions within the block.
In other words, the commit count is the sum, over every issue with this issue class, of the number of submission groups in which that issue was not corrected.

Histograms below present occurrence and commit counts summed over all ranking blocks.

In [None]:
# Totals over the whole ranking: occurrence counts first, then commit counts.
draw_occurrence_cnt(df_ranking, title='Total occurrence counts')
draw_commit_cnt(df_ranking, df_steps, title='Total commit counts')

In [None]:
# Template errors are the ranking rows that carry a template position
# (non-null `pos_in_template`); they are ordered by issue class for plotting.
df_template_errors = df_ranking[df_ranking['pos_in_template'].notna()].sort_values(by='origin_class')

draw_occurrence_cnt(df_template_errors, title='Total occurrence count for template errors')
draw_occurence_cnt_by_difficulty(
    df_template_errors, df_steps, title='Total occurrence count for template errors by step difficulty')

draw_commit_cnt(df_template_errors, df_steps, title='Total commit count for template errors')
draw_commit_cnt_by_difficulty(
    df_template_errors, df_steps, title='Total commit count for template errors by step difficulty')

## Statistics by step

For demonstration purposes, graphs below represent statistics for individual steps, selected randomly.

In [None]:
# Pick up to 8 distinct steps at random. The sample size is capped at the
# number of available steps because `sample` raises ValueError when asked for
# more items than the population holds.
step_ids = df_ranking['step_id'].unique().tolist()
random_steps = sample(step_ids, min(8, len(step_ids)))

### Occurrence and commit counts by step

In [None]:
# One occurrence/commit-count plot per randomly selected step.
for step_id in random_steps:
    # Ranking rows that belong to this step only.
    df = df_ranking_by_step.get_group(step_id)
    draw_step_stats(step_id, df, df_steps)

### Detailed issues distribution by step

The plots below present the issues distribution for a specified step. Every point represents an issue, with its **index number among considered issues** on the x-axis and its **frequency** on the y-axis.
Color of the point specifies issue class, marker of the point specifies whether issue is considered as template error or not.

*Note: for the convenience of analysis, errors not from the template with the same class and frequency were excluded from consideration.*

In [None]:
# One issues-distribution scatter plot per randomly selected step.
for step_id in random_steps:
    # Ranking rows that belong to this step only.
    df = df_ranking_by_step.get_group(step_id)
    draw_issues_disribution(step_id, df, df_steps)

### Code template marked with detected errors by step

Output below represents code templates of selected steps with detected errors marked in comments.
Here **ISSUE** is for `origin_class` and **FREQ** is for `frequency`.

In [None]:
# Parse the stringified templates into Python objects. Values that are not
# strings are left untouched, so re-running this cell on already parsed
# templates does not raise inside `ast.literal_eval`.
df_steps['code_templates'] = df_steps['code_templates'].map(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
# Print each selected step's code template, placing one comment line per
# detected issue directly above the template line it was matched to.
for step_id in random_steps:
    df = df_ranking_by_step.get_group(step_id)
    template = df_steps[df_steps['id'] == step_id].iloc[0]['code_templates']

    lines = [
        f'{bcolors.BOLD}================================= STEP_ID={step_id} ================================={bcolors.ENDC}']
    for i, template_line in enumerate(template):
        # Issues whose template position is this line's index.
        df_cur = df[df['pos_in_template'] == i]
        lines.extend(
            f'{bcolors.FAIL}// ISSUE: {issue}, FREQ: {freq:.3f}{bcolors.ENDC}'
            for issue, freq in df_cur[['origin_class', 'frequency']].values)
        lines.append(template_line)

    # Every line, including the last one, is newline-terminated.
    output = '\n'.join(lines) + '\n'
    print(output)