In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; `process_sloc` below calls it.
tqdm.pandas()


# Example Analysis of Dataset

In [None]:
# Per-file submission records; the cells below read the `dir`, `file_name` and `src` columns.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
submissions_df = pd.read_pickle('../data/analysis_df.pickle')

# Exclude directories that require manual data cleaning before release
exclude = ['../data/proc/18~19/18~19_Submission_686']
submissions_df = submissions_df.loc[~submissions_df['dir'].isin(exclude)]

submissions_df.head()

In [None]:
# Load the template project's files; they are run through the same process_* functions
# as the submissions in the cells below.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
template_df = pd.read_pickle('../data/template_df.pickle')
template_df.head()

### Count of submissions

In [None]:
# Number of distinct submission directories (one directory per submission)
submissions_df['dir'].nunique()

### Number of classes
#### Split categories

In [None]:
# Heuristic patterns for Java type declarations. Each one matches a declaration
# header followed by its opening brace, either on the same line or on the next line.
public_class_regex = r'.*public class.*\n?{'
private_class_regex = r'.*private class.*\n?{'
protected_class_regex = r'.*protected class.*\n?{'
# `(?m)` makes `^` match at the start of every line. Without it `^` only matches at the
# very start of the file, so a package-private class was counted only when the file
# literally began with `class`.
class_regex = r'(?m)^class.*\n?{'
abstract_class_regex = r'.*abstract class.*\n?{'
enum_regex = r'.*enum.*\n?{'
interface_regex = r'.*interface.*\n?{'

def process_classes(df):
    """Count Java type declarations per source file using the regex heuristics above.

    Args:
        df: DataFrame with a `src` column holding the source text of one file per row
            and a `file_name` column. The input frame is not modified.

    Returns:
        A copy of `df` without the `file_name` and `src` columns and with one
        `no_*` count column per pattern plus `total`, the row-wise sum of those counts.

    Note:
        The counts are numbers of textual regex matches, not parsed declarations.
        NOTE(review): the class-type box plot places its reference markers at
        hard-coded row positions; re-check them if the set of non-zero class types changes.
    """
    # Output column -> pattern, in the column order the rest of the notebook expects
    patterns = {
        'no_public_classes': public_class_regex,
        'no_protected_classes': protected_class_regex,
        'no_private_classes': private_class_regex,
        'no_package_private_classes': class_regex,
        'no_abstract_classes': abstract_class_regex,
        'no_enums': enum_regex,
        'no_interfaces': interface_regex,
    }

    proc_df = df.copy()

    for column, pattern in patterns.items():
        # Bind `pattern` as a default argument so each lambda keeps its own regex
        proc_df[column] = proc_df.src.apply(lambda src, pattern=pattern: len(re.findall(pattern, src)))

    proc_df['total'] = proc_df[list(patterns)].sum(axis=1)

    return proc_df.drop(['file_name', 'src'], axis=1)

In [None]:
def group_stack_classes(df):
    """Aggregate per-file counts to one row per (directory, count column).

    Sums every non-key column of `df` per `dir`, discards columns that are zero
    for all directories, and returns the result in long format with the columns
    `dir`, `class_type` and `class_count`.
    """
    per_dir = df.groupby('dir').sum()

    # Keep only the columns that are non-zero for at least one directory
    used_columns = per_dir.ne(0).any(axis=0)
    long_form = per_dir.loc[:, used_columns].stack().reset_index()
    long_form.columns = ['dir', 'class_type', 'class_count']

    return long_form

In [None]:
# Per-file class counts and their per-directory long form, for the submissions ...
sub_classes = process_classes(submissions_df)
stacked_sub_classes = group_stack_classes(sub_classes)

# ... and for the template
template_classes = process_classes(template_df)
stacked_template_classes = group_stack_classes(template_classes)

stacked_sub_classes.tail(10)

In [None]:
# First rows of the template's per-directory class counts (long format)
stacked_template_classes.head(10)

In [None]:
# Drop the aggregate 'total' rows so only the individual class categories remain
stacked_sub_classes = stacked_sub_classes[stacked_sub_classes['class_type'] != 'total'].copy()

stacked_sub_classes['source'] = 'submissions'
stacked_template_classes['source'] = 'template'

# Turn raw column names into display labels, e.g. 'no_public_classes' -> 'Public Classes'.
# Only names that still carry the 'no_' prefix are rewritten, so re-running this cell
# no longer turns already-converted labels into empty strings.
stacked_sub_classes['class_type'] = stacked_sub_classes['class_type'].apply(
    lambda class_type: " ".join(class_type.split('_')[1:]).title() if class_type.startswith('no_') else class_type
)

In [None]:
# Show the template's class counts; the plot cell that follows says its hard-coded
# markers were copied by hand from "the cell above", i.e. from this output.
stacked_template_classes.head()

In [None]:
# Distribution of per-submission class counts, one horizontal box per class type
ax = sns.boxplot(data=stacked_sub_classes, y='class_type', x='class_count')
ax.set(ylabel='Class Type', xlabel='Total Classes')

# Manually done using the results from the cell above:
# row position on the y axis -> hand-copied x value for the red reference marker
# (presumably the template's counts -- TODO confirm against the template table)
for row, marker_x in {0: 9, 4: 1}.items():
    ax.vlines(marker_x, row - 0.5, row + 0.5, color='red')

plt.tight_layout()
plt.savefig('plots/classes_sep.png')

In [None]:
# Summary statistics of the per-submission counts, per class type
stacked_sub_classes.groupby("class_type").describe().unstack()

In [None]:
# Median per-submission count for each class type
stacked_sub_classes.groupby("class_type")['class_count'].median()

In [None]:
# Same statistics for the template. Its class_type values are still the raw
# column names and include the 'total' rows (only the submissions frame was relabelled).
stacked_template_classes.groupby("class_type").describe().unstack()

In [None]:
# Median count per class type for the template
stacked_template_classes.groupby("class_type")['class_count'].median()

#### Total classes

In [None]:
# Keep only the directory key and the per-file total; copy so the tagging below
# does not touch the frames they were sliced from
sub_classes_total = sub_classes.loc[:, ['dir', 'total']].copy()
template_classes_total = template_classes.loc[:, ['dir', 'total']].copy()

sub_classes_total.head()

In [None]:
# Tag each frame with its origin so the rows stay distinguishable after concatenation
sub_classes_total['source'] = 'submissions'
template_classes_total['source'] = 'template'

classes_total = pd.concat([sub_classes_total, template_classes_total])

# Sum the per-file totals into one row per (source, directory)
temp_df = (
    classes_total
    .groupby(['source', 'dir'])
    .sum()
    .reset_index()
)
temp_df.head()

In [None]:
# Distribution of total classes per submission
ax = sns.boxplot(data=temp_df.loc[temp_df['source'] == 'submissions'], x='total')
ax.set(xlabel='Total Classes')

# Red reference line at the total of the first template row
ax.axvline(temp_df.loc[temp_df['source'] == 'template', 'total'].iloc[0], color='red')

plt.savefig('plots/classes_total.png')

In [None]:
# Summary statistics of the total class count per submission directory
sub_classes_total[['dir', 'total']].groupby('dir').sum().describe()

In [None]:
# Median total class count per submission directory
sub_classes_total[['dir', 'total']].groupby('dir').sum().median()

In [None]:
# Same statistics for the template
template_classes_total[['dir', 'total']].groupby('dir').sum().describe()

In [None]:
# Median total class count for the template
template_classes_total[['dir', 'total']].groupby('dir').sum().median()

### Lines of Code (LOC)

In [None]:
# Heuristic patterns for the line counts below. Raw strings keep the regex escapes
# (`\*`, `\{`, `\s`) from being read as (invalid) Python string escapes.
javadoc_regex = r'/\*\*[^\{\}]*\*/'
comment_regex = r'/\*[^\*][^\{\}]*\*/|//' # Including multiline and single line comments
sloc_regex = r';|(\n?\s*{)|}'

def process_sloc(df):
    """Compute heuristic line counts for each source file.

    Requires `tqdm.pandas()` to have been called, because `progress_apply` is used.

    Args:
        df: DataFrame with a `src` column holding the source text of one file per row.
            The input frame is not modified.

    Returns:
        A copy of `df` without the `src` column and with these added columns:
        - `raw_lines`: number of newline-separated lines in the file
        - `comment_lines`: for each `comment_regex` match, the number of pieces
          obtained by splitting the match on '*', summed over the file
        - `javadoc_lines`: for each `javadoc_regex` match, the number of
          '*'-separated pieces minus 2, summed over the file
        - `sloc`: number of `sloc_regex` matches (physical SLOC)
        - `whitespace`: number of blank lines minus 1, never below 0
    """
    df_loc = df.copy()

    df_loc['raw_lines'] = df.src.apply(lambda src: len(src.split('\n')))
    # The counts below do not sum to raw_lines: a statement followed by "\n{" counts as
    # one line rather than two, and a comment at the end of a source line is also counted.
    # Adjust for "/*\n" counting as two lines
    df_loc['comment_lines'] = df.src.progress_apply(
        lambda src: sum(len(comment.split('*')) for comment in re.findall(comment_regex, src))
    )
    # Adjust for "/**\n" counting as three lines
    df_loc['javadoc_lines'] = df.src.progress_apply(
        lambda src: sum(len(docstring.strip('\n').split('*')) - 2 for docstring in re.findall(javadoc_regex, src))
    )
    # Physical SLOC
    df_loc['sloc'] = df.src.progress_apply(lambda src: len(re.findall(sloc_regex, src)))
    # Blank lines minus one, clamped at zero in the same pass
    df_loc['whitespace'] = df.src.progress_apply(
        lambda src: max(sum(1 for line in src.split('\n') if len(line.strip()) == 0) - 1, 0)
    )

    return df_loc.drop(['src'], axis=1)

In [None]:
# Per-file line counts for the submissions
submission_sloc = process_sloc(submissions_df)
submission_sloc.head()

In [None]:
# Per-file line counts for the template
template_sloc = process_sloc(template_df)
template_sloc.head()

#### Project Level

In [None]:
def stack_loc(df):
    """Sum the line-count columns per directory and return them in long format.

    Count columns that are zero for every directory are dropped. The result has
    the columns `dir`, `count_type` and `count`.
    """
    count_columns = ['raw_lines', 'comment_lines', 'javadoc_lines', 'sloc', 'whitespace']
    per_dir = df[['dir', *count_columns]].groupby('dir').sum()

    # Keep only the count types that occur in at least one directory
    used_columns = per_dir.ne(0).any(axis=0)
    long_form = per_dir.loc[:, used_columns].stack().reset_index()
    long_form.columns = ['dir', 'count_type', 'count']

    return long_form

In [None]:
# Per-directory line counts for the submissions, in long format
stacked_sub_sloc = stack_loc(submission_sloc)
stacked_sub_sloc.head(10)

In [None]:
# Per-directory line counts for the template, in long format
stacked_template_sloc = stack_loc(template_sloc)
# Display the frame that was just built (previously the submissions frame was shown again)
stacked_template_sloc.head()

In [None]:
# Turn raw column names into display labels, e.g. 'raw_lines' -> 'Raw Lines'
stacked_sub_sloc['count_type'] = stacked_sub_sloc['count_type'].apply(
    lambda count_type: count_type.replace('_', ' ').title()
)

# Only rows with a count below 6000 are plotted
ax = sns.boxplot(data=stacked_sub_sloc[stacked_sub_sloc['count'] < 6000], x='count', y='count_type')
ax.set(ylabel='Count Type', xlabel='Total Lines')

# Manually done using the results from the cell above:
# one hand-copied reference value per box, in top-to-bottom row order
for row, marker_x in enumerate([1302, 79, 405, 643, 113]):
    ax.vlines(marker_x, row - 0.5, row + 0.5, color='red')

plt.tight_layout()
plt.savefig('plots/sloc.png')

In [None]:
# Summary statistics of the per-submission line counts, per count type
stacked_sub_sloc.groupby("count_type").describe().unstack()

In [None]:
# Median per-submission line count for each count type
stacked_sub_sloc.groupby("count_type")['count'].median()

In [None]:
# Same statistics for the template (its count_type values are still the raw column names)
stacked_template_sloc.groupby("count_type").describe().unstack()

In [None]:
# Median line count per count type for the template
stacked_template_sloc.groupby("count_type")['count'].median()

### LOC File-Level

In [None]:
def stack_loc_file(df):
    """Return the line-count columns per (directory, file) in long format.

    Count columns that are zero for every file are dropped. The result has the
    columns `dir`, `file_name`, `count_type` and `count`.
    """
    keys = ['dir', 'file_name']
    count_columns = ['raw_lines', 'comment_lines', 'javadoc_lines', 'sloc', 'whitespace']
    per_file = df[[*keys, *count_columns]].groupby(keys).sum()

    # Keep only the count types that occur in at least one file
    used_columns = per_file.ne(0).any(axis=0)
    long_form = per_file.loc[:, used_columns].stack().reset_index()
    long_form.columns = ['dir', 'file_name', 'count_type', 'count']

    return long_form

In [None]:
# Per-file line counts for the submissions, in long format
stacked_sub_sloc_file = stack_loc_file(submission_sloc)
stacked_sub_sloc_file.head()

In [None]:
# Build the display labels from this frame's own count_type column. The labels were
# previously taken from the project-level frame (stacked_sub_sloc), which has fewer rows;
# index alignment then left the remaining file-level rows as NaN, so they were
# silently excluded by the 'Raw Lines' filter below.
stacked_sub_sloc_file['count_type'] = stacked_sub_sloc_file['count_type'].apply(lambda count_type: " ".join(count_type.split('_')).title())

# Distribution of raw line counts per file
ax = sns.boxplot(stacked_sub_sloc_file[stacked_sub_sloc_file['count_type'] == 'Raw Lines'], x='count')
ax.set(xlabel='Total Lines')

plt.tight_layout()
plt.savefig('plots/sloc_file.png')

In [None]:
# Summary statistics of the raw line count per file
stacked_sub_sloc_file[stacked_sub_sloc_file['count_type'] == 'Raw Lines'].describe()

In [None]:
# Median raw line count per file
stacked_sub_sloc_file[stacked_sub_sloc_file['count_type'] == 'Raw Lines']['count'].median()

### Median-Sized Submission

In [None]:
# One row per submission directory: total class count (`total`) and physical SLOC (`count`).
# Only the submissions' rows of temp_df are used; previously the template row was included
# as well and entered med_df with an SLOC of 0, which skewed the medians computed from it.
# Selecting the numeric columns up front also avoids summing the string columns
# `source` / `count_type` only to drop them afterwards.
med_df = (
    pd.concat([
        temp_df.loc[temp_df['source'] == 'submissions', ['dir', 'total']],
        stacked_sub_sloc.loc[stacked_sub_sloc['count_type'] == 'Sloc', ['dir', 'count']],
    ])
    .groupby('dir')
    .sum()
    .reset_index()
)
med_df.head()

In [None]:
# Median class count (`total`) and median SLOC (`count`) over the rows of med_df
medians = med_df[['total', 'count']].median()
medians

In [None]:
# Rows with exactly the median class count whose SLOC lies within 20 lines of the median (inclusive)
med_df[(med_df['total'] == medians['total']) & med_df['count'].between(medians['count'] - 20, medians['count'] + 20)]

### Duplication Validation

In [None]:
# Directories sharing the same (total, count) pair are potential duplicates:
# keep every row whose pair occurs more than once and collect the directories of each pair into a list
pot_dup_df = med_df[med_df.duplicated(['total', 'count'], keep=False)].groupby(['total', 'count'])['dir'].apply(list).reset_index()
# Expand each list into positional columns 0, 1, 2, ...; groups shorter than the
# longest one are padded with missing values
pot_dup_df = pd.DataFrame(pot_dup_df['dir'].to_list(), index=pot_dup_df.index)
pot_dup_df.head()

In [None]:
# For every submission directory, gather the names of its files into a list
files_df = submissions_df.groupby('dir')['file_name'].apply(list).reset_index()
files_df.head()

In [None]:
# Lookup: directory -> list of its file names (compared with == below, so order matters)
files_dict = dict(zip(files_df['dir'], files_df['file_name'].tolist()))

In [None]:
# Confirm potential duplicates: two directories of the same group are recorded as a
# duplicate pair when their lists of file names are identical.
dup_pairs = []

for _, group in pot_dup_df.iterrows():
    # Rows are padded with missing values up to the size of the largest group; drop the
    # padding. This also works when no group has a third member (the previous `data[2]`
    # lookup raised a KeyError then) and when a group has more than three members.
    dirs = [d for d in group.tolist() if pd.notna(d)]

    # Pair each directory with the first earlier directory that has the same file list.
    # For groups of up to three this yields the same pairs, in the same order, as before.
    for later in range(1, len(dirs)):
        for earlier in range(later):
            if files_dict[dirs[earlier]] == files_dict[dirs[later]]:
                dup_pairs.append({'0': dirs[earlier], '1': dirs[later]})
                break

# Build the frame once instead of growing it with pd.concat inside the loop
dups_df = pd.DataFrame(dup_pairs, columns=['0', '1'])
dups_df.to_csv('../data/dups.csv')
dups_df

## Iteration

In [None]:
# Heuristic patterns for Java loop constructs. Raw strings keep the regex escapes
# (`\s`, `\(`, `\{`) from being read as (invalid) Python string escapes.
for_regex = r'for\s*\([^;]*;[^;]*;.*\)\s*\{'
for_each_regex = r'for\s*\([^:]*:[^;]*\)\s*\{'
while_regex = r'while\s*\(.*\)\s*\n?\{'
do_while_regex = r'do\s*{[^}]*}\s*while\s*\(.*\);'

def interation_analysis(df):
    """Count loop constructs in each source file using the regex heuristics above.

    The function name (including its spelling) is kept because other cells call it.

    Args:
        df: DataFrame with a `src` column holding the source text of one file per row.
            The input frame is not modified.

    Returns:
        A copy of `df` with these added columns:
        - `for`: number of `for_regex` matches
        - `for_list`: the list of matched `for` headers
        - `for_each`, `while`, `do_while`: number of matches of the respective pattern
    """
    df_iter = df.copy()

    # Run the `for` pattern once and derive both the count and the match list from it
    for_matches = df.src.apply(lambda src: re.findall(for_regex, src))
    df_iter['for'] = for_matches.apply(len)
    df_iter['for_list'] = for_matches
    df_iter['for_each'] = df.src.apply(lambda src: len(re.findall(for_each_regex, src)))
    df_iter['while'] = df.src.apply(lambda src: len(re.findall(while_regex, src)))
    df_iter['do_while'] = df.src.apply(lambda src: len(re.findall(do_while_regex, src)))

    return df_iter

In [None]:
# Per-file loop counts for the submissions
submission_iter = interation_analysis(submissions_df)
submission_iter.head()

In [None]:
# Per-file loop counts for the template
template_iter = interation_analysis(template_df)
template_iter.head()

#### Project Level

In [None]:
def stack_iter(df):
    """Sum the loop-count columns per directory and return them in long format.

    Loop types that are zero for every directory are dropped. The result has the
    columns `dir`, `count_type` and `count`.
    """
    loop_columns = ['for', 'for_each', 'while', 'do_while']
    per_dir = df[['dir', *loop_columns]].groupby('dir').sum()

    # Keep only the loop types that occur in at least one directory
    used_columns = per_dir.ne(0).any(axis=0)
    long_form = per_dir.loc[:, used_columns].stack().reset_index()
    long_form.columns = ['dir', 'count_type', 'count']

    return long_form

In [None]:
# Per-directory loop counts for the submissions, in long format
stacked_sub_iter = stack_iter(submission_iter)
stacked_sub_iter.head()

In [None]:
# Per-directory loop counts for the template, in long format
stacked_template_iter = stack_iter(template_iter)
stacked_template_iter.head()

In [None]:
# Turn raw column names into display labels, e.g. 'for_each' -> 'For Each'
stacked_sub_iter.count_type = stacked_sub_iter.count_type.apply(lambda count_type: " ".join(count_type.split('_')).title())

# Distribution of per-submission loop counts, one horizontal box per loop type
ax = sns.boxplot(stacked_sub_iter, y='count_type', x='count')

# Axis label spelling corrected ('Interation' -> 'Iteration')
ax.set(xlabel='Total', ylabel='Iteration Type')

# Manually done using the results from the cell above
ax.vlines(14, -0.5, 0.5, color='red')
ax.vlines(4, 0.5, 1.5, color='red')
ax.vlines(1, 1.5, 2.5, color='red')

plt.tight_layout()
plt.savefig('plots/iteration.png')

In [None]:
# Mean number of `while` matches per submission directory
stacked_sub_iter[stacked_sub_iter['count_type'] == 'While']['count'].mean()

In [None]:
# Summary statistics of the per-submission loop counts, per loop type
stacked_sub_iter.groupby("count_type").describe().unstack()

In [None]:
# Median per-submission count for each loop type
stacked_sub_iter.groupby("count_type")['count'].median()

In [None]:
# Same statistics for the template (its count_type values are still the raw column names)
stacked_template_iter.groupby("count_type").describe().unstack()

In [None]:
# Median count per loop type for the template
stacked_template_iter.groupby("count_type")['count'].median()