In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns


# Example Analysis of Dataset

In [None]:
# Load the submissions DataFrame from its pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
submissions_df = pd.read_pickle('../data/analysis_df.pickle')
submissions_df.head()

In [None]:
# Load the template DataFrame from its pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
template_df = pd.read_pickle('../data/template_df.pickle')
template_df.head()

### Number of classes
#### Split categories

In [None]:
# Patterns that each match one type-declaration header up to and including its
# opening brace. The brace may be on the header line itself or directly at the
# start of the following line.
public_class_regex = r'.*public class.*\n?{'
private_class_regex = r'.*private class.*\n?{'
protected_class_regex = r'.*protected class.*\n?{'
# (?m) lets ^ match at the start of every line. Without it ^ only matches at the
# very beginning of the source string, so a class with no access modifier was
# only counted when the file literally started with "class".
class_regex = r'(?m)^class.*\n?{'
abstract_class_regex = r'.*abstract class.*\n?{'
enum_regex = r'.*enum.*\n?{'
interface_regex = r'.*interface.*\n?{'

# Output column -> pattern whose matches are counted into that column.
# Insertion order defines the column order of the result.
class_count_regexes = {
    'no_public_classes': public_class_regex,
    'no_protected_classes': protected_class_regex,
    'no_private_classes': private_class_regex,
    'no_package_private_classes': class_regex,
    'no_abstract_classes': abstract_class_regex,
    'no_enums': enum_regex,
    'no_interfaces': interface_regex,
}

def process_classes(df):
    """Count type declarations in every row's source text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``src`` (the source text that is scanned)
        and a ``file_name`` column. Any other columns are passed through
        unchanged. ``df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without ``file_name`` and ``src``, extended with one
        ``no_*`` count column per pattern in ``class_count_regexes`` and a
        ``total`` column holding the row-wise sum of those counts.
    """
    proc_df = df.copy()

    for column, pattern in class_count_regexes.items():
        # apply() runs immediately, so the lambda sees the current pattern.
        proc_df[column] = proc_df.src.apply(lambda src: len(re.findall(pattern, src)))

    proc_df['total'] = proc_df[list(class_count_regexes)].sum(axis=1)

    # The raw text and file name are not needed once the counts exist.
    return proc_df.drop(['file_name', 'src'], axis=1)

In [None]:
def group_stack_classes(df):
    """Sum the columns per ``dir`` and reshape the result to long format.

    Columns whose per-directory sum is zero for every directory are dropped.
    The returned frame has one row per (directory, remaining column) pair with
    two columns: ``class_type`` (the original column name) and ``class_count``
    (the summed value). The directory itself is not kept.
    """
    per_dir = df.groupby('dir').sum()

    # Keep only the columns that are non-zero for at least one directory.
    has_any_nonzero = per_dir.ne(0).any(axis=0)
    long_form = per_dir.loc[:, has_any_nonzero].stack().reset_index()

    long_form = long_form.drop(columns='dir')
    long_form.columns = ['class_type', 'class_count']
    return long_form

In [None]:
# Submissions: count the declarations, then reshape to long format.
sub_classes = process_classes(submissions_df)
stacked_sub_classes = group_stack_classes(sub_classes)

# Template: same two steps.
template_classes = process_classes(template_df)
stacked_template_classes = group_stack_classes(template_classes)

stacked_sub_classes.head(10)

In [None]:
# Preview the long-format class counts computed for the template.
stacked_template_classes.head(10)

In [None]:
# 'total' is the sum of the other counts, so it is left out of the per-type view.
stacked_sub_classes = stacked_sub_classes[stacked_sub_classes['class_type'] != 'total'].copy()

stacked_sub_classes['source'] = 'submissions'
stacked_template_classes['source'] = 'template'

# Turn raw column names such as 'no_public_classes' into labels such as
# 'Public Classes'. Only values that still carry the 'no_' prefix are converted:
# re-running this cell used to split the already formatted labels again and
# replace every one of them with an empty string.
stacked_sub_classes['class_type'] = stacked_sub_classes['class_type'].apply(
    lambda class_type: " ".join(class_type.split('_')[1:]).title() if class_type.startswith('no_') else class_type
)

In [None]:
# Preview the template counts; the hand-placed marker values in the next
# plotting cell are read off this output.
stacked_template_classes.head()

In [None]:
# Distribution of per-directory class counts, one box per class type.
ax = sns.boxplot(data=stacked_sub_classes, x='class_count', y='class_type')
ax.set(xlabel='Total Classes', ylabel='Class Type')

# Manually done using the results from the cell above:
# (x position, lower y, upper y) of each red marker line.
for marker_x, marker_y_min, marker_y_max in [(9, -0.5, 0.5), (1, 3.5, 4.5)]:
    ax.vlines(marker_x, marker_y_min, marker_y_max, color='red')

plt.savefig('plots/classes_sep.png')

#### Total classes

In [None]:
# Keep only the directory key and the overall count from each result frame.
total_columns = ['dir', 'total']
sub_classes_total = sub_classes[total_columns].copy()
template_classes_total = template_classes[total_columns].copy()

sub_classes_total.head()

In [None]:
# Tag every row with its origin so the two frames stay distinguishable once combined.
sub_classes_total['source'] = 'submissions'
template_classes_total['source'] = 'template'

classes_total = pd.concat([sub_classes_total, template_classes_total])

# Sum the totals within each (source, dir) pair and flatten the index again.
totals_by_source_and_dir = classes_total.groupby(['source', 'dir']).sum()
temp_df = totals_by_source_and_dir.reset_index()
temp_df.head()

In [None]:
is_submission = temp_df['source'] == 'submissions'
is_template = temp_df['source'] == 'template'

# Box: distribution of totals over the submission rows.
ax = sns.boxplot(data=temp_df[is_submission], x='total')
# Red line: the total of the first template row.
ax.axvline(temp_df[is_template].iloc[0].total, color='red')
ax.set(xlabel='Total Classes')

plt.savefig('plots/classes_total.png')

### LOC

In [None]:
# `df` is read by this cell and by later ones, but nothing in the visible
# notebook assigns it, so a fresh "Restart & Run All" stopped here with a
# NameError.
# NOTE(review): assumes the LOC and iteration analysis is meant to run on the
# submissions data -- confirm, and bind `df` to the intended frame if not.
df = submissions_df
df_loc = df.copy()
df_loc.head()

In [None]:
# Raw strings: the patterns contain regex escapes (\/, \*, \s) that are not valid
# Python string escapes. In a normal string literal they only work because
# Python passes unknown escapes through, which emits a warning on current
# Python versions. The compiled patterns are unchanged.
comment_regex = r'(\/\*\*|\*|\/\/)'
sloc_regex = r';|(\n?\s*{)|}'


# Number of newline-separated pieces in the source text.
df_loc['raw_lines'] = df.src.apply(lambda src: len(src.split('\n')))
# These do not sum to raw lines, as a statement followed by "\n{" counts as one line, not two
df_loc['comment_lines'] = df.src.apply(lambda src: len(re.findall(comment_regex, src)))
# Physical SLOC
df_loc['sloc'] = df.src.apply(lambda src: len(re.findall(sloc_regex, src)))
# Blank (whitespace-only) lines minus one, never below zero.
# NOTE(review): the "- 1" presumably discounts the empty piece that a trailing
# newline produces -- confirm.
df_loc['whitespace'] = df.src.apply(
    lambda src: max(sum(1 for line in src.split('\n') if len(line.strip()) == 0) - 1, 0)
)

# errors='ignore' keeps the cell re-runnable: on a second run 'src' is already
# gone from df_loc and a plain drop would raise KeyError.
df_loc = df_loc.drop(['src'], axis=1, errors='ignore')

df_loc.head()

#### Project Level

In [None]:
# Sum every line metric per directory.
loc_metric_columns = ['raw_lines', 'comment_lines', 'sloc', 'whitespace']
grouped = df_loc[['dir'] + loc_metric_columns].groupby('dir').sum()

# Drop metrics that are zero for every directory, then reshape to long format.
grouped = grouped.loc[:, grouped.ne(0).any(axis=0)]
stacked = grouped.stack().reset_index().drop(columns='dir')
stacked.columns = ['count_type', 'count']
stacked.head(10)

In [None]:
# One box per metric, built from the long-format frame of the previous cell.
sns.boxplot(data=stacked, x='count', y='count_type')

#### File Level

In [None]:
# Reshape the per-row metrics to long format; the original row index
# ('level_0' after reset_index) is not needed.
stacked = (
    df_loc[['raw_lines', 'comment_lines', 'sloc', 'whitespace']]
    .stack()
    .reset_index()
    .drop(columns='level_0')
)
stacked.columns = ['count_type', 'count']
stacked.head(10)

In [None]:
# One box per metric, built from the long-format frame of the previous cell.
sns.boxplot(data=stacked, x='count', y='count_type')

### Iteration

In [None]:
# Work on a copy so the loop-count columns added below do not end up on `df`.
df_iter = df.copy()
# Preview the fresh copy (same content as `df` at this point).
df_iter.head()

In [None]:
# Raw strings: the patterns contain regex escapes (\s, \(, \), \{) that are not
# valid Python string escapes. In a normal string literal they only work because
# Python passes unknown escapes through, which emits a warning on current
# Python versions. The compiled patterns are unchanged.
for_regex = r'for\s*\([^;]*;[^;]*;.*\)\s*{'        # for (init; cond; step) {
for_each_regex = r'for\s*\([^:]*:[^;]*\)\s*\{'     # for (item : items) {
while_regex = r'while\s*\(.*\)\s*\n?\{'            # while (cond) {
do_while_regex = r'do\s*{[^}]*}\s*while\s*\(.*\);'  # do { ... } while (cond);

# Output column -> pattern whose matches are counted into that column.
iteration_regexes = {
    'for': for_regex,
    'for_each': for_each_regex,
    'while': while_regex,
    'do_while': do_while_regex,
}

for column, pattern in iteration_regexes.items():
    # apply() runs immediately, so the lambda sees the current pattern.
    df_iter[column] = df.src.apply(lambda src: len(re.findall(pattern, src)))

df_iter.head()

#### File level

In [None]:
# Reshape the per-row loop counts to long format; the original row index
# ('level_0' after reset_index) is not needed.
stacked = (
    df_iter[['for', 'for_each', 'while', 'do_while']]
    .stack()
    .reset_index()
    .drop(columns='level_0')
)
stacked.columns = ['count_type', 'count']
stacked.head(10)

In [None]:
# One box per loop kind, built from the long-format frame of the previous cell.
sns.boxplot(data=stacked, x='count', y='count_type')

#### Project Level

In [None]:
# Sum every loop count per directory.
loop_columns = ['for', 'for_each', 'while', 'do_while']
grouped = df_iter[['dir'] + loop_columns].groupby('dir').sum()

# Drop loop kinds that are zero for every directory, then reshape to long format.
grouped = grouped.loc[:, grouped.ne(0).any(axis=0)]
stacked = grouped.stack().reset_index().drop(columns='dir')
stacked.columns = ['count_type', 'count']
stacked.head(10)

In [None]:
# One box per loop kind, built from the long-format frame of the previous cell.
sns.boxplot(data=stacked, x='count', y='count_type')