In [None]:
import pandas as pd
import re
import seaborn as sns
import codemetrics as cm

# Example Analysis of Dataset

In [None]:
# Load the pickled DataFrame that every section below starts from.
# Unpickling can execute arbitrary code, so only read a file you trust.
df = pd.read_pickle(filepath_or_buffer='../data/analysis_df.pickle')
df.head(n=5)

### Number of classes
#### Split categories

In [None]:
# Heuristic regexes for type declarations: a line containing the keyword(s),
# followed by '{' either on the same line or at the start of the next one.
# NOTE(review): the patterns overlap -- e.g. "public abstract class Foo {" is
# matched by both the public and the abstract pattern -- so the per-file counts
# are not mutually exclusive.
public_class_regex = r'.*public class.*\n?{'
private_class_regex = r'.*private class.*\n?{'
protected_class_regex = r'.*protected class.*\n?{'
# (?m) enables re.MULTILINE so '^' anchors at the start of every line. Without
# it '^' only matches at the very beginning of the file, so a declaration was
# counted only when the file's first characters were "class".
class_regex = r'(?m)^class.*\n?{'
abstract_class_regex = r'.*abstract class.*\n?{'
enum_regex = r'.*enum.*\n?{'
interface_regex = r'.*interface.*\n?{'

# Output column -> pattern, in the order the columns are appended.
class_patterns = {
    'no_public_classes': public_class_regex,
    'no_protected_classes': protected_class_regex,
    'no_private_classes': private_class_regex,
    'no_package_private_classes': class_regex,
    'no_abstract_classes': abstract_class_regex,
    'no_enums': enum_regex,
    'no_interfaces': interface_regex,
}

df_classes = df.copy()

# One count column per pattern: number of non-overlapping matches in each file's source.
for column, pattern in class_patterns.items():
    df_classes[column] = df.src.apply(lambda src, pattern=pattern: len(re.findall(pattern, src)))

# The raw source and file name are not needed for the aggregated views below.
df_classes = df_classes.drop(columns=['file_name', 'src'])

df_classes.head()

In [None]:

# Sum the per-file class counts within each 'dir'.
# The count columns are selected explicitly so the result is the same no matter
# which extra columns df_classes carries when this cell runs (for example a
# 'total' column added later would otherwise show up as a "class type").
class_count_columns = [
    'no_public_classes',
    'no_protected_classes',
    'no_private_classes',
    'no_package_private_classes',
    'no_abstract_classes',
    'no_enums',
    'no_interfaces',
]
grouped = df_classes[['dir'] + class_count_columns].groupby('dir').sum()

# Keep only the columns that are non-zero for at least one 'dir'.
grouped = grouped.loc[:, (grouped != 0).any(axis=0)]
# Long format: one row per ('dir', column) pair; the 'dir' label itself is dropped.
stacked = grouped.stack().reset_index().drop('dir', axis=1)
stacked.columns = ['class_type', 'class_count']
stacked.head(10)

In [None]:
# One box per class_type over the class_count values in `stacked`.
sns.boxplot(data=stacked, y='class_type', x='class_count')

#### Total classes

In [None]:
# Per-file total: row-wise sum of all the class-count columns.
class_count_columns = [
    'no_public_classes',
    'no_protected_classes',
    'no_private_classes',
    'no_package_private_classes',
    'no_abstract_classes',
    'no_enums',
    'no_interfaces',
]
df_classes['total'] = df_classes[class_count_columns].sum(axis='columns')
df_classes.head()

In [None]:
# Sum the per-file totals within each 'dir'. Only 'total' is plotted, so only
# that column is aggregated instead of summing every column of df_classes.
grouped = df_classes[['dir', 'total']].groupby('dir').sum()
# One box over the per-'dir' totals; `data` is passed by keyword so the call
# does not depend on seaborn's positional-argument order.
sns.boxplot(data=grouped, x='total')

### Lines of code (LOC)

In [None]:
# Independent copy of the loaded frame for the line-count section.
df_loc = df.copy(deep=True)
df_loc.head(n=5)

In [None]:
# Raw strings: the previous plain literals relied on invalid escape sequences
# ('\/', '\*', '\s'), which newer Python versions warn about. Matching is unchanged.
#
# Comment markers: '/**', a bare '*', or '//'. Every occurrence counts once.
# NOTE(review): the bare '*' alternative also matches '*' outside comments
# (e.g. multiplication or wildcard imports), so this over-counts -- confirm
# whether that is acceptable for the analysis.
comment_regex = r'(/\*\*|\*|//)'
# Statement terminators and braces; an opening brace together with the
# whitespace/newline before it is a single match.
sloc_regex = r';|(\n?\s*{)|}'


def count_blank_lines(src):
    """Return the number of whitespace-only lines in ``src`` minus one, floored at 0.

    NOTE(review): the "- 1" presumably discounts the empty element that
    ``split('\\n')`` yields after a trailing newline -- confirm.
    """
    blank = sum(1 for line in src.split('\n') if len(line.strip()) == 0)
    return max(blank - 1, 0)


# Built from `df` in one expression so the cell can be re-run on its own
# (dropping 'src' from an already-processed df_loc raised a KeyError before).
# The counts do not sum to raw_lines: "statement \n {" is counted as one line, not two.
df_loc = df.drop(columns=['src']).assign(
    raw_lines=df.src.apply(lambda src: len(src.split('\n'))),
    comment_lines=df.src.apply(lambda src: len(re.findall(comment_regex, src))),
    # Physical SLOC
    sloc=df.src.apply(lambda src: len(re.findall(sloc_regex, src))),
    whitespace=df.src.apply(count_blank_lines),
)

df_loc.head()

#### Project Level

In [None]:
# Sum the per-file line counts within each 'dir'.
loc_columns = ['raw_lines', 'comment_lines', 'sloc', 'whitespace']
grouped = df_loc[['dir'] + loc_columns].groupby('dir').sum()

# Discard any column that is zero for every 'dir'.
all_zero = (grouped == 0).all(axis=0)
grouped = grouped.loc[:, ~all_zero]

# Long format: one row per ('dir', column) pair, without the 'dir' label.
stacked = grouped.stack().reset_index().drop(columns='dir')
stacked.columns = ['count_type', 'count']
stacked.head(10)

In [None]:
# One box per count_type over the 'count' values in `stacked`.
sns.boxplot(data=stacked, x='count', y='count_type')

#### File Level

In [None]:
# Long format of the per-file line counts: one row per (file row, column) pair.
loc_columns = ['raw_lines', 'comment_lines', 'sloc', 'whitespace']
long_form = df_loc[loc_columns].stack().reset_index()
# 'level_0' holds the original row label, which is not needed here.
stacked = long_form.drop(columns='level_0')
stacked.columns = ['count_type', 'count']
stacked.head(10)

In [None]:
# One box per count_type over the 'count' values in `stacked`.
sns.boxplot(data=stacked, x='count', y='count_type')

### Iteration

In [None]:
# Independent copy of the loaded frame for the iteration section.
df_iter = df.copy()
# Preview the frame this section works on (previously previewed `df` instead).
df_iter.head()

In [None]:
# Heuristic regexes for loop constructs that are followed by a '{' block.
# Raw strings: the previous plain literals relied on invalid escape sequences
# ('\s', '\(', '\)'), which newer Python versions warn about. Matching is unchanged.
for_regex = r'for\s*\([^;]*;[^;]*;.*\)\s*{'         # for (init; cond; step) {
for_each_regex = r'for\s*\([^:]*:[^;]*\)\s*\{'      # for (item : items) {
while_regex = r'while\s*\(.*\)\s*\n?\{'             # while (cond) {
# do { ... } while (cond);  -- '[^}]*' stops at the first '}', so bodies that
# contain nested braces are not matched.
do_while_regex = r'do\s*{[^}]*}\s*while\s*\(.*\);'

# Output column -> pattern, in the order the columns are appended.
loop_patterns = {
    'for': for_regex,
    'for_each': for_each_regex,
    'while': while_regex,
    'do_while': do_while_regex,
}

# One count column per pattern: number of non-overlapping matches in each file's source.
for column, pattern in loop_patterns.items():
    df_iter[column] = df.src.apply(lambda src, pattern=pattern: len(re.findall(pattern, src)))

df_iter.head()

#### File Level

In [None]:
# Long format of the per-file loop counts: one row per (file row, column) pair.
loop_columns = ['for', 'for_each', 'while', 'do_while']
long_form = df_iter[loop_columns].stack().reset_index()
# 'level_0' holds the original row label, which is not needed here.
stacked = long_form.drop(columns='level_0')
stacked.columns = ['count_type', 'count']
stacked.head(10)

In [None]:
# One box per count_type over the 'count' values in `stacked`.
sns.boxplot(data=stacked, x='count', y='count_type')

#### Project Level

In [None]:
# Sum the per-file loop counts within each 'dir'.
loop_columns = ['for', 'for_each', 'while', 'do_while']
grouped = df_iter[['dir'] + loop_columns].groupby('dir').sum()

# Discard any column that is zero for every 'dir'.
all_zero = (grouped == 0).all(axis=0)
grouped = grouped.loc[:, ~all_zero]

# Long format: one row per ('dir', column) pair, without the 'dir' label.
stacked = grouped.stack().reset_index().drop(columns='dir')
stacked.columns = ['count_type', 'count']
stacked.head(10)

In [None]:
# One box per count_type over the 'count' values in `stacked`.
sns.boxplot(data=stacked, x='count', y='count_type')