In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import os

from unipark import Preprocessor
from unipark import MetadataManipulator as MdManipulator
from unipark import CodeBookPageManipulator as CbpManipulator
from unipark import CodeBookParser

from unipark.utils.frame import get_finishers, get_pausers, get_nonstarters
from unipark.utils.plot import plot_worldmap, create_plot_from_truth_matrix
from unipark.utils.plots import single, free

from functools import reduce
from wordcloud import WordCloud, STOPWORDS

In [2]:
# load the paths to the input CSV, codebook, and figures save directory from the paths.json file located in the same directory
with open('configuration.json') as f:
    configuration = json.load(f)
input_csv = configuration['paths']['input_csv']
codebook_path = configuration['paths']['codebook_path']
figures_save_dir = configuration['paths']['figures_save_dir_autogen']

In [3]:
def page_id_unifyer(x):
    # Use this in case you performed the survey multiple times and concatenated the csvs --> page ids in multiple surveys are not identical and need to be normalized.
    tl = {  # page map
        110: 10,
        120: 20,
        130: 30,
        210: 10,
        220: 20,
        230: 30,
        310: 10,
        230: 20,
        330: 30,
    }
    if int(x) in tl:
        x=str(tl[int(x)])
    return x

#cbp = CodeBookParser(codebook_path ,reassign_page_id=page_id_unifyer)
cbp = CodeBookParser(codebook_path)

codebook = cbp.get_codebook()
page_name_by_id = {}
for page in codebook['pages']:
    page_name_by_id[page['id']] = page['title']

def to_named_page(x):
    return page_name_by_id[str(x)] if str(x) in page_name_by_id else str(x)

In [4]:
pproc = Preprocessor(pd.read_csv(input_csv,sep=";"), to_named_page)
pproc.drop_nonstarters()
pproc.apply_manipulator(MdManipulator())
for pagebook in cbp.get_codebook()['pages']:
    pproc.apply_manipulator(CbpManipulator(pagebook))

In [5]:
print('# {}'.format(cbp.get_codebook()['title']))
for page in pproc.pages:
    if(len(pproc.question_ids_by_page[page]) > 0):
        print('## {}'.format(page))

# Understanding Requirements Engineering Debt
## Demographics
## P1 Causes
## P2 Value (1)
## P2 Value (2)
## P3 Symptoms (1)
## P3 Symptoms (2)
## P4 Intentionality
## P5 Propagation
## P6 Detecting
## P7 Measuring
## P8 Tracking
## P9 Remediation


In [6]:
def page_to_markdown(page, headline_format='## {}', directory='./figs'):
    ret = headline_format.format(page) + '\n'
    question_ids = pproc.question_ids_by_page[page]
    path = os.path.join(directory, page.replace(' ', '_').replace('?',''))
    
    if not question_ids:
        ret = ret[:-1] + ': Empty\n'
        return ret
    
    for question_id in question_ids:
        ret += question_to_markdown(question_id, directory=path)
    return ret

In [7]:
def question_to_markdown(question_id, directory='./figs'):
    ret = f'### {pproc.get_question_title(question_id)}\n'

    if not os.path.exists(directory):
        os.makedirs(directory)
    path = os.path.join(directory, question_id)
    
    style = pproc.style_by_question_id[question_id]
    columns = pproc.columns_by_question_id[question_id]
    my_data = pproc.get_data()
    if 'single' == style:
        ret += visualize_single(my_data, question_id=question_id, file_prefix=path)
    elif 'multiple' == style:
        # todo
        ret += default_eval_multiple_question(my_data, columns=columns, file_prefix=path)
    elif 'rank' == style:
        print(f'{style} not supported yet!')
    elif 'free' == style:
        ret += visualize_free(my_data, question_id=question_id, file_prefix=path)
    elif 'freematrix' == style:
        ret += visualize_freematrix(my_data, columns=columns, file_prefix=path)
    elif 'matrix' == style:
        # todo
        ret += default_eval_matrix(my_data, columns=columns, file_prefix=path)
    else:
        assert False, 'Unknown stlye "{}"'.format(style)
        
    return ret

In [8]:
def get_ordered_value_counts(value_counts, order):
    vcs_ordered = value_counts.copy()
    for i in order:
        if i not in vcs_ordered.index:
            vcs_ordered[i]=0
    forwards = dict(zip(order, np.arange(len(order))))
    backwards = dict(zip(np.arange(len(order)), order))
    vcs_ordered = vcs_ordered.rename(index=forwards).sort_index().rename(index=backwards)
    return vcs_ordered

In [9]:
# visualize a single choice question
def visualize_single(data: pd.DataFrame, question_id, file_prefix=None, show=False):
    columns = pproc.columns_by_question_id[question_id]
    vcs = data[columns[0]].value_counts()
    count = reduce((lambda a,b:a+b), vcs.apply(lambda x: int(x) if x else 0), 0)

    # in case there are no answers, abort
    if count == 0:
        return 'Nobody answered this question!'
    ret = 'This question was answered by {} participants.\n'.format(count)

    # for each configuration: construct the according visualization
    for config in configuration['visualizations']['single']:
        # only consider finishers if configured so
        if config['finishers']:
            data = get_finishers(data)
            vcs = data[columns[0]].value_counts()

        # order the values if configured so
        if config['ordered']:
            order = pproc.order_of_question_id[question_id]
            vcs = get_ordered_value_counts(vcs, order)

        # generate the appropriate chart type
        if config['chart'] == 'bar':
            ret += single.barchart(vcs, likert=config['likert'], file_prefix=file_prefix)
        elif config['chart'] == 'pie':
            ret += single.piechart(vcs, likert=config['likert'], file_prefix=file_prefix)
        
    return ret

In [10]:
def default_eval_multiple_question(data, columns, file_prefix=None, show=False):
    ret = ''
    bool_columns = [x for x in columns if data[x].dtype==bool]
    labels = [x[8:] for x in bool_columns]
    create_plot_from_truth_matrix(data[bool_columns],names=labels, with_exclusives=True)
    if file_prefix is not None:
        path = file_prefix + '_distribution_bar.png'
        plt.savefig(path, dpi=1200, bbox_inches='tight')
        ret += '![alt text]({} "Title")\n'.format(path)
    if show: _=plt.show()
    else: plt.clf()
    
    #ticks = data[bool_columns].apply(np.count_nonzero, axis=1)
    #sns.violinplot(x=ticks, cut=0)
    #if file_prefix is not None:
    #    path = file_prefix + '_vote-count_distribution_violin.png'
    #    plt.savefig(path, dpi=1200, bbox_inches='tight')
    #    ret += '![alt text]({} "Title")\n'.format(path)
    #if show: _=plt.show()
    #else: plt.clf()
        
    ticks = data[bool_columns].apply(np.count_nonzero, axis=1)
    sns.histplot(x=ticks, discrete=True)
    if file_prefix is not None:
        path = file_prefix + '_vote-count_distribution_hist.png'
        plt.savefig(path, dpi=1200, bbox_inches='tight')
        ret += '![alt text]({} "Title")\n'.format(path)
    if show: _=plt.show()
    else: plt.clf()    
        
    non_bool_columns = [x for x in columns if x not in bool_columns]
    for column in non_bool_columns:
        path = file_prefix + '_'+column.replace(' ', '_').replace('?','') + '_wordcloud.png' if file_prefix else None
        p = free.wordcloud(data[column], file_prefix=path)
        if path and p is not None:
            ret += 'Of those ({}) who filled out "{}" the following wordcloud could be built:\n'.format(
                np.count_nonzero(data[column].apply(lambda x: x is not None)),
                column)
            ret += '![alt text]({} "Title")\n'.format(path)
    
    return ret

In [11]:
def default_eval_matrix(data:pd.DataFrame, columns, file_prefix=None, show=False):
    ret = ''
    vcs = []
    for col in columns[0]:
        col_data = data[col]
        col_vc = col_data.value_counts()
        col_vc = col_vc.rename('count')
        col_vc = col_vc.to_frame()
        col_vc['answer'] = col_vc.index
        col_vc['col'] = col
        col_vc = col_vc.reset_index(drop=True)
        vcs.append(col_vc)
    counts = pd.concat(vcs)
    
    sns.barplot(data=counts, x='col', y='count', hue='answer')
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.tight_layout()
    if file_prefix is not None:
        path = file_prefix + '_matrix_bar_q.png'
        plt.savefig(path, dpi=1200, bbox_inches='tight')
        ret += '![alt text]({} "Title")\n'.format(path)
    if show: _=plt.show()
    else: plt.clf()
        
    sns.barplot(data=counts, hue='col', y='count', x='answer')
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.tight_layout()
    if file_prefix is not None:
        path = file_prefix + '_matrix_bar_a.png'
        plt.savefig(path, dpi=1200, bbox_inches='tight')
        ret += '![alt text]({} "Title")\n'.format(path)
    if show: _=plt.show()
    else: plt.clf()    
    return ret

In [12]:
# visualize a free question (list of free, natural language text)
def visualize_free(data, question_id, file_prefix=None, show=False):
    series = data[pproc.columns_by_question_id[question_id][0]]
    count = np.count_nonzero(series.apply(lambda x: x is not None))

    # in case there are no answers, abort
    if count == 0:
        return 'Nobody answered this question!'
    ret = 'This question was answered by {} participants.\n'.format(count)

    # for each configuration: construct the according visualization
    for config in configuration['visualizations']['free']:
        if config['plot'] == 'wordcloud':
            ret += free.wordcloud(series, file_prefix=file_prefix, show=show)
        elif config['plot'] == 'textlist':
            ret += free.textlist(series)
            
    return ret

In [13]:
# visualize a free matrix question (list of questions with a list of natural language answers)
def visualize_freematrix(data:pd.DataFrame, columns, file_prefix=None, show=False):
    ret = ""
    # for each configuration: construct the according visualization
    for config in configuration['visualizations']['free']:
        if config['plot'] == 'textlist':
            ret += free.multitextlist(data, columns)

    return ret

In [14]:
# generate the markdown report 
report = '# {}\n'.format(cbp.get_codebook()['title'])
print('# {}'.format(cbp.get_codebook()['title']))
for page in [pproc.pages[2]]:
    mpage = page_to_markdown(page, directory=figures_save_dir)
    print (mpage)
    report += mpage

# Understanding Requirements Engineering Debt
## P1 Causes
### In your experience, which of the following options qualify as causes of requirements engineering debt?
![alt text](./test/figs/auto_gen\P1_Causes\9267132_distribution_bar.png "Title")
![alt text](./test/figs/auto_gen\P1_Causes\9267132_vote-count_distribution_hist.png "Title")
Of those (4) who filled out "9267132  Other (please specify) string" the following wordcloud could be built:
![alt text](./test/figs/auto_gen\P1_Causes\9267132_9267132__Other_(please_specify)_string_wordcloud.png "Title")
### To what extentÂ do you agree with the statement: "The overall software process model has an influence on the likelihood of introducing requirements engineering debt."
This question was answered by 68 participants.
![['Fully disagree: 1', 'Rather disagree: 3', 'Neither disagree nor agree: 14', 'Rather agree: 33', 'Fully agree: 17']](./test/figs/auto_gen\P1_Causes\9267269_bar_8.png "Title")
### Please motivate your answer to the que

<Figure size 432x288 with 0 Axes>

<Figure size 2880x2160 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 2880x2160 with 0 Axes>

In [15]:
with open('Report.md', 'w+') as file:
    file.write(report)