# Evaluate Metadata columns

In [None]:
import os
import json
import pandas as pd
from unipark import Preprocessor, CodeBookParser
from unipark import MetadataManipulator as MdManipulator
from unipark.utils.frame import get_finishers, get_pausers, get_nonstarters
import matplotlib.pyplot as plt

In [None]:
# Load the paths to the input CSV, the codebook, and the figures save directory
# from paths.json. The relative path is resolved against the current working
# directory, so the notebook is expected to run from the folder containing it.
# JSON is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding (relevant for non-ASCII paths).
with open('paths.json', encoding='utf-8') as f:
    paths = json.load(f)
input_csv = paths['input_csv']
codebook_path = paths['codebook_path']
figures_save_dir = paths['figures_save_dir_metadata']

In [None]:
# Make sure the figure output directory exists. exist_ok=True replaces the
# separate existence check, so there is no window between checking and creating.
os.makedirs(figures_save_dir, exist_ok=True)

In [None]:
# Parse the codebook and index every page title by its page id, so page ids
# found in the data can later be replaced by readable titles.
cbp = CodeBookParser(codebook_path)
codebook = cbp.get_codebook()

page_name_by_id = {entry['id']: entry['title'] for entry in codebook['pages']}
    
def to_named_page(x):
    """Return the codebook title for page id ``x``.

    The id is converted to a string before the lookup in ``page_name_by_id``;
    when no title is registered for it, the string form of ``x`` is returned.
    """
    key = str(x)
    return page_name_by_id.get(key, key)

In [None]:
# Read the semicolon-separated export and hand it to the preprocessor, then
# apply the metadata manipulator before calling drop_removable().
# NOTE(review): what exactly drop_removable() discards is defined in unipark.
raw_export = pd.read_csv(input_csv, sep=";")
pproc = Preprocessor(raw_export)
metadata_step = MdManipulator()
pproc.apply_manipulator(metadata_step)
pproc.drop_removable()

In [None]:
# Short handles for the cells below: the preprocessor's data frame and the
# metadata object it returns (evaluated in that order).
data, md = pproc.data, pproc.get_metadata()

## Participant count

In [None]:
# Count all participants and the finisher / pauser / non-starter subsets,
# then print the totals as a small tab-aligned report.
n_total = len(md)
n_finished = len(get_finishers(md))
n_paused = len(get_pausers(md))
n_unstarted = len(get_nonstarters(md))

print("Total participants:\t{}".format(n_total))
print("\tfinished:\t{}".format(n_finished))
print("\tpaused:\t\t{}".format(n_paused))
print("\tunstarted:\t{}".format(n_unstarted))

In [None]:
# Number of participants per named disposition code, sorted by code label;
# printed as a table and shown as a pie chart.
dispcode_column = data["dispcode_named"]
dispcodes = dispcode_column.value_counts().sort_index()
print(dispcodes)
dispcodes.plot.pie()

## Pages participants stopped on

In [None]:
# Pie chart of the last page each participant reached. Page ids are replaced
# by codebook titles where available, and slices are ordered by descending label.
last_page_counts = md["lastpage"].value_counts()
last_page_counts = last_page_counts.rename(index=to_named_page)
last_page_counts = last_page_counts.sort_index(ascending=False)

ax = last_page_counts.plot.pie()
ax.set_title('Last page of participants')

last_page_pie_path = os.path.join(figures_save_dir, 'last_page_pie.png')
plt.savefig(last_page_pie_path, dpi=1200, bbox_inches='tight')

### Those who paused stopped on page...

In [None]:
# For participants who paused: how many stopped on each page, with page ids
# replaced by codebook titles where available and rows sorted by label.
paused_on_page = (
    get_pausers(md)["lastpage"]
    .value_counts()
    .rename(index=to_named_page)
    .sort_index()
)

if paused_on_page.empty:
    # Nothing to plot when the dataset has no pausers.
    print('No pausers contained in this dataset')
else:
    print(paused_on_page)
    ax = paused_on_page.plot.bar(legend=True)
    ax.set_title('#Participants who stopped on page')
    page_stopped_bar_path = os.path.join(figures_save_dir, 'page_stopped_bar.png')
    plt.savefig(page_stopped_bar_path, dpi=1200, bbox_inches='tight')

## Platform distribution

In [None]:
# Platform (OS) distribution as pie charts: first for all participants, then
# for finishers only. pf / pff are reused by the bar-chart cell that follows.
# The vcs / vcs_rel comparison tables are built in that following cell, where
# they are actually used, so they are not duplicated here.
pf = data['platform']
pff = get_finishers(data)['platform']

ax = pf.value_counts().plot.pie()
ax.set_title('OS usage of participants')
plt.savefig(os.path.join(figures_save_dir, 'os_distribution_pie.png'), dpi=1200, bbox_inches='tight')
# Display the first chart before plotting the second one, so the two pies are
# rendered as separate outputs.
plt.show()

ax = pff.value_counts().plot.pie()
ax.set_title('OS usage of finishers')
# NOTE(review): unlike the other figures, the finishers pie is not written to
# figures_save_dir - confirm whether that is intended.

In [None]:
# Side-by-side comparison of platform usage for all participants vs. finishers:
# vcs holds absolute counts, vcs_rel the counts divided by each group's size.
all_counts = pf.value_counts()
finisher_counts = pff.value_counts()

vcs = pd.DataFrame.from_dict({'all': all_counts, 'finishers': finisher_counts})
vcs_rel = pd.DataFrame.from_dict({
    'all': all_counts / len(pf),
    'finishers': finisher_counts / len(pff),
})

ax = vcs_rel.plot.bar()
ax.set_title("Relative OS usage of participants")
os_bar_path = os.path.join(figures_save_dir, 'os_distribution_bar.png')
plt.savefig(os_bar_path, dpi=1200, bbox_inches='tight')

In [None]:
# Per-platform count divided by the total number of rows in data; left as a
# bare expression so the notebook displays the resulting Series.
data['platform'].value_counts().div(len(data))