In [None]:
import os
import numpy as np
import pandas as pd
import statistics
from matplotlib import pyplot as plt
import matplotlib as mpl

In [None]:
# Versions of the input dataframe, the codebook, and the output figure set.
input_version = 5
codebook_version = 2
output_version = 4

output_flag = True  # write figures to disk
verbose_flag = False # in-notebook outputs

input_file = 'derived-dataframes/regression-data-v{}/codebook{}_longform.csv'.format(input_version, codebook_version)
output_dir = 'output/figures-v{}'.format(output_version)

if output_flag:
    if os.path.isdir(output_dir):
        print('High-level output directory already exists; no action taken.')
    else:
        # makedirs (not mkdir) so that a missing parent 'output/' directory is
        # created too instead of raising FileNotFoundError on a fresh checkout.
        os.makedirs(output_dir)

In [None]:
# I use LaTeX for some plot title formatting
# On my Ubuntu 20.04 system, my setup was:
# apt install texlive-latex-extra texstudio dvipng ghostscript cm-super texlive-fonts-recommended
# pip install latex
# Not necessary for the figures actually included in the paper (just skip those cells)
# When this is False, the LaTeX-titled plot cells below use their plain-text
# titles and never switch on matplotlib's 'text.usetex'.
supports_latex = True

In [None]:
# util for displaying dataframes
# the defaults are actually 60 & 20, but that gets annoying
def show(da, rows = 20, cols = 20, width = None):
    """Display `da` with temporarily overridden pandas display limits.

    Parameters
    ----------
    da : object
        Whatever should be rendered (normally a DataFrame or Series).
    rows, cols : int or None
        Values for 'display.max_rows' / 'display.max_columns' while rendering.
    width : int or None
        Value for 'display.max_colwidth' while rendering (None = unlimited).

    The three options are scoped with `pd.option_context`, so the values that
    were active before the call are restored afterwards -- also when
    `display` raises -- and no other option matching the short patterns
    'max_rows' / 'max_columns' is touched.
    """
    with pd.option_context('display.max_rows', rows,
                           'display.max_columns', cols,
                           'display.max_colwidth', width):
        display(da)

In [None]:
# Load the long-form annotation table. Only the literal '_' is read as
# missing; pandas' default NA strings (e.g. 'N/A') stay ordinary strings.
da1 = pd.read_csv(
    input_file,
    na_values=['_'],
    keep_default_na=False,
)
if verbose_flag:
    show(da1)

In [None]:
# Number of annotation rows per code, ordered by code label.
da1['code'].value_counts().sort_index()

## Get number of helpers
A speaker is counted as a helper only if they have a User ID > 0 and sent at least one annotated message chunk.

In [None]:
# Work on a copy and flag every annotation whose code does not mention 'greeting'.
hda = da1.copy()
hda['contains non-greeting'] = ~hda['code'].str.contains('greeting')

# Distinct annotators per document; `tmp` is reused by the data validity check further down.
tmp = (
    hda.groupby('document')['annotator']
    .nunique()
    .rename('num annotators')
    .reset_index()
)
hda = hda.merge(tmp, how='left', on='document')

In [None]:
# One row per (document, speaker): annotation totals, distinct codes,
# the document's annotator count, and whether any annotation was a non-greeting.
hda = (
    hda.groupby(['document', 'speaker'])
    .agg(**{
        'annotation count': ('code', 'count'),
        'num unique codes': ('code', 'nunique'),
        'num annotators': ('num annotators', 'first'),
        'contains non-greeting': ('contains non-greeting', 'any'),
    })
    .reset_index()
)

# Keep only speakers with a positive ID.
hda = hda[hda['speaker'] > 0].reset_index(drop=True)

# Each annotator contributes their own annotations, so normalise by annotator count.
hda['avg annotation count'] = hda['annotation count'] / hda['num annotators']
hda

In [None]:
# Histogram of per-helper activity with median and mean marked.
avg_counts = hda['avg annotation count']

hda.hist('avg annotation count', bins=25, figsize=(10, 4))
for value, color, template in [(avg_counts.median(), 'yellow', 'Median = {:.2f}'),
                               (avg_counts.mean(), 'tab:orange', 'Mean = {:.2f}')]:
    plt.axvline(value, ymin=0, ymax=1, color=color, linestyle='--',
                label=template.format(value))
plt.xlabel('Total number of message chunks sent (for the whole chat)')
plt.ylabel('Number of helpers')
plt.title('Total chat activity per helper, n={} (averaged across annotators)'.format(len(hda)))
plt.legend()
plt.show()

print('{} helpers sent at least 1 non-greeting message chunk'.format(
    hda['contains non-greeting'].sum()))

# Helper counts at a few activity thresholds.
for t in [2, 3, 4, 5, 10]:
    n_at_least = (avg_counts >= t).sum()
    print('{} helpers sent at least {} message chunks'.format(n_at_least, t))

In [None]:
# Helpers whose annotations are all greetings but who still have >= 10 annotations.
# The comparison must be parenthesised: `&` binds tighter than `>=`, so without
# the parentheses this would evaluate `(~flag & count) >= 10`.
# display() is needed because a bare expression in the middle of a cell is not rendered.
greeting_only_active = hda[~hda['contains non-greeting'] & (hda['annotation count'] >= 10)]
display(greeting_only_active)
if verbose_flag:
    show(hda[~hda['contains non-greeting']])

## Double check data validity

In [None]:
# Number of distinct documents in the annotation table.
da1['document'].nunique()

In [None]:
# Total number of rows in the annotation table.
len(da1)

In [None]:
# data validity check
print(tmp['num annotators'].value_counts()) # annotator count per document
dd = tmp.loc[tmp['num annotators'] == 1, 'document'].to_list()
da1[da1['document'].isin(dd)].groupby('document')['annotator'].unique()

In [None]:
# Annotation totals per top-level code family.
primary = da1['code_primary']
family_masks = [
    ('Helping annotations:    ', primary == 'Helping'),
    ('Questioning annotations:', primary == 'Questioning'),
    ('Attitude annotations:   ', primary.str.startswith('Attitude')),
    ('Structural annotations: ', primary.str.startswith('Big picture')),
]
for label, mask in family_masks:
    print(label, mask.sum())
# da1['code_primary'].value_counts()

In [None]:
# da1.loc[da1['code_primary'].str.startswith('Big picture of an interaction'), 'code'].value_counts()
# Count the 'Big picture' annotations that mark a request vs. a request resolution.
big_picture_prefix = 'Big picture of an interaction > '
primary = da1['code_primary']
print('Requests:', primary.str.startswith(big_picture_prefix + 'request').sum())
print('Outcomes:', primary.str.startswith(big_picture_prefix + 'resolveRequest').sum()) # ??

## Get request types by majority vote across annotators per interaction

In [None]:
# One voted request label per (document, conversation), then tally the labels.
per_conversation = da1.groupby(['document', 'conversation_number'])
per_conversation[['voted_conversation_requests']].first().value_counts()

### Measure dissent

In [None]:
# one entry per conversation-annotator
da2_condensed = da1[da1['code'].str.startswith('Big picture of an interaction > resolveRequest')]
da2_condensed.shape

In [None]:
# True where an annotator's own request label equals the majority-voted label.
requests_agree = da2_condensed['conversation_requests'].eq(
    da2_condensed['voted_conversation_requests'])
requests_agree.value_counts()

In [None]:
# Majority-voted request labels of the rows where the annotator dissented.
da2_condensed.loc[~requests_agree, 'voted_conversation_requests'].value_counts()

In [None]:
# Tabulate (majority label, dissenting label) pairs and how often each occurs.
request_dissent = da2_condensed.loc[~requests_agree]
a = request_dissent['voted_conversation_requests'].tolist()
b = request_dissent['conversation_requests'].tolist()
vals, counts = np.unique(list(zip(a, b)), return_counts=True, axis=0)

print('MAJORITY         CLAIMED                         NO. INSTANCES')
for (maj, cla), n_instances in zip(vals, counts):
    # left-justified columns, padded to 15 and 30 characters
    print('{:<15}  {:<30}  {}'.format(maj, cla, n_instances))

## Get outcomes by majority vote across annotators per interaction

In [None]:
# One voted outcome per (document, conversation), then tally the outcomes.
per_conversation = da1.groupby(['document', 'conversation_number'])
per_conversation[['voted_conversation_outcome']].first().value_counts()

### Measure dissent

In [None]:
# True where an annotator's own outcome label equals the majority-voted outcome.
outcomes_agree = da2_condensed['conversation_outcome'].eq(
    da2_condensed['voted_conversation_outcome'])
outcomes_agree.value_counts()

In [None]:
# Majority-voted outcomes of the rows where the annotator dissented.
da2_condensed.loc[~outcomes_agree, 'voted_conversation_outcome'].value_counts()

In [None]:
# Tabulate (majority outcome, dissenting outcome, voted request) triples and their frequency.
outcome_dissent = da2_condensed.loc[~outcomes_agree]
a = outcome_dissent['voted_conversation_outcome'].tolist()
b = outcome_dissent['conversation_outcome'].tolist()
c = outcome_dissent['voted_conversation_requests'].tolist()
vals, counts = np.unique(list(zip(a, b, c)), return_counts=True, axis=0)

print('REQUESTS              MAJORITY    CLAIMED     NO. INSTANCES')
for (maj, cla, req), n_instances in zip(vals, counts):
    # left-justified columns, padded to 20, 10 and 10 characters
    print('{:<20}  {:<10}  {:<10}  {}'.format(req, maj, cla, n_instances))

In [None]:
# interestingly, most disagreement on outcome occurs in codeWrite requests

## Get outcomes per request type

In [None]:
# One row per (document, conversation) carrying its voted request type and voted outcome.
voted_columns = ['voted_conversation_requests', 'voted_conversation_outcome']
convda = da1.groupby(['document', 'conversation_number'])[voted_columns].first()
if verbose_flag:
    show(convda)

In [None]:
# A conversation counts as a success when its voted outcome is 'S'.
convda['success'] = convda['voted_conversation_outcome'].eq('S')

In [None]:
# Successes, totals and success rate per voted request type, most frequent first.
request_outcomes_da = (
    convda
    .groupby(by=['voted_conversation_requests'])
    .aggregate({'success' : ['sum', 'count']})
)
n_success = request_outcomes_da[('success', 'sum')]
n_total = request_outcomes_da[('success', 'count')]
request_outcomes_da[('success', 'rate')] = n_success / n_total
request_outcomes_da.sort_values(('success', 'count'), ascending=False)

## Content domain frequencies

In [None]:
# this should eventually move into the data preprocessing script
if codebook_version == 1:
    da1['code_contentDomain'] = np.where(da1['code'].str.startswith('General message attributes > contentDomain'), 
                                         da1['code'].str.split(' > ').str[2], 
                                         'N/A')

# get the relevant rows
conda = da1[da1['code_contentDomain'] != 'N/A'].copy()

# shorten some variable names to make the labels on the graph a little easier to look at
if codebook_version in {1, 2, 3}:
    conda['code_contentDomain'] = conda['code_contentDomain'].replace(
        {'proposedNewCode' : 'newCode', 
         'codeSpecifications' : 'specifications'}) 
if codebook_version in {4}:
    conda['code_contentDomain'] = conda['code_contentDomain'].replace(
        {'higherLevelInstruction' : 'holisticHelp', 
         'rapportBuilding' : 'rapport', 
         'codeSpecifications' : 'specifications'})

# compute the thing
def tmp(da):
    da = da['code_contentDomain'].value_counts()
    for c in conda['code_contentDomain'].unique():
        if not c in da.index:
            da[c] = 0
    return da.sort_index()

print(conda['code_contentDomain'].value_counts())

confr = conda.groupby(by='annotator').apply(tmp)
confr = confr.median(axis=0).sort_values(ascending=False)

In [None]:
# Font sizes for tick labels, title and axis labels; later plotting cells read them too.
fl, ft, fa = 18, 20, 20

# Bar chart of the median (across annotators) count per content domain.
fig = plt.figure(figsize=(12, 4))
plt.bar(confr.index, confr)
plt.title('Content domain frequencies across all data', fontsize=ft)
plt.ylabel('Number of instances', fontsize=fa)
plt.xlabel('Content domain', fontsize=fa)
plt.xticks(rotation=50, ha='right', fontsize=fl)
fig.show()
if output_flag:
    content_counts_path = os.path.join(output_dir, 'content-counts.png')
    fig.savefig(content_counts_path, bbox_inches = 'tight')

### Do it again but per-request-type

#### Show them individually

In [None]:
# Counts per (request type, annotator), then the median across annotators within each request type.
per_request_annotator = conda.groupby(['voted_conversation_requests', 'annotator']).apply(tmp)
confr = per_request_annotator.groupby('voted_conversation_requests').median()
confr

In [None]:
# One bar chart per request type; figures are closed right away unless verbose output is on.
fl, ft, fa = 18, 20, 20
for request in confr.index:
    fr = confr.loc[request].sort_values(ascending=False)

    fig = plt.figure(figsize=(12, 4))
    plt.bar(fr.index, fr)
    plt.title('Content domain frequencies for request type {}'.format(request), fontsize=ft)
    plt.xlabel('Content domain', fontsize=fa)
    plt.ylabel('Number of instances', fontsize=fa)
    plt.xticks(rotation=50, ha='right', fontsize=fl)

    if verbose_flag:
        fig.show()
    if output_flag:
        figure_name = '{}-content-counts.png'.format(request)
        fig.savefig(os.path.join(output_dir, figure_name), bbox_inches = 'tight')
    if not verbose_flag:
        plt.close(fig)

#### Show them all at once

In [None]:
# Display labels for content-domain codes (code label -> label shown on figures).
content_domain_names = dict(
    bug='bug',
    codeOpinion='code opinion',
    codingConcept='coding concept',
    codingExperience='coding experience',
    developmentStrategy='development strategy',
    errorLocation='error location',
    errorMsg='error message',
    learningResources='learning resources',
    newCode='proposed new code',
    originalCode='original code',
    personalInfo='personal information',
    platformRelated='platform related',
    specifications='specifications',
    testCases='test cases',
)

# Display labels for request types.
request_names = dict(
    bugFix='bug fixing',
    codeComprehension='code comprehension',
    codeImprove='code improvement',
    codeWrite='code writing',
)

In [None]:
# Row-normalise so each request type sums to 1, then switch to the display labels.
row_totals = confr.sum(axis=1)
confrp = confr.div(row_totals, axis=0).rename(columns=content_domain_names,
                                              index=request_names)

In [None]:
# Order content domains by their summed proportion, then put domains on the rows.
domain_order = confrp.sum().sort_values(ascending=False).index
confrp = confrp[domain_order].T
confrp

In [None]:
xlen = len(confrp)
ax = confrp.iloc[:xlen].plot(kind='bar', figsize=(12, 4), width=0.8, zorder=3) # full-width version of Figure 4

# Give each block of `xlen` consecutive patches its own hatch pattern
# (assumes ax.patches lists the bars one series after another).
hatch_styles = 'xO/.'
for position, bar in enumerate(ax.patches):
    block = position // xlen
    if block < len(hatch_styles):
        bar.set_hatch(hatch_styles[block])

ax.set_title('"Content Domain" Frequencies Broken Down by "Issue Request"', fontsize=ft)
ax.set_xlabel('"Content Domain"', fontsize=fa)
ax.set_ylabel('Proportion of Instances', fontsize=fa)
ax.legend(fontsize=fl-2)
plt.grid(zorder=0)
plt.xticks(rotation=30, ha='right', fontsize=fl-2)
plt.yticks(fontsize=fl-2)
if not output_flag:
    plt.show()
else:
    plt.savefig(os.path.join(output_dir, 'request-content-counts.png'), bbox_inches = 'tight')

In [None]:
# Number of content domains (rows of `confrp`) to show in the figure.
xlen = 5
ax = confrp.iloc[:xlen].plot(kind='bar', figsize=(12, 4), width=0.8, zorder=3) # FIGURE 4

# One hatch pattern per plotted series. Iterating over the bar containers keeps
# the hatches aligned with the series even when fewer than `xlen` rows exist;
# slicing ax.patches into fixed blocks of `xlen` would misalign in that case.
for container, hatch in zip(ax.containers, 'xO/.'):
    for bar in container:
        bar.set_hatch(hatch)

ax.set_xlabel('"Content Domain"', fontsize=fa)
ax.set_ylabel('Proportion of Instances', fontsize=fa)
ax.set_title('"Content Domain" Frequencies Broken Down by "Issue Request"', fontsize=ft)
ax.legend(fontsize=fl-2)  # created after hatching so the legend shows the hatches
plt.grid(zorder=0)
plt.xticks(rotation=30, ha='right', fontsize=fl-2)
plt.yticks(fontsize=fl-2)
if output_flag:
    plt.savefig(os.path.join(output_dir, 'request-content-counts.pdf'), bbox_inches = 'tight')
else:
    plt.show()

### Do it again but only for Learner questions

In [None]:
assert codebook_version in {2, 3, 4} # need this structure for this to work

# Learner 'Questioning' annotations only.  # & (conda['conversation_strict'] == True)
is_learner_question = (conda['code_primary'] == 'Questioning') & conda['speakerIsLearner']
qconda = conda[is_learner_question]

# counts per conversation and annotator -> median across annotators -> summed per request type
conversation_keys = ['document', 'conversation_number', 'voted_conversation_requests']
qconfr = qconda.groupby(conversation_keys + ['annotator']).apply(tmp)
qconfr = qconfr.groupby(conversation_keys).median()
qconfr = qconfr.groupby('voted_conversation_requests').sum()
qconfr

In [None]:
assert codebook_version in {2, 3, 4}

qconfrp = qconfr.div(qconfr.sum(axis=1), axis=0) # row-normalize
# qconfrp = qconfrp[qconfr.sum(axis=0).sort_values(ascending=False).index] # by total counts
proportion_order = qconfrp.sum(axis=0).sort_values(ascending=False).index
qconfrp = qconfrp[proportion_order] # by total proportions
# swap in the display labels for request types (rows) and content domains (columns)
qconfrp = qconfrp.rename(index=request_names)
qconfrp = qconfrp.rename(columns=content_domain_names, level=0)
qconfrp

In [None]:
assert codebook_version in {2, 3, 4}

# Pick the label/title variant; LaTeX rendering is switched on before the plot is created.
if supports_latex:
    mpl.rcParams['text.usetex'] = True
    x = '``Content Domain"'
    t = r'$\textsc{Learner}$ ``Questioning" $\to$ ``Content Domain" Frequencies Broken Down by ``Issue Request"'
else:
    x = '"Content Domain"'
    t = 'LEARNER "Questioning" > "Content Domain" Frequencies Broken Down by "Issue Request"'

# First six content domains (rows after transposing), one bar group per domain.
ax = qconfrp.T.iloc[:6].plot(kind='bar', figsize=(12, 4), width=0.8)
ax.set_ylabel('Proportion of Instances', fontsize=fa)
ax.set_xlabel(x, fontsize=fa)
ax.set_title(t, fontsize=ft)
ax.legend(fontsize=fl-2)
plt.grid()
plt.xticks(rotation=50, ha='right', fontsize=fl)
if not output_flag:
    plt.show()
else:
    plt.savefig(os.path.join(output_dir, 'request-learner-question-content-counts.png'), bbox_inches = 'tight')

In [None]:
# Turn LaTeX text rendering back off so later figures use matplotlib's default text rendering.
mpl.rcParams['text.usetex'] = False

### Do it again but only for Helper explanations/help

In [None]:
assert codebook_version in {2, 3, 4} # need this structure for this to work

# 'Helping' annotations from non-learner speakers only.  # & (conda['conversation_strict'] == True)
is_helper_help = (conda['code_primary'] == 'Helping') & ~conda['speakerIsLearner']
hconda = conda[is_helper_help]

# counts per conversation and annotator -> median across annotators -> summed per request type
conversation_keys = ['document', 'conversation_number', 'voted_conversation_requests']
hconfr = hconda.groupby(conversation_keys + ['annotator']).apply(tmp)
hconfr = hconfr.groupby(conversation_keys).median()
hconfr = hconfr.groupby('voted_conversation_requests').sum()
hconfr

In [None]:
assert codebook_version in {2, 3, 4}

hconfrp = hconfr.div(hconfr.sum(axis=1), axis=0) # row-normalize

proportion_order = hconfrp.sum(axis=0).sort_values(ascending=False).index
hconfrp = hconfrp[proportion_order] # by total proportions
# swap in the display labels for request types (rows) and content domains (columns)
hconfrp = hconfrp.rename(index=request_names)
hconfrp = hconfrp.rename(columns=content_domain_names, level=0)
# hconfrp

In [None]:
assert codebook_version in {2, 3, 4}

# Pick the label/title variant; LaTeX rendering is switched on before the plot is created.
if supports_latex:
    mpl.rcParams['text.usetex'] = True
    x = '``Content Domain"'
    t = r'$\textsc{Helper}$ ``Helping" $\to$ ``Content Domain" Frequencies Broken Down by ``Issue Request"'
else:
    x = '"Content Domain"'
    t = 'HELPER "Helping" > "Content Domain" Frequencies Broken Down by "Issue Request"'

# First six content domains (rows after transposing), one bar group per domain.
ax = hconfrp.T.iloc[:6].plot(kind='bar', figsize=(12, 4), width=0.8)
ax.set_ylabel('Proportion of Instances', fontsize=fa)
# ax.set_title('Helping content frequencies broken down by request type', fontsize=ft)
ax.set_xlabel(x, fontsize=fa)
ax.set_title(t, fontsize=ft)
ax.legend(fontsize=fl-2)
plt.xticks(rotation=50, ha='right', fontsize=fl)
plt.grid()
if not output_flag:
    plt.show()
else:
    plt.savefig(os.path.join(output_dir, 'request-helper-helping-content-counts.png'), bbox_inches = 'tight')

In [None]:
# Turn LaTeX text rendering back off so later figures use matplotlib's default text rendering.
mpl.rcParams['text.usetex'] = False

## Experience & personal info

### Learners

#### Coding experience

This doesn't work for codebook 4, because it's been pooled with other things!

In [None]:
assert codebook_version not in {4}

# Learner quotes annotated with the 'codingExperience' content domain, in reading order.
quote_order = ['document', 'conversation_number',
               'quote_startPosition', 'quote_endPosition', 'annotator']
quote_columns = ['document', 'conversation_number', 'annotator', 'quote_text',
                 'quote_startPosition', 'quote_endPosition']
is_learner_experience = (da1['code_contentDomain'] == 'codingExperience') & da1['speakerIsLearner']
expda = da1[is_learner_experience].sort_values(quote_order)[quote_columns]

if verbose_flag:
    # show the full quote text, one table per document
    pd.set_option('display.max_colwidth', None)
    expda.groupby(by=['document']).apply(display)
    pd.reset_option('display.max_colwidth')

In [None]:
assert codebook_version not in {4}

if verbose_flag:
    # "we saw everything in 1 week"
    quote_start = da1['quote_startPosition']
    excerpt_mask = (
        (da1['document'] == 'DRskphqiwF.txt')
        & (da1['conversation_number'] == 0)
        & (da1['annotator'] == 'A')
        & (quote_start > 5200)
        & (quote_start < 5400)
    )
    pd.set_option('display.max_colwidth', None)
    display(da1[excerpt_mask])
    pd.reset_option('display.max_colwidth')

In [None]:
assert codebook_version not in {4}

if verbose_flag:
    # "yes, beginning though"
    quote_start = da1['quote_startPosition']
    excerpt_mask = (
        (da1['document'] == '6SdCx2rR9F.txt')
        & (da1['conversation_number'] == 0)
        & (quote_start > 3500)
        & (quote_start < 3707)
    )
    pd.set_option('display.max_colwidth', None)
    display(da1[excerpt_mask])
    pd.reset_option('display.max_colwidth')

#### Personal info

In [None]:
assert codebook_version not in {4}

# Learner quotes annotated with the 'personalInfo' content domain, in reading order.
quote_order = ['document', 'conversation_number',
               'quote_startPosition', 'quote_endPosition', 'annotator']
quote_columns = ['document', 'conversation_number', 'annotator', 'quote_text',
                 'quote_startPosition', 'quote_endPosition']
is_learner_personal = (da1['code_contentDomain'] == 'personalInfo') & da1['speakerIsLearner']
perda = da1[is_learner_personal].sort_values(quote_order)[quote_columns]

if verbose_flag:
    # show the full quote text, one table per document
    pd.set_option('display.max_colwidth', None)
    perda.groupby(by=['document']).apply(display)
    pd.reset_option('display.max_colwidth')

In [None]:
# Guard only: this cell has no lookup code, just the note below.
assert(not codebook_version in {4})

# nontraditional 1 (3 hr drive)

In [None]:
assert codebook_version not in {4}

if verbose_flag:
    # nontraditional 2 (Level 35)
    quote_start = da1['quote_startPosition']
    excerpt_mask = (
        (da1['document'] == '2I1pDSUuKI.txt')
        & (da1['conversation_number'] == 0)
        & (da1['annotator'] == 'A') # output was too verbose
        & (quote_start > 4100)
        & (quote_start < 4616)
    )
    pd.set_option('display.max_colwidth', None)
    display(da1[excerpt_mask])
    pd.reset_option('display.max_colwidth')

### Helpers

#### Coding experience

In [None]:
assert codebook_version not in {4}

# Non-learner quotes annotated with the 'codingExperience' content domain, in reading order.
quote_order = ['document', 'conversation_number',
               'quote_startPosition', 'quote_endPosition', 'annotator']
quote_columns = ['document', 'conversation_number', 'annotator', 'quote_text',
                 'quote_startPosition', 'quote_endPosition']
is_helper_experience = (da1['code_contentDomain'] == 'codingExperience') & ~da1['speakerIsLearner']
expda = da1[is_helper_experience].sort_values(quote_order)[quote_columns]

if verbose_flag:
    # show the full quote text, one table per document
    pd.set_option('display.max_colwidth', None)
    expda.groupby(by=['document']).apply(display)
    pd.reset_option('display.max_colwidth')

#### Personal info

In [None]:
assert codebook_version not in {4}

# Non-learner quotes annotated with the 'personalInfo' content domain, in reading order.
quote_order = ['document', 'conversation_number',
               'quote_startPosition', 'quote_endPosition', 'annotator']
quote_columns = ['document', 'conversation_number', 'annotator', 'quote_text',
                 'quote_startPosition', 'quote_endPosition']
is_helper_personal = (da1['code_contentDomain'] == 'personalInfo') & ~da1['speakerIsLearner']
perda = da1[is_helper_personal].sort_values(quote_order)[quote_columns]

if verbose_flag:
    # show the full quote text, one table per document
    pd.set_option('display.max_colwidth', None)
    perda.groupby(by=['document']).apply(display)
    pd.reset_option('display.max_colwidth')

In [None]:
assert codebook_version not in {4}

if verbose_flag:
    # 'I am in 11th grade'
    quote_start = da1['quote_startPosition']
    excerpt_mask = (
        (da1['document'] == 'nvPpBOafGk.txt')
        & (da1['conversation_number'] == 0)
        & (da1['annotator'] == 'A') # output was too verbose
        #& (quote_start > 2900)
        & (quote_start < 3681)
    )
    pd.set_option('display.max_colwidth', None)
    display(da1[excerpt_mask])
    pd.reset_option('display.max_colwidth')

#### Teaching philosophy

In [None]:
# Non-learner quotes coded as expressing a teaching philosophy, in reading order.
quote_order = ['document', 'conversation_number',
               'quote_startPosition', 'quote_endPosition', 'annotator']
quote_columns = ['document', 'conversation_number', 'annotator', 'quote_text',
                 'quote_startPosition', 'quote_endPosition']
is_helper_philosophy = ((da1['code'] == 'Attitude, tone, or mood > expressTeachingPhilosophy')
                        & ~da1['speakerIsLearner'])
phida = da1[is_helper_philosophy].sort_values(quote_order)[quote_columns]

if verbose_flag:
    # show the full quote text, one table per document
    pd.set_option('display.max_colwidth', None)
    phida.groupby(by=['document']).apply(display)
    pd.reset_option('display.max_colwidth')

## Attitude/tone/mood frequencies

In [None]:
# Display labels for attitude/tone/mood codes (code label -> label shown on figures).
attitude_names = dict(
    greeting='greeting',
    expressSupportingWords='supporting words',
    expressSatisfactionOrGratitude='gratitude',
    beingWrong='being incorrect',
    apology='apology',
    expressTeachingPhilosophy='teaching philosophy',
    beingLost='being lost',
    frustration='frustration',
    selfTalk='negative self-talk',
)

In [None]:
# get the relevant rows
attda = da1[da1['code'].str.startswith('Attitude, tone, or mood')].copy()
#attda['code'] = attda['code'].str[len('Attitude, tone, or mood > '):]
attda['code'] = attda['code'].str.split(' > ').str[-1]

# shorten some variable names to make the labels on the graph a little easier to look at
attda['code'] = attda['code'].replace(attitude_names)
print(len(attda))

In [None]:
# Number of attitude annotations per (renamed) attitude label.
attda['code'].value_counts()

In [None]:
# Attitude labels grouped into a positive and a negative set, with row counts for each.
positive_names = ['greeting', 'supporting words', 'gratitude',
                  'being incorrect', 'apology', 'teaching philosophy']
negative_names = ['being lost', 'frustration', 'negative self-talk']

for polarity, names in (('positive', positive_names), ('negative', negative_names)):
    print('{} count'.format(polarity), attda['code'].isin(names).sum())

In [None]:
# compute the thing
def tmp(da):
    da = da['code'].value_counts()
    for c in attda['code'].unique():
        if not c in da.index:
            da[c] = 0
    return da.sort_index()

attfr = attda.groupby(by='annotator').apply(tmp)
attfr = attfr.median(axis=0).sort_values(ascending=False)

attfr

In [None]:
# this is not particularly interesting imo
fig = plt.figure(figsize=(12, 4))
plt.bar(attfr.index, attfr)
fl, ft, fa = 18, 20, 20
plt.xlabel('Expression of Attitude, Tone, or Mood', fontsize=fa)
plt.ylabel('Number of Instances', fontsize=fa)
plt.title('"Attitude" Frequencies Across All Data', fontsize=ft)
plt.xticks(rotation=50, ha='right', fontsize=fl)
fig.show()
if output_flag:
    fig.savefig(os.path.join(output_dir, 'attitude-counts.png'), bbox_inches = 'tight')

In [None]:
# Counts per (request type, annotator), then the median across annotators within each request type.
per_request_annotator = attda.groupby(['voted_conversation_requests', 'annotator']).apply(tmp)
attfr = per_request_annotator.groupby('voted_conversation_requests').median()
attfr

In [None]:
# One bar chart per request type; figures are closed right away unless verbose output is on.
fl, ft, fa = 18, 20, 20
for request in attfr.index:
    fr = attfr.loc[request].sort_values(ascending=False)

    fig = plt.figure(figsize=(12, 4))
    plt.bar(fr.index, fr)
    plt.title('Attitude/tone/mood frequencies for request type {}'.format(request), fontsize=ft)
    plt.xlabel('Expression of attitude, tone, or mood', fontsize=fa)
    plt.ylabel('Number of instances', fontsize=fa)
    plt.xticks(rotation=50, ha='right', fontsize=fl)

    if verbose_flag:
        fig.show()
    if output_flag:
        figure_name = '{}-attitude-counts.png'.format(request)
        fig.savefig(os.path.join(output_dir, figure_name), bbox_inches = 'tight')
    if not verbose_flag:
        plt.close(fig)

In [None]:
# Row-normalise so each request type sums to 1. The attitude labels were already
# replaced via `attitude_names` when `attda` was built, so only the request types
# (the row index) still need their display labels.
request_totals = attfr.sum(axis=1)
attfrp = attfr.div(request_totals, axis=0)
attfrp = attfrp.rename(index=request_names)

In [None]:
# Order attitude labels by their summed proportion, then put labels on the rows.
label_order = attfrp.sum().sort_values(ascending=False).index
attfrp = attfrp[label_order].T
attfrp

In [None]:
xlim = len(attfrp) #5
ax = attfrp.iloc[:xlim].plot(kind='bar', figsize=(12, 4), width=0.8, zorder=3) # full-width version of Figure 5

# Give each block of `xlim` consecutive patches its own hatch pattern
# (assumes ax.patches lists the bars one series after another).
hatch_styles = 'xO/.'
for position, bar in enumerate(ax.patches):
    block = position // xlim
    if block < len(hatch_styles):
        bar.set_hatch(hatch_styles[block])

ax.set_title('"Expression" Frequencies Broken Down by "Issue Request"', fontsize=ft)
ax.set_xlabel('"Expression"', fontsize=fa)
ax.set_ylabel('Proportion of Instances', fontsize=fa)
ax.legend(fontsize=fl-2)
plt.grid(zorder=0)
plt.xticks(rotation=30, ha='right', fontsize=fl-2)
plt.yticks(fontsize=fl-2)
if not output_flag:
    plt.show()
else:
    plt.savefig(os.path.join(output_dir, 'request-attitude-counts.png'), bbox_inches = 'tight')

In [None]:
# Number of attitude labels (rows of `attfrp`) to show in the figure.
xlim = 5
ax = attfrp.iloc[:xlim].plot(kind='bar', figsize=(12, 4), width=0.8, zorder=3) # FIGURE 5

# One hatch pattern per plotted series. Iterating over the bar containers keeps
# the hatches aligned with the series even when fewer than `xlim` rows exist;
# slicing ax.patches into fixed blocks of `xlim` would misalign in that case.
for container, hatch in zip(ax.containers, 'xO/.'):
    for bar in container:
        bar.set_hatch(hatch)

ax.set_xlabel('"Expression"', fontsize=fa)
ax.set_ylabel('Proportion of Instances', fontsize=fa)
ax.set_title('"Expression" Frequencies Broken Down by "Issue Request"', fontsize=ft)
ax.legend(fontsize=fl-2)  # created after hatching so the legend shows the hatches
plt.grid(zorder=0)
plt.xticks(rotation=30, ha='right', fontsize=fl-2)
plt.yticks(fontsize=fl-2)
if output_flag:
    plt.savefig(os.path.join(output_dir, 'request-attitude-counts.pdf'), bbox_inches = 'tight')
else:
    plt.show()

- It's worth noting that the `codeComprehension` and `codeImprove` request types both suffer from small denominators, so their proportions are noisy
- "greeting" is low for `codeComprehension` requests, potentially indicating that such a request is not usually the first request in a document
- "greeting" is high for `codeImprove` requests, potentially indicating that these requests don't tend to stem from prior requests
- "supporting words" (`expressSupportingWords`) and "negative self-talk" (`selfTalk`) are both high for `codeComprehension` requests, potentially indicating that Learners find this more difficult than other request types
- "supporting words" is low for `codeImprove` requests, potentially indicating that Helpers perceive that Learners are struggling less or are at a more advanced level

### Taking a look at the conversation number hypothesis

In [None]:
# counts per (document, conversation, annotator) -> median across annotators
# -> mean across documents for each conversation number
per_annotator = attda.groupby(['document', 'conversation_number', 'annotator']).apply(tmp)
per_conversation = per_annotator.groupby(['document', 'conversation_number']).median()
attfr = per_conversation.groupby('conversation_number').mean()
attfr

In [None]:
# One bar chart per conversation number; figures are closed right away unless verbose output is on.
fl, ft, fa = 18, 20, 20
for n in attfr.index:
    fr = attfr.loc[n]

    fig = plt.figure(figsize=(12, 4))
    plt.bar(fr.index, fr)
    plt.title('Attitude/tone/mood frequencies for conversation {}'.format(n), fontsize=ft)
    plt.xlabel('Expression of attitude, tone, or mood', fontsize=fa)
    plt.ylabel('Average number of instances', fontsize=fa)
    plt.xticks(rotation=50, ha='right', fontsize=fl)

    if verbose_flag:
        fig.show()
    if output_flag:
        figure_name = 'conv{}-attitude-counts.png'.format(n)
        fig.savefig(os.path.join(output_dir, figure_name), bbox_inches = 'tight')
    if not verbose_flag:
        plt.close(fig)

In [None]:
# Grouped bars: one group per attitude label, one bar per conversation number.
ax = attfr.T.plot(kind='bar', figsize=(12, 4), width=0.8)
ax.set_title('Attitude/tone/mood frequencies broken down by conversation number', fontsize=ft)
ax.set_xlabel('Expression of attitude, tone, or mood', fontsize=fa)
ax.set_ylabel('Average number of instances', fontsize=fa)
ax.legend(fontsize=fl-2)
plt.xticks(rotation=50, ha='right', fontsize=fl)
if not output_flag:
    plt.show()
else:
    plt.savefig(os.path.join(output_dir, 'conv-attitude-counts.png'), bbox_inches = 'tight')

# Communication mechanism frequencies

In [None]:
assert codebook_version in {2}

# get the relevant rows
cmda = da1[da1['code_communicationMechanism'] != 'N/A'].copy()

# shorten some variable names to make the labels on the graph a little easier to look at
# if codebook_version in {2}:
#     cmda['code_communicationMechanism'] = cmda['code_communicationMechanism'].replace(
#         {'proposedNewCode' : 'newCode', 
#          'codeSpecifications' : 'specifications'}) 

# compute the thing
def tmp(da):
    """Count rows of `da` per communication mechanism.

    Every mechanism that occurs in the notebook-level `cmda` frame but not in
    `da` is added with a count of 0; the result is sorted by mechanism label.
    """
    counts = da['code_communicationMechanism'].value_counts()
    absent = [c for c in cmda['code_communicationMechanism'].unique() if c not in counts.index]
    for c in absent:
        counts[c] = 0
    return counts.sort_index()

# per-annotator counts, then the median across annotators for each mechanism
per_annotator_counts = cmda.groupby(by='annotator').apply(tmp)
cmfr = per_annotator_counts.median(axis=0).sort_values(ascending=False)

In [None]:
# Font sizes for tick labels, title and axis labels.
fl, ft, fa = 18, 20, 20

# Bar chart of the median (across annotators) count per communication mechanism.
fig = plt.figure(figsize=(12, 4))
plt.bar(cmfr.index, cmfr)
plt.title('Communication mechanism frequencies across all data', fontsize=ft)
plt.ylabel('Number of instances', fontsize=fa)
plt.xlabel('Communication mechanism', fontsize=fa)
plt.xticks(rotation=50, ha='right', fontsize=fl)
fig.show()
# if output_flag:
#     fig.savefig(os.path.join(output_dir, 'commMech-counts.png'), bbox_inches = 'tight')

# Other

## 5.2.3 Helper-specific strategies and varied participation

In [None]:
# (column, value) pairs whose Learner/Helper split is reported below.
candidate_codes = [('code_communicationMechanism', 'guideInteractively'), 
                   ('code_communicationMechanism', 'teachWithExtensions'), 
                   ('code_questionType', 'guiding')]
for col, val in candidate_codes:
    print('{} > {}'.format(col, val))
    # Named `speaker_counts` rather than `tmp`, so the `tmp` helper function
    # defined in earlier cells is not overwritten when this cell runs.
    speaker_counts = da1.loc[da1[col] == val, 'speakerIsLearner'].value_counts()
    L = speaker_counts[True] if True in speaker_counts.index else 0
    H = speaker_counts[False] if False in speaker_counts.index else 0
    total = L + H
    if total == 0:
        # no annotation carries this code: report it instead of dividing by zero
        print('\tLearner: 0 \tHelper: 0 (no annotations with this code)')
        continue
    print('\tLearner: {} ({:.2f}%) \tHelper: {} ({:.2f}%)'.format(
        L, L/total * 100, 
        H, H/total * 100))

In [None]:
# Number of attitude annotations per (renamed) attitude label.
attda['code'].value_counts()

In [None]:
# Total number of attitude annotation rows.
len(attda)

In [None]:
# 'negative self-talk' annotations split by the speakerIsLearner flag.
attda.loc[attda['code'] == 'negative self-talk', 'speakerIsLearner'].value_counts()

In [None]:
# 'supporting words' annotations split by the speakerIsLearner flag.
attda.loc[attda['code'] == 'supporting words', 'speakerIsLearner'].value_counts()