## Summarize the data scraped from the Excel workbook, and check it against previously calculated metadata.

In [None]:
import os
import pandas as pd  # for data storage and manipulation
import seaborn as sns  # for plotting

# common.py is a local file with shared constants
import common

In [None]:
comments = {}  # Log any comments through processing and report them all at the end.


def safe_load(f):
    """Read a tab-separated file into a DataFrame, using its first column as the index.

    Args:
        f: Path of the tab-separated file to read.

    Returns:
        The DataFrame parsed from ``f``.

    Raises:
        FileNotFoundError: If ``f`` is not an existing file. The message names
            the notebook that has to be run first to produce it.
    """
    if not os.path.isfile(f):
        # Fail with a clear, specific error here. Falling through to the return
        # would otherwise raise an unrelated UnboundLocalError because no
        # DataFrame was ever created.
        raise FileNotFoundError(
            "{} not found. 01_collect_data.ipynb needs to be run before any analyses.".format(f)
        )
    df = pd.read_csv(f, sep='\t', index_col=0)
    print("read {:,} records from {}".format(len(df.index), f))
    return df

# Load both tables from the paths configured in common.py, data first and then meta.
data, meta = (safe_load(path) for path in (common.data_file, common.meta_file))

### Calculate hit rates and compare with those in Excel

In [None]:
""" Aggregate the hits and misses, by sheet, to facilitate calculating a hit rate. """

# One row per sheet, one column per distinct raw Hit/Miss value.
df_hitmiss = pd.crosstab(data['sheet'], data['Hit_Miss_raw'])

# The raw labels carry literal single quotes, so the column keys include them.
hits = df_hitmiss["'Hit'"]
misses = df_hitmiss["'Miss'"]

# Hit rate = hits / all trials; the assignment aligns on meta's index.
meta['py_hit_rate'] = hits / (misses + hits)

# Persist the new column back to the metadata file.
meta.to_csv(common.meta_file, sep="\t")

In [None]:
""" Find and report any differences between my calculations and those in the Excel workbook. """

# Rows where the spreadsheet's rate and the recomputed rate are not exactly equal.
# Note that `!=` also flags rows where either value is NaN.
df_diffs = meta[meta['xl_hit_rate'] != meta['py_hit_rate']]
n_diffs = len(df_diffs.index)

parts = ["I checked that my calculated hit rates match the Excel hit rates. "]
if n_diffs == 0:
    # Nothing differed, so explaining a discrepancy here would make the report false.
    parts.append("No hit rates differed. ")
else:
    # Singular only for exactly one difference.
    parts.append("{} hit rate{} differed: ".format(n_diffs, "" if n_diffs == 1 else "s"))
    # One sentence per differing row: its index label and both rates as percentages.
    parts.extend(
        "{} reported {:.2%}; I calculated {:.2%}. ".format(
            diff.Index, diff.xl_hit_rate, diff.py_hit_rate
        )
        for diff in df_diffs.itertuples()
    )
    parts.append("The excel sheet formula excluded one 'Hit' cell, reducing the hit rate. ")
    parts.append("This notebook therefore seems to be accurate, even though it differs from the spreadsheet.")

comment = "".join(parts)
comments['hit rates'] = comment

In [None]:
""" Print out any comments accumulated while running the notebook. """
# Sections are listed explicitly so the report order is fixed.
report_sections = ('hit rates',)

print("## Comments")
for section in report_sections:
    heading = "\n### {}\n".format(section)
    print(heading)
    print(comments[section])

In [None]:
# Spot-check four randomly chosen rows of the per-sheet hit/miss counts.
df_hitmiss.sample(n=4)

In [None]:
# Spot-check four randomly chosen rows of the metadata table.
meta.sample(n=4)

In [None]:
# Parametric statistics assume (among other things) normally distributed results,
# so inspect the hit-rate distribution before relying on them.
# The colors come from common.py.
# NOTE(review): distplot and the shade= argument are deprecated in newer seaborn
# releases; confirm the installed version before upgrading.
p = sns.distplot(meta['py_hit_rate'], hist=True, bins=15, color='gray')

# Overlay one shaded density curve per sex on the same axes: male first, then female.
for sex_code, tint, legend_label in (
    ('M', common.bluish, "male"),
    ('F', common.reddish, "female"),
):
    subset_rates = meta.loc[meta['sex'] == sex_code, 'py_hit_rate']
    p = sns.kdeplot(subset_rates, color=tint, label=legend_label, shade=True, ax=p)

print("Seems symmetrical and roughly normal. But is there a bimodal distribution?")
p.get_figure().savefig('hit_rate_distribution.png')

In [None]:
# Same data shown as a box plot with the individual points swarmed on top.
# It adds nothing new and is perhaps harder to read.
by_sex = dict(x='py_hit_rate', y='sex', data=meta)
p = sns.boxplot(palette=common.light_palette, **by_sex)
p = sns.swarmplot(palette=common.dark_palette, ax=p, **by_sex)