# Pilot Ratings Analysis

## Introduction

This notebook aims to help analyze ratings data from the Moth pilot study, with the goal of selecting stimuli for inclusion in the primary study.

## Setup

1. Follow the directions from the [README](README.ipynb)
2. Serialize the current ratings data to a CSV file:

In [None]:
# Shell out to the pilot.serialize module; the Preprocessing cells read backups/pilot_ratings.csv back in
! python -m pilot.serialize --ratings backups/pilot_ratings.csv

## Preprocessing

Crunch the data into a useful format (this can take a few minutes).

In [None]:
import pandas as pd
import json
import math
import datetime
import emotioncf
import matplotlib.pyplot as plt
import matplotlib.colors

% matplotlib inline

In [None]:
# Read the CSV written by the Setup step (rows keyed by RateID), then expand each
# row's JSON "Ratings" string into one numeric column per key
inputFrame = pd.read_csv("backups/pilot_ratings.csv").set_index("RateID")
parsedRatings = (
    inputFrame["Ratings"]
    .apply(lambda rawJson: pd.to_numeric(pd.Series(json.loads(rawJson))))
    .replace(-1, 0)  # recode -1 ratings as 0
)
# Original columns and the expanded rating columns, side by side
masterFrame = pd.concat([inputFrame, parsedRatings], axis = 1)

In [None]:
# Build a StimID -> StimName lookup table, used later to title the plots.
# Duplicates are dropped on StimID *before* it becomes the index: drop_duplicates()
# only compares columns, so de-duplicating after set_index() would compare StimName
# alone and silently discard any stimulus that shares its name with an earlier one.
# Keying on StimID also guarantees exactly one row per stimulus, so
# stimuli.loc[stimID] always yields a single row.
stimuli = (
    masterFrame[["StimID", "StimName"]]
    .drop_duplicates(subset = "StimID")
    .set_index("StimID")
)

In [None]:
# Discard the columns the analysis never touches, then key every remaining row
# by StimID, SeshID and PollSec
unusedColumns = ["ParticipantID", "StimName", "SliceStartSec", "ReactionTime", "Ratings", "RateID"]
simplified = (
    masterFrame
    .reset_index()
    .drop(columns = unusedColumns)
    .set_index(["StimID", "SeshID", "PollSec"])
)

In [None]:
# Melt the ratings into long form, one row per StimID / SeshID / PollSec / Emotion.
# The resulting frame is:  - Hierarchical index: StimID > Emotion > SeshID
#                          - Columns: PollSec and Rating
# stack() moves the remaining column labels into a fourth, unnamed index level,
# which is given the name "Emotion" here.
longRatings = simplified.stack().to_frame(name = "Rating")
longRatings.index.names = longRatings.index.names[:3] + ["Emotion"]
keyOrder = ["StimID", "Emotion", "SeshID"]
reshaped = (
    longRatings
    .reset_index()
    .sort_values(by = keyOrder)
    .set_index(keyOrder)
)

## Computation

In [None]:
# Dilation settings to sweep over; each value is handed to cf.fit() as dilate_ts_n_samples
dilations = [15, 30, 45, 60]
# Fitted cf objects, stored as collabs[stim][emotion][dilation]
collabs = dict()

In [None]:
# Fit one collaborative-filtering (cf) object per stimulus > emotion > dilation and
# keep each one in collabs[stim][emotion][dilation] for the Visualization section.
stimGrouped = reshaped.reset_index().groupby("StimID", sort = False)
for stim, stimGroup in stimGrouped:
    # Get rid of any columns with no data at all for this stim
    prunedStimGroup = stimGroup.dropna(how = "all", axis = "columns")
    emotionGrouped = prunedStimGroup.groupby("Emotion", sort = False)
    for emotion, emotionGroup in emotionGrouped:
        # With stim and emotion fixed, only SeshID, PollSec and Rating are left
        data = emotionGroup.drop(["StimID", "Emotion"], axis = 1)
        # Rename for emotionCF and create a matrix. Plain assignment replaces
        # set_axis(..., inplace = True), which newer pandas releases no longer accept.
        data.columns = ["Subject", "Item", "Rating"]
        matrix = emotioncf.data.create_sub_by_item_matrix(data)
        for dilation in dilations:
            try: # Don't stop when occasional 'cannot broadcast result' errors are hit
                # Build/fit/run/store a cf object
                cf = emotioncf.cf.NNMF_multiplicative(matrix)
                cf.fit(dilate_ts_n_samples = dilation)
                cf.predict()
                collabs.setdefault(stim, {}).setdefault(emotion, {})[dilation] = cf
                # Optional: if >0 ratings were predicted, save a plot
#                 if cf.predicted_ratings.dropna().shape[0] > 0:
#                     cf.plot_predictions()[0].savefig("pr_figs/Stim%i_%s@%i.png" % (stim, emotion, dilation))
#                     plt.clf()
            except Exception as error:
                # Deliberately best-effort: report the failure and move on to the next dilation.
                # print(...) with a single pre-formatted string runs on both Python 2 and 3.
                print("The following error occured for stim %i's emotion %s @ dilation %i: %s" % (stim, emotion, dilation, error))

## Visualization

In [None]:
binLength = 15 # For plotting, downsample to bins of this length (in seconds)
minMean = 20 # Only plot emotions with means of at least this, OR...
minPeak = 50 # ... peaks of at least this
emotionColors = {"Anger": "red",                   # Map emotions to colors
                 "Contempt": "maroon",
                 "Disgust": "olive",
                 "Elation": "magenta",
                 "Envy": "seagreen",
                 "Fear": "palegoldenrod",
                 "Guilt": "darkorange",
                 "Hope": "gold",
                 "Interest": "dimgray",
                 "Joy": "darkorchid",
                 "Pride": "lightsteelblue",
                 "Relief": "skyblue",
                 "Sadness": "navy",
                 "Satisfaction": "cornflowerblue",
                 "Shame": "greenyellow",
                 "Surprise": "lightblue"}

# All of this ignores dilations, just using the first one
plotDilation = dilations[0]

# For every stimulus: average the raw ratings per emotion in binLength-second bins,
# keep only the emotions that pass the minMean / minPeak criteria, and save one
# line plot per stimulus under pr_emotion_means/.
# .items() is used instead of the Python 2-only .iteritems() so this runs on 2 and 3.
for stim, emotions in collabs.items():
    stimName = stimuli.loc[stim].StimName
    # Pick any emotion to count participants (all should have the same).
    # next(iter(...)) is used because dict views cannot be indexed on Python 3.
    numParticipants = next(iter(emotions.values()))[plotDilation].ratings.shape[0]
    avgsByEmotion = {}
    for emotion, cfs in emotions.items():
        # Downsample: transpose so time is the index, resample into bins, transpose back
        crunched = cfs[plotDilation].ratings.T
        crunched.index = pd.to_datetime(crunched.index, unit = "s")
        crunched = crunched.resample("%iS" % binLength).mean().T
        # Turn the bin timestamps back into plain seconds. The hour is included so that
        # stimuli lasting an hour or more do not wrap back around to zero.
        crunched = crunched.rename(columns = lambda x: x.hour * 3600 + x.minute * 60 + x.second)
        # Flatten down to averages
        avgsByEmotion[emotion] = crunched.mean(axis = 0)

    # Load average emotions into pandas and prune away boring ones based on criteria at top
    avgs = pd.DataFrame(avgsByEmotion)
    pruned = avgs[avgs.columns[(avgs.max() >= minPeak) | (avgs.mean() >= minMean)]]
    if pruned.empty:
        # DataFrame.plot raises on a frame with nothing to draw; skip instead of aborting the loop
        print("Stim %i: no emotion met the minMean/minPeak criteria, so no plot was saved" % stim)
        continue
    # Choose the subset of colors needed based on which emotions were pruned
    colors = [emotionColors[emotion] for emotion in pruned.columns]
    colorMap = matplotlib.colors.ListedColormap(colors = colors)
    # On x axis, put tick marks at every point
    xticks = list(pruned.index)
    thisLength = pruned.index.max()
    # But, only label every other tick mark for short videos or every 4th tick mark for long ones
    labelEvery = binLength * (2 if thisLength <= 10 * 60 else 4)
    # And label using MM:SS notation instead of raw number of seconds.
    # Formatting divmod() directly (rather than going through datetime.time) keeps
    # working once the minute count reaches 60.
    xlabels = []
    for tick in xticks:
        if tick % labelEvery == 0:
            xlabels.append("%02i:%02i" % divmod(int(tick), 60))
        else:
            xlabels.append("")
    # Draw the plot and make it pretty
    figTitle = "Stim %i: '%s'\nRaw @ %is" % (stim, stimName, binLength)
    fig = plt.figure()
    ax = fig.gca()
    pruned.plot(figsize = (20, 5), ylim = (0, 100), xlim = (0, thisLength), ax = ax, colormap = colorMap)
    ax.set_xlabel("")
    ax.legend(loc = "upper center", bbox_to_anchor = (0.5, -0.15), ncol = 8, fontsize = "large")
    fig.suptitle(figTitle, fontsize = 18, y = 1.02)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels)
    ax.text(6, 94, "n = %i" % numParticipants, horizontalalignment = "left", verticalalignment = "top", fontdict = {"size": "large"})
    # Save the plot
    fig.savefig("pr_emotion_means/Stim%i_Raw_@%i.png" % (stim, binLength), bbox_inches = "tight")
    # Close (not just clear) the figure: a new one is created on every iteration, and
    # cleared-but-open figures would otherwise pile up in memory
    plt.close(fig)
    