# WISER SIMS Data Extraction Question Distribution Analysis

This takes a standard SIMS Data Extraction file, creates a distribution of the answers for a selected question, and plots it.
- Author  : John Lutz <lutzjw@upmc.edu>
- Created : 2020-02-25
- Edited : 2020-02-25

## Some Assumptions
- You are using an unaltered SIMS data extraction file
- The question must be a multiple choice question of some type (Likert, Selection, etc)
- You have an `Analysis/Q-Distribution` directory in the folder where you are running this Jupyter notebook. This is where the analysis files will go.

## Instructions

- Change the variables in the section below. The ones you will need to always change are:
    - `file`
        - Get the Data Extraction file from SIMS
            - This is the Excel file straight from SIMS Data Extraction
            - You need to have selected "Correctness" for the "Quiz Responses" when you generate the file
            - Drag it from your computer and drop it into the file browser in Jupyter (just to the left here)
            - Right click on the file and select "Copy path" from the data file you want and paste it into the `file` variable below
    - `qID1`
        - This is the column (question ID) whose answers you want to plot. It should be a multiple choice question (e.g. a Likert scale).
        - Make sure there are double quotes around the ID: "Q23"
        - Don't forget the asterisk if applicable for retired questions (e.g. "*Q24")
        - _These need to be on the same scale for this to make sense_
    - `likertMax`
        - This is the maximum value you can select for this Likert scale (typically 5 or 9)

- Once you have made your changes hit the **SHIFT-RETURN** keys together to run the analysis
- Scroll to the bottom to see the results

In [None]:
# Base folder; it is prefixed to `file` below to build the full path of the workbook.
myJupyterLocation = '/Users/johnlutz/Library/Mobile Documents/com~apple~CloudDocs/Code/Jupyter Notebooks/'
# SIMS Data Extraction workbook, given relative to myJupyterLocation.
file = "WISER/data/FDBK MNTR STUD/FDBK MNTR STUD 2020-05-04.xlsx"
# Put the Question ID to analyze here (a value from the "Q#" column of the 'Question Dictionary' sheet).
qID1 = "Q12"   # Don't forget the '*' if needed!
likertMax = 5  # Top of the answer scale; answers 1..likertMax are charted unless the question's answer Type is "Specific".
fillNaNsInAnswers = True # Fill NaN in the answer bars with 0s. Good for likerts and if you really need to show all possible answers.

xLabelRotation = 0  # Rotation of the x tick labels in degrees; 90 is straight up and down.
bottomMargin   = 0.1  # Bottom margin passed to subplots_adjust; 0.35 is pretty far down.
# Alternative title lines; leave exactly one `titleLine` assignment active ("" means no extra line).
#titleLine = ""
titleLine = '1=Not at all, 3=Moderately, 5=Extremely' # Put underneath the Question in the title.
#titleLine = '1=Not at all, 3=Somewhat, 5=Completely Agree' # Put underneath the Question in the title.
####################################################################
#  This is the end of the section where you can change variables   #
####################################################################
%matplotlib inline
import pandas as pd
#from scipy import stats
import math
import os
#from dateutil import parser
import matplotlib.pyplot as plt
import numpy as np
#This is the function that will put the values on the top of the bars
#from : http://composition.al/blog/2015/11/29/a-better-way-to-add-labels-to-bar-charts-with-matplotlib/
def autolabel(rects, ax):
    """Write each bar's height, truncated to an integer, as a text label.

    rects : iterable of bar rectangles (objects offering get_height(),
            get_x() and get_width()), e.g. the container returned by ax.bar().
    ax    : axes the bars live on; its y-limits scale the label offsets and
            the labels are drawn with ax.text().

    Bars whose height is NaN are left unlabeled.
    """
    lower, upper = ax.get_ylim()
    axis_span = upper - lower

    for bar in rects:
        bar_top = bar.get_height()

        # Share of the y-axis range covered by this bar.
        filled = bar_top / axis_span

        # A bar that (almost) reaches the top of the axes has no room above
        # it, so its label is tucked just inside; otherwise it sits just
        # above. The 98% cut-off is an arbitrary, eyeballed threshold.
        text_y = (bar_top - (axis_span * 0.05)) if filled > 0.98 \
            else (bar_top + (axis_span * 0.01))

        # Nothing sensible to print for a missing value.
        if math.isnan(bar_top):
            continue

        center_x = bar.get_x() + bar.get_width() / 2.
        ax.text(center_x, text_y, '%d' % int(bar_top),
                ha='center', va='bottom')

# ---------------------------------------------------------------------------
# Main analysis: read the SIMS workbook, count how often each answer to
# question `qID1` was given, print the table, draw it as a bar chart and save
# the chart as a PDF under Analysis/Q-Distribution/.
# ---------------------------------------------------------------------------

#set the file label for this chart
fileLabel = qID1

#Add my home directory to the path and get the base file name
file = os.path.join(myJupyterLocation, file)
base = os.path.basename(file)
fileName = os.path.splitext(base)[0]
chartTitle = fileName

# Open up the Excel File, read the three sheets we need, and close it again.
with pd.ExcelFile(file) as xl:
    qSheet = xl.parse('Question Dictionary', index_col="Q#")
    aSheet = xl.parse('Answer Dictionary', index_col="QuesID")
    ds = xl.parse('User')  # The User sheet holds the data by default.

#Find the Question Text
q1Text = qID1 + " - " + qSheet.loc[qID1].Text
q1SIMSID = qSheet.loc[qID1].QuesID

#Find the Answer text from the dictionary and create a list of them.
# Match the QuesID exactly: a regex/substring match on e.g. "12" would also
# pull in the answers of questions 112, 120, 1012, ...
df = aSheet[aSheet.index.astype(str) == str(q1SIMSID)].filter(axis=1, items=['AnsID', 'Text', 'Type'])
if df.empty:
    raise ValueError("No entries for QuesID {} (question {}) in the 'Answer Dictionary' sheet".format(q1SIMSID, qID1))

if df['Type'].iloc[0] == "Specific":  # This has specific items for answers not 1-X radio buttons
    adf = pd.DataFrame(df['Text'].values, index=df['AnsID'].values, columns=['Answer'])
else:
    # Index the rows by the answer value itself (1..likertMax) so the counts
    # assigned below line up with the right answer. A default 0-based index
    # would shift every count one bar to the right and drop the top answer.
    # NOTE(review): assumes the User sheet stores the selected value as
    # 1..likertMax - confirm against a real extraction.
    answerValues = list(range(1, likertMax + 1))
    adf = pd.DataFrame({'Answer': answerValues}, index=answerValues)

# Turn -999 into NaN so dropna() discards it. The replacement value must be
# given explicitly: without it pandas either forward-fills from the previous
# row (counting it as someone else's answer) or, in newer versions, refuses.
# NOTE(review): -999 is presumably the "no answer" marker in the extraction.
q1Results = ds[qID1].replace(-999, np.nan).dropna()

# value_counts() is indexed by answer value; the assignment aligns on adf's index.
adf['Counts'] = q1Results.value_counts()

if fillNaNsInAnswers:
    # Answers nobody picked have no count; show them as 0 in the graph.
    # Only the Counts column is filled so answer labels are never altered.
    adf['Counts'] = adf['Counts'].fillna(0)

print(adf)
total = adf['Counts'].sum()
print ("Total : " +str(total))

#Start the plotting...
fig, ax = plt.subplots()  # Create a figure and an axes.
fig.subplots_adjust(bottom=bottomMargin) #make it cleaner and space for the Xticklabels if rotated

#make the bar chart with X=Answer and Y=Counts
rects1 = ax.bar(adf['Answer'], adf['Counts'])
#set the chart size (in inches)
fig.set_size_inches(9,9)
# Label the scale with the configured maximum instead of a fixed "1-5".
ax.set_xlabel("\nLikert 1-{}\nn={}".format(likertMax, int(total)))
chartTitle += "\n" +q1Text
if titleLine != "":
    chartTitle += "\n" +titleLine
ax.set_title(chartTitle)
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=xLabelRotation)
autolabel(rects1, ax)

outFileStr = 'Analysis/Q-Distribution/' +fileName+'-Q-Distribution-'+fileLabel+'.pdf'
# Create the output folder on first use so savefig cannot fail on a missing directory.
os.makedirs(os.path.dirname(outFileStr), exist_ok=True)
print ('Output File : ' +outFileStr)
plt.savefig(outFileStr)
