# WISER SIMS Course Evaluation Comparison

This takes a comma-separated data file (generated from the SIMS database using the query below) and creates bar charts comparing each course's answers to the standard evaluation questions with the answers for all courses. Any course that has responses to the standard evaluation questions for classes in the date range will get a PDF file for each question, one graph per file.
- Author  : John Lutz <lutzjw@upmc.edu>
- Created : 2020-03-03
- Edited : 2020-03-04

## Some Assumptions
- You have an `Analysis/Course-Evaluations` directory in the folder where you are running this Jupyter notebook. This is where the PDF files containing the graphs will go.

## Instructions

- Change the variables in the section below. The ones you will need to always change are:
    - `file`
        - This is the comma-separated data file that is generated from the query below. It contains individual answers to the standard course evaluation questions, one answer per line. Typically we can just collect the data for the previous month and append it to this data file.
    - `startDate` and `endDate`
        - These are the start and end dates for the classes you want to generate the charts for. Somebody may complete an evaluation well after the class has occurred, so there may be some anomalous data over time.
    - `directoryName`
        - This is the name of the directory that will be created in Analysis/Course-Evaluations. The directory will be created if needed. Any existing files will be overwritten if the same file names are generated.

- Once you have made your changes hit the **SHIFT-RETURN** keys together to run the analysis
- Scroll to the bottom to see the results

# Query to pull data
- TODO
    - We'll need to add date ranges around this once we start pulling monthly data.
    - THE DATE SHOULD BE THE EVALUATION DATE, NOT THE CLASS DATE
    

- SQL to run on SIMS data:
    
    
```
-- Returns one row per evaluation answer to one of the three standard
-- questions: course abbreviation and id, class date (YYYY-MM-DD), class id,
-- the answer id (exported as SCORE) and a one-letter question code.
select c.ABBRV COURSE, c.COURSE_ID, to_char(l.CLASS_DATE, 'YYYY-MM-DD') CLASS_DATE,
       l.CLASS_ID, a.EVAL_ANSWER_ID SCORE,
       -- NOTE(review): instr() yields 1 when QUESTION_TEXT starts with the
       -- search word, and '> 1' is false in that case, so such a row would fall
       -- through to 'F'. Confirm the stored text has a leading prefix, or
       -- compare with '> 0' instead.
       -- NOTE(review): the quoted alias "Type" keeps its mixed case, while the
       -- notebook code reads a column named TYPE; confirm the CSV header.
       case when instr(q.QUESTION_TEXT, 'Effectiveness') > 1 then 'E'
            when instr(q.QUESTION_TEXT, 'Likeliness')    > 1 then 'L'
       else 'F' END "Type" --Facilitator
  from EVALUATION_ANSWERS a, EVALUATION_MAIN m, CLASSES l, courses c, ID0_EVAL_QUESTIONS q
 -- Join path: answer -> evaluation -> class -> course, plus answer -> question.
 where a.EVALUATION_ID = m.EVALUATION_ID
   and a.EVAL_QUESTION_ID = q.EVAL_QUESTION_ID
   and m.CLASS_ID = l.CLASS_ID
   and l.COURSE_ID = c.COURSE_ID 
   -- Keep only answers to the question ids listed for the three standard questions.
   -- Effectiveness of the in-person education you received
   and (a.EVAL_QUESTION_ID in (36277, 36336, 36348, 36613, 36858, 36927, 37139, 37180, 37217, 37240, 37284, 37406, 37418, 37724, 37966, 38069, 38108, 38220)
       -- Likeliness of recommending this course to a colleague
        or a.EVAL_QUESTION_ID in (36855, 36924, 37136, 37176, 37192, 37194, 37214, 37237, 37281, 37299, 37403, 37415, 37721, 37831, 37951, 37963, 38066, 38105, 38219)
      -- The facilitator(s) made the educational experience relevant to my training level
        or a.EVAL_QUESTION_ID in (36280, 36339, 36351, 36616, 36861, 36930, 37142, 37183, 37220, 37244, 37287, 37409, 37421, 37727, 37969, 38072, 38111, 38223)
        )
```


In [None]:
# Comma separated data file to analyse. The path is relative to the home
# directory: "~/" is prepended to it before the file is read.
file = "WISER/data/All Course Analysis/course eval data 2020-03-03 compact.csv"

# First and last class date (YYYY-MM-DD) to include; both ends are inclusive.
startDate = '2019-01-01'
endDate   = '2019-12-31'
# Sub-directory of Analysis/Course-Evaluations/ that receives the PDF files.
directoryName = "2019"

####################################################################
#  This is the end of the section where you can change variables   #
####################################################################
%matplotlib inline
import pandas as pd
import math
import os
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Maps the one-letter question code found in the TYPE column to the full
# question text that is printed in each chart title.
typeDict = {
    'E': 'Effectiveness of the in-person education you received',
    'L': 'Likeliness of recommending this course to a colleague',
    'F': 'The facilitator(s) made the educational experience relevant to my training level',
}

# Folder that receives the PDF files. The trailing "/" is kept on purpose:
# the chart file names are appended to this string as-is.
outFilePath = "".join(["Analysis/Course-Evaluations/", directoryName, "/"])
# Create the folder together with any missing parents; an existing folder is
# reused rather than treated as an error.
Path(outFilePath).mkdir(exist_ok=True, parents=True)
print("Creating PDF files in : " + outFilePath)

# Add my home directory to the path and get the base file name
file = "~/" + file
base = os.path.basename(file)
fileName = os.path.splitext(base)[0]

# Load the file into a DataFrame and parse the class dates so they can be
# compared against the configured date range.
df = pd.read_csv(file)
df['CLASS_DATE'] = pd.to_datetime(df['CLASS_DATE'])
# Turn every -999 into NaN (presumably a "no answer" marker - confirm against
# the export). np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.replace(-999, np.nan, inplace=True)
# Keep only the classes held between startDate and endDate, both inclusive.
df2 = df[df.CLASS_DATE >= pd.to_datetime(startDate)]
df3 = df2[df2.CLASS_DATE <= pd.to_datetime(endDate)]

# These are the headers of the five score-count columns of the ce dataframe
# that the plots are generated from (one column per score 1 to 5).
theColumnList = ["the1s","the2s","the3s","the4s","the5s"]

# Build one row per (COURSE, TYPE) pair and one column per distinct SCORE
# value; each cell is the number of non-zero SCORE entries in that group.
pivoted = df3.pivot_table(index=["COURSE", "TYPE"],
                             columns="SCORE",
                             aggfunc={'SCORE':np.count_nonzero}
                            )
# ce will be the compressed, pivoted data. Going through to_records() turns
# the pivot back into a plain table with COURSE and TYPE as ordinary columns.
ce = pd.DataFrame(pivoted.to_records())
ce.fillna(0, inplace=True) # a missing course/score combination means zero answers

# Clean out the header cruft from when we created the pivot table: the text
# "('SCORE', " and the closing ")" are stripped from every column name, which
# presumably leaves just the score value, e.g. "1.0" (see the rename below).
ce.columns = [hdr.replace("('SCORE', ", "").replace(")", "") for hdr in ce.columns]

# Need to rename the columns to alphanumeric names so they can be referenced
# as attributes (ce.the1s, ...) below.
ce.rename(columns={"1.0":"the1s", "2.0":"the2s", "3.0":"the3s", "4.0":"the4s", "5.0":"the5s"}, inplace=True)

# A score that nobody gave (no 1s, for example) has no column yet. Add each
# missing score column filled with zeros so the arithmetic below cannot fail.
# Positions start at 2, i.e. right after the first two columns.
for insertAt, scoreCol in enumerate(theColumnList, start=2):
    if scoreCol in ce.columns:
        continue
    ce.insert(insertAt, scoreCol, 0)

# TOTAL is the number of answers in each row, summed over the five scores.
ce["TOTAL"] = ce.the1s + ce.the2s + ce.the3s + ce.the4s + ce.the5s

# Summary over all courses: one row per question type, holding the five
# score counts followed by their sum.
totalsColumns = theColumnList + ["TOTAL"]
totalsDF = pd.DataFrame(columns=totalsColumns, index=typeDict.keys())

for key in typeDict:
    # Every course row that belongs to this question type.
    rowsOfType = ce[ce.TYPE == key]
    scoreSums = {hdr: rowsOfType[hdr].sum() for hdr in theColumnList}
    for hdr, answerCount in scoreSums.items():
        totalsDF.loc[key, hdr] = answerCount
    totalsDF.loc[key, "TOTAL"] = sum(scoreSums.values())

# Same layout without the TOTAL column, expressed as a percentage of each
# question type's total.
totalPercColumns = theColumnList
totalPercDF = pd.DataFrame(columns=totalPercColumns, index=typeDict.keys())
for key in typeDict:
    typeTotal = totalsDF.loc[key, "TOTAL"]
    for hdr in theColumnList:
        share = totalsDF.loc[key, hdr] / typeTotal
        totalPercDF.loc[key, hdr] = share * 100

# Get the list of courses.
courseList = ce.COURSE.unique()
# Write the per-course counts to a tab separated file.
ce.to_csv('Analysis/Course-Evaluations/test.tsv', sep='\t')

# This is the X axis for the graphs (the five Likert scores).
x = np.array([1, 2, 3, 4, 5])

# Matplotlib 3.6 renamed the seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names, so use whichever name this install offers.
# The style is global, so it is applied once rather than once per chart.
for styleName in ('seaborn-v0_8-deep', 'seaborn-deep'):
    if styleName in plt.style.available:
        plt.style.use(styleName)
        break

plotCnt = 0
courseCnt = 0
for course in courseList:
    for key in typeDict:
        theDF = ce[(ce.COURSE == course) & (ce.TYPE == key)]
        if theDF.empty:
            continue  # this course has no answers for this question

        # ce comes from a pivot on (COURSE, TYPE), so the match is one row.
        # Pull scalars out with .iloc[0]: calling float()/int() on a
        # one-element Series is deprecated in recent pandas.
        courseTotal = theDF["TOTAL"].iloc[0]
        if courseTotal == 0:
            continue  # no 1-5 answers, so the percentages are undefined

        thisCoursePerc = [float(theDF[col].iloc[0] / courseTotal * 100)
                          for col in theColumnList]
        totalCoursePerc = [totalPercDF.loc[key, col] for col in theColumnList]

        # A separate name keeps the loaded data in `df` from being overwritten.
        plotDF = pd.DataFrame({'All Courses': totalCoursePerc,
                               'This Course': thisCoursePerc},
                              index=x)
        plotDF.plot.bar(rot=0, figsize=(8, 8))
        plt.legend(['All Courses n ={0:4.0f}'.format(totalsDF.loc[key, "TOTAL"]),
                    'This Course n ={0:4.0f}'.format(int(courseTotal))],
                   loc='upper left')
        plt.ylim(0, 100)
        plt.ylabel("Percent")
        plt.xlabel("Likert 1-5")
        plt.title(course + "\n" + startDate + " - " + endDate + "\n" + typeDict[key])
        # The on/off flag is passed positionally because its keyword was
        # renamed from `b` to `visible` between Matplotlib releases.
        plt.grid(True, axis='y', color='gray')
        plotCnt += 1

        # A path separator inside the course name would point the file into
        # a directory that does not exist, so replace it.
        safeCourse = course.replace("/", "_").replace("\\", "_")
        outFileStr = (outFilePath + safeCourse + '-' + startDate + '-' + endDate
                      + '-' + key + '.pdf')
        print('.', end='')  # progress marker, one dot per chart
        plt.savefig(outFileStr)
        plt.close()  # release the figure before drawing the next chart
    courseCnt += 1
print(str(plotCnt) + " graphs created for " + str(courseCnt) + " courses.")