#### **Packages Used:**

In [247]:
import numpy as np
import pandas as pd
import math

#### Fisher Transformation
* Background and Formula: https://en.wikipedia.org/wiki/Fisher_transformation
* Implementation Reference: http://dept.stat.lsa.umich.edu/~kshedden/Python-Workshop/stats_calculations.html

In [248]:
def fisherTransform(corr, maxAbsR=None):
    '''Given 1-row matrix of correlation coefficients, computes Fisher Transform matrix of same dimensions

    The Fisher transform is the INVERSE hyperbolic tangent, z = arctanh(r).
    It is unbounded at r = +/-1: a perfect correlation maps to +/-inf, and one
    infinite entry is enough to make a subsequent average infinite as well.

    Parameters
    ----------
    corr : correlation coefficients; anything np.arctanh accepts
        (scalar, numpy array, pandas Series/DataFrame).
    maxAbsR : float, optional
        Upper bound on |r| applied before transforming, e.g. 0.999. Values of
        corr outside [-maxAbsR, maxAbsR] are clipped to that range so the
        result stays finite. None (the default) applies no bound and returns
        plain np.arctanh(corr), including +/-inf for r = +/-1.

    Returns
    -------
    Fisher-transformed values with the same dimensions as corr.

    Raises
    ------
    ValueError
        If maxAbsR is given but does not lie strictly between 0 and 1.
    '''
    # TODO: confidence intervals on the Fisher scale (F +/- 2/sqrt(n-3), mapped
    # back with tanh) and their coverage probability are not implemented yet.
    if maxAbsR is None:
        return np.arctanh(corr)

    if not 0 < maxAbsR < 1:
        raise ValueError('maxAbsR must lie strictly between 0 and 1, got ' + str(maxAbsR))

    # Keep |r| away from 1 so arctanh cannot return +/-inf
    bounded = np.clip(corr, -maxAbsR, maxAbsR)
    return np.arctanh(bounded)

In [249]:
def avgFisherTransform(F):
    '''Averages Fisher Transform values across the columns of each row.

    F is expected to be 2-D (typically a 1-row matrix of Fisher Transform
    values); the result holds one average per row of F.
    '''
    rowAverages = F.mean(axis=1)
    return rowAverages

In [250]:
def fisherStdDev(F):
    '''Standard deviation of Fisher Transform values, computed per row.

    F is expected to be 2-D (typically a 1-row matrix of Fisher Transform
    values). The computation is delegated to F.std(axis=1), so the
    degrees-of-freedom convention is whatever F's own std method defaults to.
    '''
    rowSpread = F.std(axis=1)
    return rowSpread

In [251]:
def fisherStdErr(n):
    '''Given sample size n, returns standard error for Fisher Transform

    The standard error is 1/sqrt(n-3), which only exists for n > 3.

    Parameters
    ----------
    n : sample size; a number or an array of numbers.

    Returns
    -------
    1/sqrt(n-3), with the same shape as n.

    Raises
    ------
    ValueError
        If any sample size is 3 or smaller (the formula would otherwise
        silently produce inf or nan).
    '''
    if np.any(np.asarray(n) <= 3):
        raise ValueError('Fisher standard error requires n > 3, got ' + str(n))
    return 1 / np.sqrt(n - 3)

In [252]:
def convertedAvgR(avgF):
    '''Maps an Average Fisher Transform back to the correlation scale.

    Applies the hyperbolic tangent (the inverse of the Fisher transform) to
    avgF and returns the resulting Average R Correlation Coefficient.
    '''
    avgR = np.tanh(avgF)
    return avgR

#### **Load data into numpy array and pandas dataframes:**
* **datasetXX_nTreeRanks:** Ordered n-ary tree ranks for dataset XX (A, B, C, or D)
* **datasetXX_absRanks:** Ordered absolute post-traversal ranks for dataset XX (A, B, C, or D)


In [253]:
# Read the raw survey export into a structured array: one record per response,
# holding rank (rD?_n) and slider (sD?_n) answers for datasets A-D.
rawData = np.loadtxt(fname='disagreement-mturk-raw-ids-rankings-only.csv', delimiter=',', skiprows=1, 
                     dtype=np.dtype([('version', 'i4'), ('responseId', 'S25'), 
                                     ('rDA_1', 'f4'), ('rDA_2', 'f4'), ('rDA_3', 'f4'), ('rDA_4', 'f4'), ('rDA_5', 'f4'), 
                                     ('sDA_1', 'f4'), ('sDA_2', 'f4'), ('sDA_3', 'f4'), ('sDA_4', 'f4'), ('sDA_5', 'f4'), 
                                     ('rDB_1', 'f4'), ('rDB_2', 'f4'), ('rDB_3', 'f4'), ('rDB_4', 'f4'), ('rDB_5', 'f4'), 
                                     ('sDB_1', 'f4'), ('sDB_2', 'f4'), ('sDB_3', 'f4'), ('sDB_4', 'f4'), ('sDB_5', 'f4'), 
                                     ('rDC_1', 'f4'), ('rDC_2', 'f4'), ('rDC_3', 'f4'), ('rDC_4', 'f4'), ('rDC_5', 'f4'), 
                                     ('sDC_1', 'f4'), ('sDC_2', 'f4'), ('sDC_3', 'f4'), ('sDC_4', 'f4'), ('sDC_5', 'f4'), 
                                     ('rDD_1', 'f4'), ('rDD_2', 'f4'), ('rDD_3', 'f4'), ('rDD_4', 'f4'), ('rDD_5', 'f4'), 
                                     ('sDD_1', 'f4'), ('sDD_2', 'f4'), ('sDD_3', 'f4'), ('sDD_4', 'f4'), ('sDD_5', 'f4'), 
                                     ('mTurkCode', 'S25')]))

# initialize n-ary tree ranking results with headers
nTresults = [['overallCoeff', 'overallFTrans', 'overallConvCoeff']]

# full data frame
df = pd.DataFrame(rawData)

# Utils for all datasets
# Positional column indices removed from the correlation row before averaging.
# NOTE(review): presumably the two ground-truth ranking columns; tied to the
# current row count of the CSV - confirm when the data changes.
overallDropCols = [50,51]
group1DropCols = []
group2DropCols = []

# Column names of the dataset A rank and slider answers
datasetA_rankCols = ['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5']
datasetA_sliderCols = ['sDA_1', 'sDA_2', 'sDA_3', 'sDA_4', 'sDA_5']

# Ground Truth Ranking Datasets
# (.loc with a boolean mask replaces the removed .ix indexer; same row selection)
datasetA_nTreeRanks = df.loc[df['version']==3].filter(datasetA_rankCols).T
datasetA_absRanks = df.loc[df['version']==4].filter(datasetA_rankCols).T


# Dataset A (transposed so that every response becomes a column)
datasetA_overall_ranks = df.filter(datasetA_rankCols).T
datasetA_overall_sliders = df.filter(datasetA_sliderCols).T




#### **Calculate Correlations and Store Results:**
* Pearson's used for Rank data
* Spearman's used for continuous (slider) data
* TODO: Only the n-ary tree ranking is applied so far. The data for the absolute ranking is already loaded, but the row/column indices that select the n-ary tree ranking are currently hard-coded.

In [254]:
def pearsonStats(dataId, overall, group1, group2):
    '''Given a dataset name, overall data set and dataset for each group, returns list of results for pearson method

    Parameters
    ----------
    dataId : dataset name (currently unused).
    overall : DataFrame whose columns are correlated pairwise with Pearson's r.
    group1, group2 : per-group datasets (currently unused).

    Returns
    -------
    list of three items, in the order of the nTresults header row:
    average correlation with the n-ary tree ranking, average Fisher Transform
    of those correlations, and that average converted back to a correlation.

    Relies on the notebook-level overallDropCols list and on the
    fisherTransform / avgFisherTransform / convertedAvgR helpers.
    '''
    # all pairwise correlations between the columns of `overall`
    overallCorr = overall.corr(method='pearson', min_periods=5)

    # correlations with n-ary tree ranking: second-to-last row of the matrix,
    # minus the overallDropCols positions. Column labels come from this call's
    # own matrix, not from a notebook global.
    nTOverall = overallCorr[-2:-1].drop(overallCorr.columns[overallDropCols], axis=1)

    # average n-ary tree correlation
    avgNtOverall = nTOverall.mean(axis=1)

    # average F-transform of n-ary tree metric
    # (a correlation of exactly +/-1 transforms to +/-inf and dominates this average)
    avgF = avgFisherTransform(fisherTransform(nTOverall))

    # converted average correlation coeff
    convAvgR = convertedAvgR(avgF)

    return [avgNtOverall, avgF, convAvgR]
    

In [256]:


# Dataset A Ranks
# NOTE(review): datasetA_group1_ranks / datasetA_group2_ranks are not defined in
# the cells visible here - confirm they are created before this cell runs.
# NOTE: re-running this cell appends another copy of the row to nTresults.
aRanks = pearsonStats('datasetA_ranks', datasetA_overall_ranks, datasetA_group1_ranks, datasetA_group2_ranks)
nTresults.append(aRanks)

pearsonA_overall = datasetA_overall_ranks.corr(method='pearson', min_periods=5)
pearsonA_group1 = datasetA_group1_ranks.corr(method='pearson', min_periods=5)
pearsonA_group2 = datasetA_group2_ranks.corr(method='pearson', min_periods=5)


# Dataset A Sliders
spearmanA_overall = datasetA_overall_sliders.corr(method='spearman', min_periods=5)


# Correlations with n-ary tree ranking (second-to-last row, minus the dropped columns)
nT_pearsonA_overall = pearsonA_overall[-2:-1].drop(pearsonA_overall.columns[overallDropCols],axis=1)

avg_nT_pearsonA_overall = nT_pearsonA_overall.mean(axis=1)

print ('\nOriginal Average N-Ary Tree Corr')
print (avg_nT_pearsonA_overall)

print ('Average Fisher Transformation for N-ary Tree Ranking:')
F = fisherTransform(nT_pearsonA_overall)
avgF = avgFisherTransform(F)
print (avgF)

print ('Converted Avg Corr Coeff')
convAvgCoeff = convertedAvgR(avgF)
print (convAvgCoeff)


Original Average N-Ary Tree Corr
50    0.748
dtype: float64
Average Fisher Transformation for N-ary Tree Ranking:
50    inf
dtype: float64
Converted Avg Corr Coeff
50    1.0
dtype: float64




#### Print Results Table:

In [257]:
# Dump the accumulated results: the header row followed by one
# [average corr, average Fisher Transform, converted corr] entry per append.
# A repeated entry means the appending cell was executed more than once.
print(nTresults)

[['overallCoeff', 'overallFTrans', 'overallConvCoeff'], [50    0.748
dtype: float64, 50    inf
dtype: float64, 50    1.0
dtype: float64], [50    0.748
dtype: float64, 50    inf
dtype: float64, 50    1.0
dtype: float64]]
