#### **Packages Used:**

In [137]:
import numpy as np
import pandas as pd

#### Fisher Transformation
(https://en.wikipedia.org/wiki/Fisher_transformation)

In [152]:
def fisherTransform(corr):
    # sample size
    m, n = corr.shape
    print ('length:')
    print (n)

    ## Fisher transform all the correlation coefficients
    #F = 0.5*np.log((1+corr)/(1-corr))
    F = np.arctan(corr)
    print ('Fisher transformed:')
    print (F)
    print ('The standard deviation of the Fisher transformed correlation coefficients is: ' + str(F.std()))
    print ('1/sqrt(n-3)=' + str((1/np.sqrt(n-3))))

    ## 95% confidence intervals on the Fisher transform scale
    #LCL = F - 2/np.sqrt(n-3)
    #UCL = F + 2/np.sqrt(n-3)

    ## Convert the intervals back to the correlation scale
    #LCL = (np.exp(2*LCL)-1) / (np.exp(2*LCL)+1)
    #UCL = (np.exp(2*UCL)-1) / (np.exp(2*UCL)+1)

    #CP = np.mean((LCL < r) & (r < UCL))

    #print ('The coverage probability is: ' + CP)
    
    return F

In [153]:
#TEST

testDF = pd.DataFrame(np.random.randn(100, 6), columns=['rda1', 'rda2', 'rda3', 'rda4', 'rda5', 'GT'])

#print(testDF)


corr = testDF.corr(method='pearson')

#print ('full matrix:')
#print (corr)


gtCorr = corr[-1:].drop('GT', axis=1)

#print ('\nGT Corr only')
#print(gtCorr)

avgCorrPearson = gtCorr.mean(axis=1)

#print ('\nAverage GT Corr')
#print (avgCorrPearson)

#print ('Fisher Transformation:')
#fisherTransform(gtCorr)

#### **Load data into numpy array and pandas dataframes:**
* **datasetXX_nTreeRanks:** Ordered n-ary tree ranks for dataset XX (A, B, C, or D)
* **datasetXX_absRanks:** Ordered absolute post-traversal ranks for dataset XX (A, B, C, or D)


In [154]:
rawData = np.loadtxt(fname='disagreement-mturk-raw-ids-rankings-only.csv', delimiter=',', skiprows=1, 
                     dtype=np.dtype([('version', 'i4'), ('responseId', 'S25'), 
                                     ('rDA_1', 'f4'), ('rDA_2', 'f4'), ('rDA_3', 'f4'), ('rDA_4', 'f4'), ('rDA_5', 'f4'), 
                                     ('sDA_1', 'f4'), ('sDA_2', 'f4'), ('sDA_3', 'f4'), ('sDA_4', 'f4'), ('sDA_5', 'f4'), 
                                     ('rDB_1', 'f4'), ('rDB_2', 'f4'), ('rDB_3', 'f4'), ('rDB_4', 'f4'), ('rDB_5', 'f4'), 
                                     ('sDB_1', 'f4'), ('sDB_2', 'f4'), ('sDB_3', 'f4'), ('sDB_4', 'f4'), ('sDB_5', 'f4'), 
                                     ('rDC_1', 'f4'), ('rDC_2', 'f4'), ('rDC_3', 'f4'), ('rDC_4', 'f4'), ('rDC_5', 'f4'), 
                                     ('sDC_1', 'f4'), ('sDC_2', 'f4'), ('sDC_3', 'f4'), ('sDC_4', 'f4'), ('sDC_5', 'f4'), 
                                     ('rDD_1', 'f4'), ('rDD_2', 'f4'), ('rDD_3', 'f4'), ('rDD_4', 'f4'), ('rDD_5', 'f4'), 
                                     ('sDD_1', 'f4'), ('sDD_2', 'f4'), ('sDD_3', 'f4'), ('sDD_4', 'f4'), ('sDD_5', 'f4'), 
                                     ('mTurkCode', 'S25')]))

df = pd.DataFrame(rawData).drop('responseId', axis=1)

datasetA_nTreeRanks = df.ix[df['version']==3].filter(['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5']).T
datasetA_absRanks = df.ix[df['version']==4].filter(['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5']).T

datasetA_overall_ranks = df.filter(['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5']).T
datasetA_overall_sliders = df.filter(['sDA_1', 'sDA_2', 'sDA_3', 'sDA_4', 'sDA_5']).T

#print (df)



#### **Calculate Correlations with Pearson's and Spearman's Methods:**

In [159]:
pearsonA_overall = datasetA_overall_ranks.corr(method='pearson', min_periods=5)
spearmanA_overall = datasetA_overall_ranks.corr(method='spearman', min_periods=5)

#print(pearsonA_overall)

dropCols = [50,51]

# Correlations with n-ary tree ranking
nT_pearsonA_overall = pearsonA_overall[-2:-1].drop(pearsonA_overall.columns[dropCols],axis=1)

print ('\nN-Ary Tree Corr only')
print(nT_pearsonA_overall)

avg_nT_pearsonA_overall = nT_pearsonA_overall.mean(axis=1)

print ('\nAverage N-Ary Tree Corr')
print (avg_nT_pearsonA_overall)

print ('Fisher Transformation for N-ary Tree Ranking:')
fisherTransform(nT_pearsonA_overall)


N-Ary Tree Corr only
     0    1    2    3    4    5    6    7    8    9  ...    40   41   42   43  \
50  0.9  0.0  0.1  1.0  1.0  1.0 -0.8  0.9  0.9  0.9 ...   1.0  0.7  0.5  0.3   

     44   45   46   47   48   49  
50  0.9  1.0  0.9  0.9  0.7  0.9  

[1 rows x 50 columns]

Average N-Ary Tree Corr
50    0.748
dtype: float64
Fisher Transformation for N-ary Tree Ranking:
length:
50
Fisher transformed:
          0    1         2         3         4         5         6         7   \
50  0.732815  0.0  0.099669  0.785398  0.785398  0.785398 -0.674741  0.732815   

          8         9     ...           40        41        42        43  \
50  0.732815  0.732815    ...     0.785398  0.610726  0.463648  0.291457   

          44        45        46        47        48        49  
50  0.732815  0.785398  0.732815  0.732815  0.610726  0.732815  

[1 rows x 50 columns]
The standard deviation of the Fisher transformed correlation coefficients is: 0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
50,0.732815,0.0,0.099669,0.785398,0.785398,0.785398,-0.674741,0.732815,0.732815,0.732815,...,0.785398,0.610726,0.463648,0.291457,0.732815,0.785398,0.732815,0.732815,0.610726,0.732815


In [160]:
rawData = np.loadtxt(fname='disagreement-mturk-raw-ids-rankings-only.csv', delimiter=',', skiprows=1, 
                     dtype=np.dtype([('version', 'i4'), ('responseId', 'S25'), 
                                     ('rDA_1', 'f4'), ('rDA_2', 'f4'), ('rDA_3', 'f4'), ('rDA_4', 'f4'), ('rDA_5', 'f4'), 
                                     ('sDA_1', 'f4'), ('sDA_2', 'f4'), ('sDA_3', 'f4'), ('sDA_4', 'f4'), ('sDA_5', 'f4'), 
                                     ('rDB_1', 'f4'), ('rDB_2', 'f4'), ('rDB_3', 'f4'), ('rDB_4', 'f4'), ('rDB_5', 'f4'), 
                                     ('sDB_1', 'f4'), ('sDB_2', 'f4'), ('sDB_3', 'f4'), ('sDB_4', 'f4'), ('sDB_5', 'f4'), 
                                     ('rDC_1', 'f4'), ('rDC_2', 'f4'), ('rDC_3', 'f4'), ('rDC_4', 'f4'), ('rDC_5', 'f4'), 
                                     ('sDC_1', 'f4'), ('sDC_2', 'f4'), ('sDC_3', 'f4'), ('sDC_4', 'f4'), ('sDC_5', 'f4'), 
                                     ('rDD_1', 'f4'), ('rDD_2', 'f4'), ('rDD_3', 'f4'), ('rDD_4', 'f4'), ('rDD_5', 'f4'), 
                                     ('sDD_1', 'f4'), ('sDD_2', 'f4'), ('sDD_3', 'f4'), ('sDD_4', 'f4'), ('sDD_5', 'f4'), 
                                     ('mTurkCode', 'S25')]))

df = pd.DataFrame(rawData)


datasetA_nTreeRanks = df.ix[df['version']==3].filter(['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5'])
datasetA_absRanks = df.ix[df['version']==4].filter(['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5'])

# dataset A for group 1 - table data
datasetA_group1_ranks = df.ix[df['version']==1].filter(['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5'])
datasetA_group1_sliders = df.ix[df['version']==1].filter(['sDA_1', 'sDA_2', 'sDA_3', 'sDA_4', 'sDA_5'])

# dataset A for group 2 - visualization data
datasetA_group2_ranks = df.ix[df['version']==2].filter(['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5'])
datasetA_group2_sliders = df.ix[df['version']==2].filter(['sDA_1', 'sDA_2', 'sDA_3', 'sDA_4', 'sDA_5'])

# dataset A overall
datasetA_overall_ranks = df.ix[(df['version']==1)|(df['version']==2)].filter(['rDA_1', 'rDA_2', 'rDA_3', 'rDA_4', 'rDA_5'])
datasetA_overall_sliders = df.ix[(df['version']==1)|(df['version']==2)].filter(['sDA_1', 'sDA_2', 'sDA_3', 'sDA_4', 'sDA_5'])

#print (datasetA_nTreeRanks)
#print (datasetA_absRanks)

#print(datasetA_overall_sliders)

#### **Compute Correlations and Average Correlation for each group:**

In [161]:
def computePearson(df, gt):
    '''
    Computes correlation of each row in df (dataframe) with gt (ground truth) values.
    Correlation type identified by 'method.'
    Results added to df in column identified by 'colName.'
    '''
    
    pearsons_stats = dict()
    
    for index, row in df.iterrows():
        r=1.0
        p=25.0
        
        df.iloc[index]['pearsonsNTree_r'] = r
        #df.iloc[index, 'pearsonsNTree_p'] = p
        print (index)
        print (df.iloc[index,'pearsonsNTree_r'])
    return pearson_stats

computePearson(df=datasetA_overall_ranks, gt=datasetA_nTreeRanks)


#print (datasetA_overall_ranks.iloc[1])
#print (datasetA_nTreeRanks.iloc[0])

#r, p = stats.pearsonr(datasetA_overall_ranks.iloc[1],datasetA_nTreeRanks.iloc[0])

#data1 = np.array([5.0, 1.0, 2.0, 4.0, 3.0])
#gt = np.array([4.0, 1.0, 2.0, 5.0, 3.0])

#r, p = stats.pearsonr(data1, gt)

#print(r)
#print(p)


#datasetA_overall_ranks['pearsonsNTree_r'], datasetA_overall_ranks['pearsonsNTree_p'] = datasetA_overall_ranks.apply(stats.pearsonr, axis=1, args=(datasetA_nTreeRanks.iloc[0]))


print (datasetA_overall_ranks)

0


ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types