In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

%matplotlib inline

### A function that returns the paths of all result files in the output folder

In [2]:
def get_result_file_pathes(output_path):
    """
    Return a list with the path of every entry found in the output folder.

    Each path is the folder name, a '/' separator and the entry name, in the
    order reported by os.listdir (no filtering, no sorting).

    output_path: output folder name
    """
    return [output_path + '/' + entry for entry in os.listdir(output_path)]

### A function that computes precision, recall and F1 scores for a raw result file and returns them as a dataframe.

In [53]:
def process_row_data(input_path):
    """
    Return the result dataframe with per-class and overall scores added.

    The raw file must provide the count columns 'ww', 'wm', 'mm' and 'mw'
    (header names are stripped of surrounding whitespace first).
    NOTE(review): presumably 'xy' counts samples of class x predicted as
    class y -- confirm against the code that writes the raw files.

    Added columns, in order: w_support, m_support, w_precision, w_recall,
    m_precision, m_recall, w_f1, m_f1, precision, recall, f1. The overall
    scores are the support-weighted means of the two per-class scores.
    Missing values (e.g. from 0/0 divisions) are replaced by 0 after every
    stage, in every column of the frame.

    input_path: raw result file's path
    """
    frame = pd.read_csv(input_path)
    frame.columns = frame.columns.str.strip()

    frame['w_support'] = frame['ww'] + frame['wm']
    frame['m_support'] = frame['mm'] + frame['mw']

    # (class prefix, hits, count added for precision, count added for recall)
    count_layout = (('w', 'ww', 'mw', 'wm'), ('m', 'mm', 'wm', 'mw'))
    for prefix, hits, precision_extra, recall_extra in count_layout:
        frame[prefix + '_precision'] = frame[hits] / (frame[hits] + frame[precision_extra])
        frame[prefix + '_recall'] = frame[hits] / (frame[hits] + frame[recall_extra])
    # Fill before computing F1 so undefined precision/recall count as 0.
    frame = frame.fillna(0)

    for prefix in ('w', 'm'):
        class_precision = frame[prefix + '_precision']
        class_recall = frame[prefix + '_recall']
        frame[prefix + '_f1'] = (2 * class_precision * class_recall) / (class_precision + class_recall)
    # Fill again so an undefined F1 (precision + recall == 0) counts as 0.
    frame = frame.fillna(0)

    total_support = frame['w_support'] + frame['m_support']
    for metric in ('precision', 'recall', 'f1'):
        weighted_sum = (frame['w_' + metric] * frame['w_support']) + (frame['m_' + metric] * frame['m_support'])
        frame[metric] = weighted_sum / total_support

    return frame.fillna(0)

### Analysis of the Distribution of Overall Scores

In [54]:
def _plot_score_distributions(results):
    """Draw a 3x3 grid of score histograms (overall, WT and MT rows)."""
    # (column, x-axis label, title, bar colour, normalise to a density?)
    # Only the overall row is normalised, as in the original notebook.
    panels = [
        ('precision', 'precision', 'Overall Precision', 'b', True),
        ('recall', 'recall', 'Overall Recall', 'r', True),
        ('f1', 'F1', 'Overall F1', 'g', True),
        ('w_precision', 'precision', 'WT Precision', 'b', False),
        ('w_recall', 'recall', 'WT Recall', 'r', False),
        ('w_f1', 'f1', 'WT F1', 'g', False),
        ('m_precision', 'precision', 'MT Precision', 'b', False),
        ('m_recall', 'recall', 'MT Recall', 'r', False),
        ('m_f1', 'F1', 'MT F1', 'g', False),
    ]

    plt.figure(figsize=(15, 15))
    for position, (column, xlabel, title, colour, normalise) in enumerate(panels, start=1):
        plt.subplot(3, 3, position)
        # `density` replaces the `normed` keyword, which Matplotlib removed.
        plt.hist(results[column], 50, facecolor=colour, alpha=0.5,
                 density=normalise, range=[0, 1])
        plt.xlabel(xlabel)
        plt.title(title)
        plt.grid(True)
    plt.show()


def analyse_result(file_path, show_distribution, show_voting, f1_threshold=0.8):
    """
    Analyse an aggregated result file.

    file_path: path of a CSV file. Plotting needs the score columns
        (precision, recall, f1 and their w_/m_ variants); voting needs
        'sample_index', 'stype', 'pred' and 'f1'.
    show_distribution: if true, plot histograms of all score columns.
    show_voting: if true, print a per-sample majority vote over the rows'
        'pred' values (0 or 1), once over all rows and once over the rows
        whose f1 is strictly greater than f1_threshold.
    f1_threshold: f1 cut-off for the thresholded vote (default 0.8, the
        value the original code applied; the printed messages used to say
        0.7 and now report the value actually used).

    Returns a list with one boolean per sample (in order of first
    appearance) telling whether the thresholded vote matches the sample's
    label, where 'wt-at' is label 0 and anything else is label 1. A tie is
    decided in favour of 0. Returns None when show_voting is false.
    """
    results = pd.read_csv(file_path)
    results['sample_index'] = results['sample_index'].astype(str)

    if show_distribution:
        _plot_score_distributions(results)

    if not show_voting:
        return None

    correctness = []
    for sample_index in results.sample_index.unique():
        print('sample:', sample_index)
        sample_results = results[results.sample_index == str(sample_index)]

        # The label is taken from the sample's first row.
        raw_label = sample_results.stype.iloc[0]
        label = 0 if raw_label == 'wt-at' else 1
        print('label:', raw_label)

        vote0 = int((sample_results.pred == 0).sum())
        vote1 = int((sample_results.pred == 1).sum())
        prediction = 1 if vote1 > vote0 else 0
        print('votes for type 0 (no threshold): ', vote0)
        print('votes for type 1 (no threshold): ', vote1)
        print('prediction (no threshold): ', prediction)
        print()

        # Second vote: only rows scoring above the threshold take part.
        confident = sample_results[sample_results.f1 > f1_threshold]
        vote0_threshold = int((confident.pred == 0).sum())
        vote1_threshold = int((confident.pred == 1).sum())
        prediction_threshold = 1 if vote1_threshold > vote0_threshold else 0
        print('votes for type 0 (f1>{}): '.format(f1_threshold), vote0_threshold)
        print('votes for type 1 (f1>{}): '.format(f1_threshold), vote1_threshold)
        print('prediction (f1>{}): '.format(f1_threshold), prediction_threshold)
        print('===================================')
        correctness.append(prediction_threshold == label)
    return correctness

### Testing

In [58]:
# input_folder holds the raw result files to score; output_folder receives
# the aggregated CSV written further down. Both are relative to the
# current working directory.
input_folder = 'output_ZRF_2med'
output_folder = 'analysis_ZRF_2med'
# get result file names
result_files = get_result_file_pathes(os.path.join(os.getcwd(), input_folder))

In [59]:
# Score every raw result file; `results` collects one dataframe per file.
results = []
for result_file in result_files:
    name = os.path.basename(result_file)
    # '.DS_Store' is a macOS folder-metadata file, not a result file.
    if name != '.DS_Store':
        print('processing: ' + name)
        results.append(process_row_data(result_file))

processing: ZRF152.csv
processing: ZRF141.csv
processing: ZRF140.csv
processing: ZRF142.csv
processing: ZRF340.csv
processing: ZRF143.csv
processing: ZRF118.csv
processing: ZRF124.csv
processing: ZRF130.csv
processing: ZRF326.csv
processing: ZRF332.csv
processing: ZRF333.csv
processing: ZRF327.csv
processing: ZRF131.csv
processing: ZRF125.csv
processing: ZRF119.csv
processing: ZRF127.csv
processing: ZRF319.csv
processing: ZRF331.csv
processing: ZRF325.csv
processing: ZRF324.csv
processing: ZRF330.csv
processing: ZRF318.csv
processing: ZRF126.csv
processing: ZRF132.csv
processing: ZRF112_1.csv
processing: ZRF136.csv
processing: ZRF122.csv
processing: ZRF334.csv
processing: ZRF320.csv
processing: ZRF321.csv
processing: ZRF335.csv
processing: ZRF123.csv
processing: ZRF137.csv
processing: ZRF121.csv
processing: ZRF112_2.csv
processing: ZRF135.csv
processing: ZRF109.csv
processing: ZRF323.csv
processing: ZRF337.csv
processing: ZRF336.csv
processing: ZRF108.csv
processing: ZRF134.csv
process

In [61]:
# Stack the per-file score tables into a single frame.
aggregated_result = pd.concat(results)
aggregated_result['sample_index'] = aggregated_result['sample_index'].astype(str)
print(aggregated_result.shape)

# Optional step, left disabled: attach each sample's 'stype' from the
# landmark table.
# NOTE(review): analyse_result() reads a 'stype' column; confirm whether the
# aggregated file is expected to carry it.
#landmark_data = pd.read_csv('./data/tidy/landmark_ZRF_w_index.csv')
#aggregated_result_w_type = pd.merge(aggregated_result,landmark_data[['sample_index','stype']],on='sample_index', how='left')

# Flip the predicted class (0 <-> 1). This used to read from
# `aggregated_result_w_type`, which only exists when the merge above is
# enabled, so the cell raised NameError on a fresh kernel.
aggregated_result['pred'] = 1 - aggregated_result['pred']
print(aggregated_result.shape)

# Make sure the destination folder exists before writing.
os.makedirs(output_folder, exist_ok=True)
aggregated_result.to_csv(output_folder + '/ZRF_2med_aggregated.csv')

(11704, 18)
(11704, 18)


In [15]:
# Run the per-sample voting analysis (no histograms) on the aggregated ZRF file.
# NOTE(review): the recorded NameError means `output_folder` was not defined in
# the kernel when this ran; the cell that assigns it must be executed first.
correctness = analyse_result(output_folder + '/ZRF_2med_aggregated.csv', False, True)

NameError: name 'output_folder' is not defined

In [17]:
# Run the per-sample voting analysis (no histograms) on the aggregated AT file.
correctness = analyse_result('./analysis_AT_2med' + '/AT_2med_aggregated.csv', False, True)

sample: 113
label: wt-at
votes for type 0 (no threshold):  21432
votes for type 1 (no threshold):  1672
prediction (no threshold):  0

votes for type 0 (f1>0.7):  10488
votes for type 1 (f1>0.7):  152
prediction (f1>0.7):  0
sample: 107
label: wt-at
votes for type 0 (no threshold):  20824
votes for type 1 (no threshold):  2280
prediction (no threshold):  0

votes for type 0 (f1>0.7):  9576
votes for type 1 (f1>0.7):  912
prediction (f1>0.7):  0
sample: 339
label: mt-at
votes for type 0 (no threshold):  9120
votes for type 1 (no threshold):  13984
prediction (no threshold):  1

votes for type 0 (f1>0.7):  2280
votes for type 1 (f1>0.7):  8360
prediction (f1>0.7):  1
sample: 338
label: mt-at
votes for type 0 (no threshold):  7448
votes for type 1 (no threshold):  15656
prediction (no threshold):  1

votes for type 0 (f1>0.7):  1976
votes for type 1 (f1>0.7):  8968
prediction (f1>0.7):  1
sample: 106
label: wt-at
votes for type 0 (no threshold):  21736
votes for type 1 (no threshold):  13

sample: 143
label: wt-at
votes for type 0 (no threshold):  5016
votes for type 1 (no threshold):  18088
prediction (no threshold):  1

votes for type 0 (f1>0.7):  0
votes for type 1 (f1>0.7):  10032
prediction (f1>0.7):  1
sample: 340
label: mt-at
votes for type 0 (no threshold):  8816
votes for type 1 (no threshold):  14288
prediction (no threshold):  1

votes for type 0 (f1>0.7):  2280
votes for type 1 (f1>0.7):  8208
prediction (f1>0.7):  1
sample: 142
label: wt-at
votes for type 0 (no threshold):  1824
votes for type 1 (no threshold):  21280
prediction (no threshold):  1

votes for type 0 (f1>0.7):  912
votes for type 1 (f1>0.7):  9120
prediction (f1>0.7):  1
sample: 140
label: wt-at
votes for type 0 (no threshold):  19456
votes for type 1 (no threshold):  3648
prediction (no threshold):  0

votes for type 0 (f1>0.7):  9424
votes for type 1 (f1>0.7):  1216
prediction (f1>0.7):  0
sample: 141
label: wt-at
votes for type 0 (no threshold):  21736
votes for type 1 (no threshold):  1368

sample: 135
label: wt-at
votes for type 0 (no threshold):  15960
votes for type 1 (no threshold):  7144
prediction (no threshold):  0

votes for type 0 (f1>0.7):  8208
votes for type 1 (f1>0.7):  2584
prediction (f1>0.7):  0
sample: 121
label: wt-at
votes for type 0 (no threshold):  18088
votes for type 1 (no threshold):  5016
prediction (no threshold):  0

votes for type 0 (f1>0.7):  9576
votes for type 1 (f1>0.7):  760
prediction (f1>0.7):  0
sample: 109
label: wt-at
votes for type 0 (no threshold):  21888
votes for type 1 (no threshold):  1216
prediction (no threshold):  0

votes for type 0 (f1>0.7):  9576
votes for type 1 (f1>0.7):  608
prediction (f1>0.7):  0
sample: 137
label: wt-at
votes for type 0 (no threshold):  19608
votes for type 1 (no threshold):  3496
prediction (no threshold):  0

votes for type 0 (f1>0.7):  9880
votes for type 1 (f1>0.7):  608
prediction (f1>0.7):  0
sample: 123
label: wt-at
votes for type 0 (no threshold):  16112
votes for type 1 (no threshold):  6992

In [18]:
# Number of samples whose thresholded vote did not match the label.
correctness.count(False)

9

In [19]:
# Total number of samples that were voted on.
len(correctness)

78

In [28]:
# Load the aggregated ZRF scores (first CSV column as index) and split the
# frame into 4 consecutive parts; the parts are written out by the next cell.
# NOTE(review): np.array_split on a DataFrame may emit deprecation warnings
# with recent pandas releases -- confirm on the target environment.
aggregate = np.array_split(pd.read_csv('./analysis_ZRF_2med/ZRF_2med_aggregated.csv', index_col=0), 4)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [29]:
# Write each of the four parts to its own numbered CSV file.
for part_number in range(4):
    part_path = './analysis_ZRF_2med/ZRF_2med_aggregated_' + str(part_number) + '.csv'
    aggregate[part_number].to_csv(part_path)

In [50]:
# Repair two sample ids in the aggregated AT file and write it back in place.
aggregated_result = pd.read_csv('./analysis_AT_2med/AT_2med_aggregated.csv', index_col=0)

# Map the ids '1121' / '1122' to '112_1' / '112_2'. The result is assigned
# back to the column explicitly: calling replace(..., inplace=True) on
# `aggregated_result.sample_index` is a chained in-place update, which
# pandas warns about and which does not modify the frame under
# Copy-on-Write.
aggregated_result['sample_index'] = (
    aggregated_result['sample_index']
    .astype(str)
    .replace({'1121': '112_1', '1122': '112_2'})
)
print(aggregated_result.shape)
print(aggregated_result.head())

# Overwrites the file that was read above.
aggregated_result.to_csv('./analysis_AT_2med/AT_2med_aggregated.csv')

(11856, 18)
         f1  landmark_index  m_f1  m_precision  m_recall  m_support  mm  mw  \
0  0.385027               1   0.0          0.0       0.0         35   0  35   
1  0.385027               2   0.0          0.0       0.0         35   0  35   
2  0.385027               3   0.0          0.0       0.0         35   0  35   
3  1.000000               4   1.0          1.0       1.0         35  35   0   
4  1.000000               5   1.0          1.0       1.0         35  35   0   

   precision  pred    recall sample_index      w_f1  w_precision  w_recall  \
0   0.297521     0  0.545455          113  0.705882     0.545455       1.0   
1   0.297521     0  0.545455          113  0.705882     0.545455       1.0   
2   0.297521     0  0.545455          113  0.705882     0.545455       1.0   
3   1.000000     0  1.000000          113  1.000000     1.000000       1.0   
4   1.000000     0  1.000000          113  1.000000     1.000000       1.0   

   w_support  wm  ww  
0         42   0  42 

In [63]:
# Load one raw AT result file (first column as index), tidy the header names
# and flip the predicted class (0 <-> 1).
AT = pd.read_csv('../5.output_AT_2med/AT330.csv', index_col=0)
AT.columns = [column_name.strip() for column_name in AT.columns]
AT['pred'] = AT['pred'].rsub(1)

In [64]:
# Preview the first rows of the AT frame.
AT.head()

Unnamed: 0_level_0,landmark_index,pred,ww,wm,mm,mw
sample_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
330,1,0,0,35,43,0
330,2,0,0,35,43,0
330,3,0,0,35,43,0
330,4,1,35,0,43,0
330,5,1,35,0,43,0


In [65]:
# Swap the count column names between the two classes (ww <-> mm, wm <-> mw);
# only the headers change, the values stay in place.
# NOTE(review): presumably this keeps the counts consistent with the flipped
# `pred` values -- confirm.
AT = AT.rename(columns={'ww': 'mm', 'wm': 'mw', 'mm': 'ww', 'mw': 'wm'})
AT.head()

Unnamed: 0_level_0,landmark_index,pred,mm,mw,ww,wm
sample_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
330,1,0,0,35,43,0
330,2,0,0,35,43,0
330,3,0,0,35,43,0
330,4,1,35,0,43,0
330,5,1,35,0,43,0


In [66]:
# Overwrites the file AT was loaded from, so re-running the load / flip /
# rename cells afterwards would flip and swap the data a second time.
AT.to_csv('../5.output_AT_2med/AT330.csv')

In [3]:
# Load the aggregated AT results and preview the first rows.
df = pd.read_csv('../7.aggregatedResults/AT_2med.csv')
df.head()

Unnamed: 0,type0_f1,type0_precision,type0_recall,type0_support,type1_f1,type1_precision,type1_recall,type1_support,f1,landmark_index,type1_1,type1_0,precision,pred,recall,sample_index,type0_1,type0_0
0,0.705882,0.545455,1.0,42,0.0,0.0,0.0,35,0.385027,1,0,35,0.297521,0,0.545455,113,0,42
1,0.705882,0.545455,1.0,42,0.0,0.0,0.0,35,0.385027,2,0,35,0.297521,0,0.545455,113,0,42
2,0.705882,0.545455,1.0,42,0.0,0.0,0.0,35,0.385027,3,0,35,0.297521,0,0.545455,113,0,42
3,1.0,1.0,1.0,42,1.0,1.0,1.0,35,1.0,4,35,0,1.0,0,1.0,113,0,42
4,1.0,1.0,1.0,42,1.0,1.0,1.0,35,1.0,5,35,0,1.0,0,1.0,113,0,42


In [20]:
# Distinct landmark indices in ascending order. Sorting makes the order of the
# aggregate rows built from this list reproducible; a plain set gives no
# ordering guarantee.
landmarks = sorted(df.landmark_index.unique())

# Columns that are averaged per landmark.
cols = ['type0_f1',
 'type0_precision',
 'type0_recall',
 'type0_support',
 'type1_f1',
 'type1_precision',
 'type1_recall',
 'type1_support',
 'f1',
 'landmark_index',
 'type1_1',
 'type1_0',
 'precision',
 'recall',
 'type0_1',
 'type0_0']

In [26]:
# Add one 'aggregated' row per landmark holding the mean of every column in
# `cols` over that landmark's rows; columns outside `cols` stay empty (NaN).
# Rows already marked 'aggregated' are dropped first, so re-running the cell
# neither averages over them nor duplicates them.
per_sample_rows = df[df.sample_index != 'aggregated']

aggregated_rows = []
for l in landmarks:
    landmark_rows = per_sample_rows[per_sample_rows.landmark_index == l]
    new_row = {'sample_index': 'aggregated'}
    for col in cols:
        new_row[col] = landmark_rows[col].mean()
    aggregated_rows.append(new_row)

# DataFrame.append is gone from current pandas; build all rows, concatenate once.
df = pd.concat([per_sample_rows, pd.DataFrame(aggregated_rows)], ignore_index=True)

In [27]:
# Inspect the last rows, i.e. the most recently added 'aggregated' rows.
df.tail()

Unnamed: 0,type0_f1,type0_precision,type0_recall,type0_support,type1_f1,type1_precision,type1_recall,type1_support,f1,landmark_index,type1_1,type1_0,precision,pred,recall,sample_index,type0_1,type0_0
12155,1.0,1.0,1.0,42.487179,1.0,1.0,1.0,34.628205,1.0,148.0,34.628205,0.0,1.0,,1.0,aggregated,0.0,42.487179
12156,1.0,1.0,1.0,42.487179,1.0,1.0,1.0,34.628205,1.0,149.0,34.628205,0.0,1.0,,1.0,aggregated,0.0,42.487179
12157,1.0,1.0,1.0,42.487179,1.0,1.0,1.0,34.628205,1.0,150.0,34.628205,0.0,1.0,,1.0,aggregated,0.0,42.487179
12158,1.0,1.0,1.0,42.487179,1.0,1.0,1.0,34.628205,1.0,151.0,34.628205,0.0,1.0,,1.0,aggregated,0.0,42.487179
12159,1.0,1.0,1.0,42.487179,1.0,1.0,1.0,34.628205,1.0,152.0,34.628205,0.0,1.0,,1.0,aggregated,0.0,42.487179


In [28]:
# List the column names of the frame.
list(df)

['type0_f1',
 'type0_precision',
 'type0_recall',
 'type0_support',
 'type1_f1',
 'type1_precision',
 'type1_recall',
 'type1_support',
 'f1',
 'landmark_index',
 'type1_1',
 'type1_0',
 'precision',
 'pred',
 'recall',
 'sample_index',
 'type0_1',
 'type0_0']

In [10]:
# Load the renamed aggregated AT results and show column dtypes / non-null counts.
result = pd.read_csv('../7.aggregatedResults/AT_2med_renamed_2.csv')
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12008 entries, 0 to 12007
Data columns (total 19 columns):
sample_index         12008 non-null object
landmark_index       12008 non-null int64
type                 11856 non-null object
pred                 12008 non-null object
type0_0              12008 non-null float64
type0_1              12008 non-null float64
type1_0              12008 non-null float64
type1_1              12008 non-null float64
type0_precision      12008 non-null float64
type0_recall         12008 non-null float64
type0_f1             12008 non-null float64
type0_num            12008 non-null float64
type1_precision      12008 non-null float64
type1_recall         12008 non-null float64
type1_f1             12008 non-null float64
type1_num            12008 non-null float64
overall_precision    12008 non-null float64
overall_recall       12008 non-null float64
overall_f1           12008 non-null float64
dtypes: float64(15), int64(1), object(3)
memory usage: 1.7+ 

In [2]:
# Load the AT landmark input data and preview the first rows.
# NOTE: this rebinds `df`, which other cells use for the aggregated results.
df = pd.read_csv('../3.InputData/final/landmark_AT_filled_w_2median.csv')
df.head()

Unnamed: 0,sample_index,pts,r,stype,landmark_index
0,1,0,0.0,mt-at,1
1,101,0,0.0,wt-at,1
2,102,0,0.0,wt-at,1
3,103,0,0.0,wt-at,1
4,104,0,0.0,wt-at,1


In [3]:
# Draw a 10% sample of the rows with replacement. The fixed random_state makes
# the displayed sample reproducible across kernel restarts.
df.sample(frac=0.1, replace=True, random_state=0)

Unnamed: 0,sample_index,pts,r,stype,landmark_index
4475,128,6251,4.151089,wt-at,58
7062,141,8414,9.091117,wt-at,91
11637,114,0,31.192271,wt-at,150
627,103,1,2.088584,wt-at,9
3570,326,866,14.366965,mt-at,46
6625,339,30,1.047009,mt-at,85
32,131,0,0.000000,wt-at,1
8914,121,11730,5.269774,wt-at,115
6086,102,7937,3.891096,wt-at,79
5182,133,1767,11.712533,wt-at,67
