
Commit

update
jbkoh committed Jun 21, 2018
1 parent 486037a commit 6d57f0a
Showing 9 changed files with 256 additions and 28 deletions.
1 change: 1 addition & 0 deletions plastering/inferencers/inferencer.py
@@ -1,4 +1,5 @@
import os
import time
import pdb
import random
from copy import deepcopy
16 changes: 9 additions & 7 deletions plastering/inferencers/scrabble_new.py
@@ -185,32 +185,34 @@ def apply_filter_by_zodiac(self, pred):
triple = (BASE[srcid], RDF.type, BRICK[point_tagset])
if self.prior_confidences[triple] > 0.8:
self.zodiac_good_preds[srcid] = point_tagset
fixed_cnt = 0
for srcid, pred_tagsets in pred.items():
pred_point_tagset = sel_point_tagset(pred_tagsets, srcid)
good_point_tagset = self.zodiac_good_preds.get(srcid, None)
if not good_point_tagset:
continue
if not self.is_same_tagset(pred_point_tagset, good_point_tagset):
pred_tagsets = [tagset for tagset in pred_tagsets
if self.is_same_tagset(tagset,
pred_point_tagset)]
if not is_point_tagset(tagset)]
pred_tagsets.append(good_point_tagset)
print('FIXED {0}, {1} -> {2}'.format(srcid,
pred_point_tagset,
good_point_tagset))
fixed_cnt += 1
pred[srcid] = pred_tagsets
print('TOTAL_FIXED_POINTS: {0}'.format(fixed_cnt))
return pred

def select_informative_samples(self, sample_num=10):
# Use prior (e.g., from Zodiac.)
new_srcids = []
if self.apply_validating_samples:
new_srcids += self.apply_prior_zodiac(sample_num)
#if self.apply_validating_samples:
# new_srcids += self.apply_prior_zodiac(sample_num)
if len(new_srcids) < sample_num:
new_srcids += self.scrabble.select_informative_samples(
sample_num * 3 - len(new_srcids))
new_srcids = [srcid for srcid in new_srcids
if srcid not in self.zodiac_good_preds][0:sample_num]
sample_num - len(new_srcids))
#new_srcids = [srcid for srcid in new_srcids
# if srcid not in self.zodiac_good_preds][0:sample_num]
return new_srcids


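For readers skimming the diff, a minimal, self-contained sketch of the filtering idea in apply_filter_by_zodiac follows. The is_point_tagset helper and the zodiac_good_preds mapping here are illustrative stand-ins for the class attributes used above (the real code goes through self.is_same_tagset and the prior-confidence check); this is a sketch of the technique, not the project's implementation.

def is_point_tagset(tagset):
    # Assumption for this sketch: a point tagset ends in a point-class suffix.
    return tagset.split('_')[-1] in {'sensor', 'setpoint', 'status', 'command', 'alarm'}

def filter_by_zodiac(pred, zodiac_good_preds):
    # Replace each entity's point tagset with the high-confidence Zodiac
    # prediction while leaving its non-point tagsets untouched.
    fixed_cnt = 0
    for srcid, pred_tagsets in pred.items():
        good_point_tagset = zodiac_good_preds.get(srcid)
        if good_point_tagset is None:
            continue
        if good_point_tagset not in pred_tagsets:
            pred_tagsets = [t for t in pred_tagsets if not is_point_tagset(t)]
            pred_tagsets.append(good_point_tagset)
            fixed_cnt += 1
        pred[srcid] = pred_tagsets
    print('TOTAL_FIXED_POINTS: {0}'.format(fixed_cnt))
    return pred

# Example: the predicted point tagset is overridden by the trusted Zodiac label.
pred = {'rm101': ['zone_temperature_sensor', 'hvac_zone']}
print(filter_by_zodiac(pred, {'rm101': 'zone_air_temperature_sensor'}))
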
6 changes: 4 additions & 2 deletions plastering/inferencers/zodiac_new.py
@@ -82,6 +82,10 @@ def __init__(self,
sample_num_list = config['sample_num_list']
else:
sample_num_list = [0] * (len(source_buildings) + 1) # +1 for target
if 'use_quiver' in config:
self.use_quiver = config['use_quiver']
else:
self.use_quiver = False
if len(self.source_buildings) > len(sample_num_list):
sample_num_list.append(0)

@@ -301,8 +305,6 @@ def calc_prior_g_acc(self):
acc += 1
acc = 0 if not cnt else acc / cnt
print('Accuracy: {0}'.format(acc))
pdb.set_trace()


def apply_prior_augment_samples(self):
prior_preds = {}
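The new use_quiver option follows the same read-or-default pattern as the other config keys. A hedged sketch of the equivalent logic using dict.get, with made-up config values for illustration:

source_buildings = ['ebu3b']                      # illustrative value
config = {'sample_num_list': [10, 0], 'use_quiver': True}

# Equivalent to the if/else blocks added in the diff: use the configured
# value when present, otherwise fall back to a default.
sample_num_list = config.get('sample_num_list', [0] * (len(source_buildings) + 1))
use_quiver = config.get('use_quiver', False)      # defaults to False, as above
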
141 changes: 137 additions & 4 deletions result/arka/parse_result.py
@@ -1,17 +1,150 @@
import os
import sys
import pdb
import re
from copy import deepcopy
from operator import itemgetter
import json

import pandas as pd

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, dir_path + '/../..')
from plastering.metadata_interface import *
from plastering.evaluator import *

target_building = 'sdh'
currfile = __file__
base_dir = os.path.dirname(currfile)
target_dir = base_dir + '/' + target_building
cluster_sizes = {}


def get_number(s):
return int(re.findall('\\d+', s)[0])

def is_finished():
for cid, curr_eid in curr_eids.items():
if curr_eid < len(qualified_examples_nums[cid]) - 1:
return False
return True

def select_next_cid():
ordered_cids = [row[0] for row in
sorted(curr_cluster_sizes.items(),
key=itemgetter(1),
reverse=True)]
for cid in ordered_cids:
curr_eid = curr_eids[cid]
if curr_eid < len(qualified_examples_nums[cid]) - 1:
return cid
raise Exception('cannot find cids without finishing the algorithm. A bug')

def get_srcid(name):
return '_'.join(re.findall('[a-zA-Z0-9]+', name))

orig_cluster_sizes = {}
total_names = []
for filename in os.listdir(target_dir):
if not re.match('{0}-ORIGINAL-METADATA-\\d+$'.format(target_building.upper()),
filename):
continue
cid = get_number(filename)
with open(target_dir + '/' + filename, 'r') as fp:
names = fp.readlines()
orig_cluster_sizes[cid] = len(names)
total_names += names
total_names = list(set(total_names))
total_srcids = [get_srcid(name) for name in total_names]
curr_cluster_sizes = deepcopy(orig_cluster_sizes)

true_tagsets = {srcid: LabeledMetadata.objects(srcid=srcid).first().tagsets
for srcid in total_srcids}
true_points = {srcid: LabeledMetadata.objects(srcid=srcid).first().point_tagset
for srcid in total_srcids}

qualified_examples_nums = {}
for filename in os.listdir(target_dir):
if not re.match('l-ex-\\d+-out$', filename):
continue
cid = get_number(filename)
df = pd.read_csv(target_dir + '/' + filename)
df.columns = df.columns.str.strip()
cluster_id = int(re.findall('\\d+', filename)[0])
coverages = df['fullyQualified'].tolist()
pdb.set_trace()
coverages = df['Num Examples Thought to be fully qualified'].tolist()
qualified_examples_nums[cid] = coverages


inferred_points_dict = {i: {} for i in curr_cluster_sizes.keys()}
for filename in os.listdir(target_dir):
if not re.match('l-ex-\\d+-out-points-qualified$', filename):
continue
cid = get_number(filename)
with open(target_dir + '/' + filename, 'r') as fp:
lines = fp.readlines()
for line in lines:
ex_id = int(line.split(' ')[0])
if "'" not in line:
items = []
else:
items = line.split('[')[-1].split(']')[0][1:-1].split("', '")
inferred_points_dict[cid][ex_id] = items

pred = {}

curr_eids = {i: 0 for i in curr_cluster_sizes.keys()}


total_num = sum(orig_cluster_sizes.values())

pred_names = set()
cnt = 0
accs = []
f1s = []
mf1s = []
anymf1s = []
srcids = []
pred = {srcid: [] for srcid in total_srcids}
point_pred = {srcid: [] for srcid in total_srcids}
res = []

while not is_finished():
# select cluster
#max_cid = max(curr_cluster_sizes.items(), key=itemgetter(1))[0]
cnt += 1
max_cid = select_next_cid()
curr_eids[max_cid] += 1
curr_eid = curr_eids[max_cid]
found_names = set(inferred_points_dict[max_cid][curr_eid])
new_names = found_names - pred_names
new_srcids = [get_srcid(name) for name in new_names]
pred_names = pred_names.union(new_names)
curr_cluster_sizes[max_cid] = orig_cluster_sizes[max_cid] - len(found_names)
acc = len(pred_names) / total_num
print('{0}\tacc: {1}'.format(cnt, acc))
pred.update({srcid: LabeledMetadata.objects(srcid=srcid).first().tagsets
for srcid in new_srcids})
point_pred.update({
srcid: LabeledMetadata.objects(srcid=srcid).first().point_tagset
for srcid in new_srcids})
anymf1 = get_macro_f1(true_tagsets, pred)
mf1 = get_macro_f1(true_points, point_pred)
f1 = get_micro_f1(true_points, point_pred)
#mf1s.append(mf1)
#f1s.append(f1)
#anymf1s.append(anymf1)
#accs.append(acc)
#srcids.append(len(pred_names))
row = {
'metrics': {
'f1': f1,
'macrof1': mf1,
'accuracy': acc,
'macrof1-all': anymf1
},
'learning_srcids': cnt
}
res.append(row)


with open('result/pointonly_notransfer_arka_{0}_0.json'.format(target_building),
'w') as fp:
json.dump(res, fp)
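
For reference, each record the script appends to res has the shape sketched below (the metric values are made up), and the dumped file can be read back the same way. The path mirrors the one written above with target_building = 'sdh'.

import json

# One record per iteration of the while-loop above; learning_srcids is the
# loop counter cnt.
example_row = {
    'metrics': {
        'f1': 0.71,          # micro-F1 over point tagsets
        'macrof1': 0.54,     # macro-F1 over point tagsets
        'accuracy': 0.63,    # fraction of names covered so far
        'macrof1-all': 0.49  # macro-F1 over all tagsets
    },
    'learning_srcids': 42
}

with open('result/pointonly_notransfer_arka_sdh_0.json') as fp:
    res = json.load(fp)
micro_f1s = [row['metrics']['f1'] for row in res]
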
2 changes: 1 addition & 1 deletion scripts/exp_scrabble_zodiac.py
@@ -68,6 +68,6 @@
'metrics': hist['metrics'],
'learning_srcids': len(hist['total_training_srcids'])
} for hist in workflow.history]
with open('result/scrabble_zodiac_{0}_{1}.json'
with open('result/scrabble_zodiac_{0}_{1}_onlyfiltering.json'
.format(target_building, exp_id), 'w') as fp:
json.dump(history, fp)
3 changes: 2 additions & 1 deletion scripts/exp_zodiac.py
@@ -1,6 +1,7 @@
import sys, os
import pdb
import json
os.environ['TRIPLE_STORE_TYPE'] = "rdflib"
dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, dir_path + '/..')
#sys.path.append(os.path.abspath(os.path.join(dir_path + '/..', 'config')))
@@ -10,7 +11,7 @@
from plastering.metadata_interface import *
import pdb

EXP_NUM = 4
EXP_NUM = 2

target_building = sys.argv[1]
try:
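The added TRIPLE_STORE_TYPE assignment sits before the plastering imports, presumably because the backend choice is read when those modules are imported. A minimal sketch of that ordering; the fallback value shown is an assumption, not the library's documented default.

import os

# 1. Pick the triple-store backend first ...
os.environ['TRIPLE_STORE_TYPE'] = 'rdflib'

# 2. ... then import modules that may read the variable at import time,
#    e.g. `from plastering.metadata_interface import *` as in the script above.

# A module reading it at import time would look roughly like this:
backend = os.environ.get('TRIPLE_STORE_TYPE', 'rdflib')  # assumed default
print(backend)
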
91 changes: 87 additions & 4 deletions scripts/result_drawer.py
@@ -182,6 +182,8 @@ def plot_entities():
if target_building != 'sdh':
continue
exp_num = 1
elif inferencer_name == 'scrabble':
exp_num = 2
else:
exp_num = EXP_NUM
# Notransfer
@@ -407,8 +409,8 @@ def plot_scrabble_zodiac():
fig, ax = plt.subplots(1, 1)
xticks, xticks_labels, yticks, yticks_labels, xlim, ylim, interp_x, \
xlabel, ylabel, linestyles, xtickRotate = get_grid_params(
ymin = 0, ymax = 40, ydelta = 5,
xmin = 10, xmin2=50, xmax = 150, xdelta=50)
ymin = 0, ymax = 35, ydelta = 5,
xmin = 10, xmin2=50, xmax = 250, xdelta=50)
ylabel = 'Count'
# Baseline (Naive Zodiac)
with open('result/scrabble_zodiac.json', 'r') as fp:
@@ -435,6 +437,86 @@ def plot_quiver_zodiac():
fig.set_size_inches((1.5,1.7))
save_fig(fig, outputfile)

def plot_ba_zodiac():
EXP_NUM = 2
building = 'ebu3b'
outputfile = FIG_DIR + '/ba_zodiac.pdf'
fig, ax = plt.subplots(1, 1)
xticks, xticks_labels, yticks, yticks_labels, xlim, ylim, interp_x, \
xlabel, ylabel, linestyles, xtickRotate = get_grid_params()

title = building_anon_map[building]

# Baseline (Naive Zodiac)
xs = []
ys = []
xss = []
f1s = []
mf1s = []
for i in range(0, EXP_NUM):
with open('result/pointonly_notransfer_zodiac_{0}_{1}.json'
.format(building, i)) as fp:
data = json.load(fp)
xss.append([datum['learning_srcids'] for datum in data])
f1s.append([datum['metrics']['f1'] for datum in data])
mf1s.append([datum['metrics']['macrof1'] for datum in data])
xs = xss[0] # Assuming all xss are same.
f1 = average_data(xss, f1s, interp_x)
mf1 = average_data(xss, mf1s, interp_x)
x = interp_x
ys = [f1, mf1]
legends = ['MicroF1, {0}'.format('Zodiac'),
'MacroF1, {0}'.format('Zodiac')
]

_, plots = plotter.plot_multiple_2dline(
x, ys, xlabel, ylabel, xticks, xticks_labels,
yticks, yticks_labels, title, ax, fig, ylim, xlim, legends,
linestyles=[linestyles.pop()]*len(ys), cs=colors,
xtickRotate=xtickRotate)

# Baseline (Naive Zodiac)
xs = []
ys = []
xss = []
f1s = []
mf1s = []
for i in range(0, EXP_NUM):
with open('result/ba_zodiac_{0}_{1}.json'
.format(building, i)) as fp:
data = json.load(fp)
xss.append([datum['learning_srcids'] for datum in data])
f1s.append([datum['metrics']['f1'] for datum in data])
mf1s.append([datum['metrics']['macrof1'] for datum in data])
xs = xss[0] # Assuming all xss are same.
f1 = average_data(xss, f1s, interp_x)
mf1 = average_data(xss, mf1s, interp_x)
x = interp_x
ys = [f1, mf1]
legends = ['MicroF1, {0}'.format('BA/Zodiac'),
'MacroF1, {0}'.format('BA/Zodiac')
]
xtickRotate = 45

_, plots = plotter.plot_multiple_2dline(
x, ys, xlabel, ylabel, xticks, xticks_labels,
yticks, yticks_labels, title, ax, fig,
ylim=ylim, xlim=xlim,
dataLabels=legends,
linestyles=[linestyles.pop()]*len(ys), cs=colors,
xtickRotate=xtickRotate)



ax.grid(True)
ax.tick_params(axis='x', pad=-1.5)
#ax.xaxis.set_label_coords(1.1, -0.2)

ax.legend(bbox_to_anchor=(1.26, 1.75), ncol=1, frameon=False, fontsize='small')
#fig.set_size_inches((8,2))
fig.set_size_inches((1.5,1.7))
save_fig(fig, outputfile)

def plot_quiver_zodiac():
EXP_NUM = 2
building = 'ebu3b'
@@ -520,5 +602,6 @@ def plot_quiver_zodiac():
#plot_pointonly_notransfer()
#plot_pointonly_transfer()
#plot_quiver_zodiac()
plot_entities()
#plot_scrabble_zodiac()
#plot_entities()
plot_scrabble_zodiac()
#plot_ba_zodiac()
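
plot_ba_zodiac, like the other plot helpers, averages several experiment runs onto a common x grid before plotting. The real average_data helper lives elsewhere in the repo; below is a hedged numpy sketch of what such an interpolate-then-average step typically looks like, not the project's implementation.

import numpy as np

def average_curves(xss, yss, interp_x):
    # Interpolate each run's (x, y) curve onto interp_x, then average point-wise.
    # Illustrative stand-in for the average_data helper used above.
    interped = [np.interp(interp_x, xs, ys) for xs, ys in zip(xss, yss)]
    return np.mean(interped, axis=0)

# Two runs sampled at slightly different labeling budgets:
xss = [[10, 50, 100], [10, 60, 100]]
f1s = [[0.40, 0.70, 0.80], [0.45, 0.72, 0.82]]
interp_x = np.arange(10, 101, 10)
f1 = average_curves(xss, f1s, interp_x)
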
11 changes: 8 additions & 3 deletions scripts/run_hong_all.sh
@@ -1,4 +1,9 @@
#!/usr/bin/env bash
(nohup python -u scripts/exp_hong_al.py ebu3b sdh > nohup.hong.ebu3b.sdh; slack_notify --msg 'hong ebu3b sdh at ozone') &
(nohup python -u scripts/exp_hong_al.py sdh ebu3b > nohup.hong.sdh.ebu3b; slack_notify --msg 'hong sdh ebu3b at ozone') &
(nohup python -u scripts/exp_hong_al.py ebu3b ap_m > nohup.hong.ebu3b.ap_m; slack_notify --msg 'hong ebu3b ap_m at ozone') &
(nohup python -u scripts/exp_hong_al.py ebu3b sdh > nohup.hong.ebu3b.sdh; slack_notify --msg 'nohup.hong.ebu3b.sdh at labpc')
(nohup python -u scripts/exp_hong_al.py sdh ebu3b > nohup.hong.sdh.ebu3b; slack_notify --msg 'nohup.hong.sdh.ebu3b at labpc')
(nohup python -u scripts/exp_hong_al.py ebu3b ap_m > nohup.hong.ebu3b.ap_m; slack_notify --msg 'nohup.hong.ebu3b.ap_m at labpc')

(nohup python -u scripts/exp_hong_al.py ebu3b > nohup.hong.ebu3b; slack_notify --msg 'nohup.hong.ebu3b at labpc')
(nohup python -u scripts/exp_hong_al.py sdh > nohup.hong.sdh; slack_notify --msg 'nohup.hong.sdh at labpc')
(nohup python -u scripts/exp_hong_al.py ghc > nohup.hong.ghc; slack_notify --msg 'nohup.hong.ghc at labpc')
(nohup python -u scripts/exp_hong_al.py uva_cse > nohup.hong.uva_cse; slack_notify --msg 'nohup.hong.uva_cse at labpc')
13 changes: 7 additions & 6 deletions scripts/run_zodiac_all.sh
@@ -1,8 +1,9 @@
#!/usr/bin/env bash
(nohup python -u scripts/exp_zodiac.py ebu3b sdh > nohup.zodiac.ebu3b.sdh; slack_notify --msg 'zodiac ebu3b sdh at ozone') &
(nohup python -u scripts/exp_zodiac.py sdh ebu3b > nohup.zodiac.sdh.ebu3b; slack_notify --msg 'zodiac sdh ebu3b at ozone') &
(nohup python -u scripts/exp_zodiac.py ebu3b ap_m > nohup.zodiac.ebu3b.ap_m; slack_notify --msg 'zodiac ebu3b ap_m at ozone') &
(nohup python -u scripts/exp_zodiac.py ebu3b sdh > nohup.zodiac.ebu3b.sdh; slack_notify --msg 'nohup.zodiac.ebu3b.sdh at lab-pc') &
(nohup python -u scripts/exp_zodiac.py sdh ebu3b > nohup.zodiac.sdh.ebu3b; slack_notify --msg 'nohup.zodiac.sdh.ebu3b at lab-pc') &
(nohup python -u scripts/exp_zodiac.py ebu3b ap_m > nohup.zodiac.ebu3b.ap_m; slack_notify --msg 'nohup.zodiac.ebu3b.ap_m') &

(nohup python -u scripts/exp_zodiac.py ebu3b sdh > nohup.zodiac.ebu3b.sdh; slack_notify --msg 'zodiac ebu3b sdh at ozone') &
(nohup python -u scripts/exp_zodiac.py sdh ebu3b > nohup.zodiac.sdh.ebu3b; slack_notify --msg 'zodiac sdh ebu3b at ozone') &
(nohup python -u scripts/exp_zodiac.py ebu3b ap_m > nohup.zodiac.ebu3b.ap_m; slack_notify --msg 'zodiac ebu3b ap_m at ozone') &
(nohup python -u scripts/exp_zodiac.py ebu3b > nohup.zodiac.ebu3b; slack_notify --msg 'zodiac ebu3b sdh at ozone') &
(nohup python -u scripts/exp_zodiac.py sdh > nohup.zodiac.sdh; slack_notify --msg 'nohup.zodiac.sdh') &
(nohup python -u scripts/exp_zodiac.py uva_cse > nohup.zodiac.uva_cse; slack_notify --msg 'nohup.zodiac.uva_cse') &
(nohup python -u scripts/exp_zodiac.py ghc > nohup.zodiac.ghc ; slack_notify --msg 'nohup.zodiac.ghc') &
