Added small language models and an output file option for the ranker

mounicam · Jun 13, 2019 · 8d11581 · 8d11581
1 parent 56cd2d5
commit 8d11581
Show file tree

Hide file tree

Showing 5 changed files with 20 additions and 10 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@ neural_ranker/*pyc
 neural_ranker/*/*pyc
 neural_ranker/__pycache__
 neural_ranker/*/__pycache__
+data_versions
diff --git a/data/small_gt.bin.gz b/data/small_gt.bin.gz
diff --git a/data/small_kn.bin.gz b/data/small_kn.bin.gz
diff --git a/neural_ranker/config.py b/neural_ranker/config.py
@@ -3,22 +3,20 @@
 """
 
 LM_KN = [
-    "/data/maddela/Gigaword/gigaword_kn_3.bin",
-    "/data/maddela/TwitterGardenhose/language_models/2010_01_12_kn.bin"
+    "data/small_kn.bin"
 ]
 
 LM_GT = [
-    "/data/maddela/Gigaword/gigaword_gt_3.bin",
-    "/data/maddela/TwitterGardenhose/language_models/2010_01_12_gt.bin",
+    "data/small_gt.bin" 
 ]
 
 RESOURCES = {
     "lm_gt": LM_GT,
     "lm_kn": LM_KN,
-    "wiki": "../data/wiki_titles.txt",
-    "urban": "../data/urban_dict_words_A_Z.txt",
-    "twitter": "../data/twitter_counts.tsv",
-    "google":  "../data/google_counts.tsv",
+    "wiki": "data/wiki_titles.txt",
+    "urban": "data/urban_dict_words_A_Z.txt",
+    "twitter": "data/twitter_counts.tsv",
+    "google":  "data/google_counts.tsv",
 }
 
 

diff --git a/neural_ranker/main.py b/neural_ranker/main.py
@@ -37,6 +37,13 @@ def main(args):
             reranked_segs = rerank(segs, segs_feats, model, args.model)
             top_segmentations.append(reranked_segs)
 
+    if args.output is not None:
+        fp = open(args.output, 'w')
+        for segs in top_segmentations:
+            target = "".join(segs[0].split())
+            fp.write(target + "\t" + "\t".join([seg.strip() for seg in segs]) + "\n")
+        fp.close()
+
     # Evaluate metrics
     print("MRR:", mean_reciprocal_rank(test_gold_truths, top_segmentations))
     print("Accuracy@1:", accuracy(1, test_gold_truths, top_segmentations))
@@ -58,9 +65,13 @@ def main(args):
     parser.add_argument('--test', help='Path to test hashtags file. The format is same as traning dataset. \n',
                         dest='test', type=str)
     parser.add_argument('--test_topk', help='Path to top-k candidates file of traning dataset. \n'
-                                            'The format is same as traning dataset.',
+                                            'The format is same as training dataset.',
                         dest='test_topk', type=str)
-    parser.add_argument('--model', type=str, dest='model', help='Type of model. The input should be one'
+    parser.add_argument('--out', help='Path to reranked candidates file. \n'
+                                      'The output file is tab seperated. The format is: \n'
+                                      '<hashtag without #> <tab separated top-k candidates>.',
+                        dest='output', type=str)
+    parser.add_argument('--model', type=str, dest='model', default="mse_multi", help='Type of model. The input should be one'
                                                                 'of the strings: mse, mse_multi, mr, mr_multi')
     args = parser.parse_args()
     main(args)