Skip to content

Commit

Permalink
Added small language models and also added output file for ranker
Browse files: browse the repository at this point in the history
  • Loading branch information
Maddela Mounica committed Jun 13, 2019
1 parent 56cd2d5 commit 8d11581
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -4,3 +4,4 @@ neural_ranker/*pyc
neural_ranker/*/*pyc
neural_ranker/__pycache__
neural_ranker/*/__pycache__
data_versions
Binary file added data/small_gt.bin.gz
Binary file not shown.
Binary file added data/small_kn.bin.gz
Binary file not shown.
14 changes: 6 additions & 8 deletions neural_ranker/config.py
Expand Up @@ -3,22 +3,20 @@
"""

LM_KN = [
"/data/maddela/Gigaword/gigaword_kn_3.bin",
"/data/maddela/TwitterGardenhose/language_models/2010_01_12_kn.bin"
"data/small_kn.bin"
]

LM_GT = [
"/data/maddela/Gigaword/gigaword_gt_3.bin",
"/data/maddela/TwitterGardenhose/language_models/2010_01_12_gt.bin",
"data/small_gt.bin"
]

RESOURCES = {
"lm_gt": LM_GT,
"lm_kn": LM_KN,
"wiki": "../data/wiki_titles.txt",
"urban": "../data/urban_dict_words_A_Z.txt",
"twitter": "../data/twitter_counts.tsv",
"google": "../data/google_counts.tsv",
"wiki": "data/wiki_titles.txt",
"urban": "data/urban_dict_words_A_Z.txt",
"twitter": "data/twitter_counts.tsv",
"google": "data/google_counts.tsv",
}


Expand Down
15 changes: 13 additions & 2 deletions neural_ranker/main.py
Expand Up @@ -37,6 +37,13 @@ def main(args):
reranked_segs = rerank(segs, segs_feats, model, args.model)
top_segmentations.append(reranked_segs)

if args.output is not None:
fp = open(args.output, 'w')
for segs in top_segmentations:
target = "".join(segs[0].split())
fp.write(target + "\t" + "\t".join([seg.strip() for seg in segs]) + "\n")
fp.close()

# Evaluate metrics
print("MRR:", mean_reciprocal_rank(test_gold_truths, top_segmentations))
print("Accuracy@1:", accuracy(1, test_gold_truths, top_segmentations))
Expand All @@ -58,9 +65,13 @@ def main(args):
parser.add_argument('--test', help='Path to test hashtags file. The format is same as traning dataset. \n',
dest='test', type=str)
parser.add_argument('--test_topk', help='Path to top-k candidates file of traning dataset. \n'
'The format is same as traning dataset.',
'The format is same as training dataset.',
dest='test_topk', type=str)
parser.add_argument('--model', type=str, dest='model', help='Type of model. The input should be one'
parser.add_argument('--out', help='Path to reranked candidates file. \n'
'The output file is tab seperated. The format is: \n'
'<hashtag without #> <tab separated top-k candidates>.',
dest='output', type=str)
parser.add_argument('--model', type=str, dest='model', default="mse_multi", help='Type of model. The input should be one'
'of the strings: mse, mse_multi, mr, mr_multi')
args = parser.parse_args()
main(args)

0 comments on commit 8d11581

Please sign in to comment.