diff --git a/baseline/baseline.md b/baseline/baseline.md
index b350783..578938a 100644
--- a/baseline/baseline.md
+++ b/baseline/baseline.md
@@ -37,10 +37,17 @@ nohup gzip -cd /mnt/langsplit/2015_32_kv.gz | ~/DataCollection/baseline/langstat
 If you are collecting data for a language direction for which you already earlier collected data from the reverse direction, please see an optimized process in the appendix.
 
 ## Step 3: Look up where these URLs appear in CommonCrawl S3
+
+### Option 1 (if you have built your own location database)
 ```
 nohup cat candidates.en-de | nice ~/DataCollection/baseline/locate_candidates.py - - -server='http://statmt.org:8084/query_prefix' > candidates.en-de.locations 2> locate.log &
 ```
+### Option 2 (use the [CommonCrawl Index API](http://commoncrawl.org/2015/04/announcing-the-common-crawl-index/))
+```
+nohup cat candidates.en-de | nice ~/DataCollection/baseline/locate_candidates_cc_index_api.py - - > candidates.en-de.locations 2> locate.log &
+```
+
 
 ## Step 4: Download pages from CommonCrawl S3 and extract text
 For certain language pairs we provide the `.locations` files in compressed form in our releases on https://github.com/ModernMT/DataCollection/releases. You can use these files to start the process in this step.
 ```
diff --git a/baseline/locate_candidates_cc_index_api.py b/baseline/locate_candidates_cc_index_api.py
new file mode 100755
index 0000000..d5c3391
--- /dev/null
+++ b/baseline/locate_candidates_cc_index_api.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys
+import json
+import tldextract
+import re
+import requests
+import urllib
+
+COMMONCRAWL_S3_URL = "https://commoncrawl.s3.amazonaws.com"
+COMMONCRAWL_INDEX_URL = "http://index.commoncrawl.org"
+
+INVALID_URL = "123"
+INVALID_CRAWL = "abc"
+
+def make_full_filename(filepath):
+    return '/'.join([COMMONCRAWL_S3_URL, filepath])
+
+def make_query_url(crawl, url):
+    params = {
+        "base_url": COMMONCRAWL_INDEX_URL,
+        "crawl_id": crawl.replace('_', '-'),
+        "url": urllib.quote(url, safe='')  # Percent encode URL.
+    }
+
+    query = "{base_url}/CC-MAIN-{crawl_id}-index?url={url}&output=json&limit=1"
+    return query.format(**params)
+
+def get_location(session, url, crawl):
+    """ Returns success and location """
+    query_url = make_query_url(crawl, url)
+    try:
+        r = session.get(query_url)
+        result = r.json()
+    except:
+        return False, None
+
+    try:
+        data = {
+            "filename": make_full_filename(result["filename"]),
+            "length": result["length"],
+            "mime": result["mime"],
+            "offset": result["offset"]
+        }
+    except KeyError:
+        return False, None
+
+    return True, data
+
+
+def report_error(url, crawl, errors, total):
+    percentage = 100. * errors / total
+    sys.stderr.write("Errors: %d/%d = %.2f%%\t%s\t%s\n" %
+                     (errors, total, percentage, crawl, url))
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('candidates', type=argparse.FileType('r'),
+                        help='file containing candidates')
+    parser.add_argument('outfile', type=argparse.FileType('w'),
+                        default=sys.stdout)
+    parser.add_argument('-kv', help='input is a .kv.gz file',
+                        default=False, action="store_true")
+    args = parser.parse_args(sys.argv[1:])
+
+    total_lines, total_errors = 0, 0
+    with requests.Session() as session:
+        for line in args.candidates:
+            total_lines += 1
+            line = line.decode("utf-8")
+            if args.kv:
+                # Lines have the format:
+                # {domain} {url} {crawl}\t{language_data}
+                url_data, _ = line.strip().split('\t')
+                _, src_url, src_crawl = url_data.strip().split()
+                tgt_success = False
+            else:
+                # Lines have the format:
+                # {stripped_url} {src_url} {src_crawl} {tgt_url} {tgt_crawl}
+                _, src_url, src_crawl, tgt_url, tgt_crawl = line.strip().split()
+
+            src_success, src_loc = get_location(session, src_url, src_crawl)
+            if not src_success:
+                total_errors += 1
+                report_error(src_url, src_crawl, total_errors, total_lines)
+
+            if not args.kv:
+                tgt_success, tgt_loc = get_location(session, tgt_url, tgt_crawl)
+                if not tgt_success:
+                    total_errors += 1
+                    report_error(tgt_url, tgt_crawl, total_errors, total_lines)
+
+            if src_success and tgt_success:
+                args.outfile.write("%s\t%s\t%s\n" %
+                                   (src_url, src_crawl, json.dumps(src_loc)))
+                args.outfile.write("%s\t%s\t%s\n" %
+                                   (tgt_url, tgt_crawl, json.dumps(tgt_loc)))
+            elif args.kv and src_success:
+                args.outfile.write("%s\t%s\t%s\n" %
+                                   (src_url, src_crawl, json.dumps(src_loc)))
+
+    sys.stderr.write("Done: ")
+    report_error(src_url, src_crawl, total_errors, total_lines)
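
For a quick sanity check of Option 2 outside the pipeline, the sketch below performs a single Index API lookup, mirroring what `make_query_url` and `get_location` do per URL. This is only an illustration: the crawl id `2015_32` and the example URL are placeholders, and the error handling is not the script's.
```
import json
import requests

# Placeholders: any (crawl, url) pair from a candidates file would do.
crawl = "2015_32"
url = "http://www.example.com/"

# Same query the script builds: crawl ids use '-' in the index name,
# and output=json&limit=1 asks for at most one JSON record.
index = "http://index.commoncrawl.org/CC-MAIN-%s-index" % crawl.replace("_", "-")
r = requests.get(index, params={"url": url, "output": "json", "limit": "1"})

if r.ok:
    # The index answers one JSON object per line; limit=1 gives a single line.
    record = json.loads(r.text.splitlines()[0])
    print("%s offset=%s length=%s mime=%s" % (
        record["filename"], record["offset"], record["length"], record["mime"]))
else:
    # A non-200 status usually means the URL was not captured in this crawl.
    print("no capture found for %s in %s" % (url, crawl))
```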
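
The `.locations` lines written by this script feed Step 4. As a reference for what those records contain, here is a minimal sketch (not the pipeline's Step 4 tooling) that reads one `.locations` line from stdin and retrieves the corresponding WARC record from S3 with an HTTP Range request, relying on the fact that Common Crawl stores each record as its own gzip member.
```
import json
import sys
import zlib

import requests

# Expects one line of a .locations file on stdin:
#   {url}\t{crawl}\t{location JSON with filename, offset, length, mime}
url, crawl, loc_json = sys.stdin.readline().rstrip("\n").split("\t")
loc = json.loads(loc_json)

# The location's filename is already a full S3 URL; offset/length delimit
# one gzip member inside the WARC file, so a Range request fetches just
# that record.
start = int(loc["offset"])
end = start + int(loc["length"]) - 1
r = requests.get(loc["filename"], headers={"Range": "bytes=%d-%d" % (start, end)})

# Each record is an independent gzip member; 16 + MAX_WBITS makes zlib
# accept the gzip header.
warc_record = zlib.decompress(r.content, 16 + zlib.MAX_WBITS)
print("fetched %d bytes of WARC record for %s" % (len(warc_record), url))
```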