Update data download script; don't download MRPC IDs or SNLI

nyu-mll · Dec 10, 2020 · b1c8239 · b1c8239
1 parent 4cbf044
commit b1c8239
Showing 1 changed file with 37 additions and 27 deletions.
diff --git a/download_glue_data.py b/download_glue_data.py
@@ -29,24 +29,24 @@
 if sys.version_info >= (3, 0):
     URLLIB=urllib.request
 
-TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
-TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
-             "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
-             "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
-             "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
-             "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
-             "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
-             "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
-             "QNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLI.zip?alt=media&token=c24cad61-f2df-4f04-9ab6-aa576fa829d0',
-             "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
-             "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
-             "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
-
-MRPC_TRAIN = 'https://s3.amazonaws.com/senteval/senteval_data/msr_paraphrase_train.txt'
-MRPC_TEST = 'https://s3.amazonaws.com/senteval/senteval_data/msr_paraphrase_test.txt'
+TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]  # SNLI removed: this script no longer downloads it
+TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
+             "SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
+             "QQP":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
+             "STS":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
+             "MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
+             "QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
+             "RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
+             "WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
+             "diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv'}
+
+MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'  # MRPC has no TASK2PATH entry; format_mrpc fetches these raw files
+MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
 
 def download_and_extract(task, data_dir):
     print("Downloading and extracting %s..." % task)
+    if task == "MNLI":
+        print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
     data_file = "%s.zip" % task
     URLLIB.urlretrieve(TASK2PATH[task], data_file)
     with zipfile.ZipFile(data_file) as zip_ref:
@@ -63,13 +63,30 @@ def format_mrpc(data_dir, path_to_data):
         mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
         mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
     else:
-        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
-        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
-        URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file)
-        URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file)
+        try:
+            mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
+            mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
+            URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file)
+            URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file)
+        except urllib.error.HTTPError:
+            print("Error downloading MRPC")
+            return
     assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
     assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
-    URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
+
+    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
+            io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
+        header = data_fh.readline()
+        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
+        for idx, row in enumerate(data_fh):
+            label, id1, id2, s1, s2 = row.strip().split('\t')
+            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
+
+    try:
+        URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
+    except (KeyError, urllib.error.HTTPError):  # must be a tuple: `KeyError or HTTPError` evaluates to KeyError alone
+        print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
+        return
 
     dev_ids = []
     with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
@@ -89,13 +106,6 @@ def format_mrpc(data_dir, path_to_data):
             else:
                 train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
 
-    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
-            io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
-        header = data_fh.readline()
-        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
-        for idx, row in enumerate(data_fh):
-            label, id1, id2, s1, s2 = row.strip().split('\t')
-            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
     print("\tCompleted!")
 
 def download_diagnostic(data_dir):