From c1af38916a5a34a680c63d0c3368d189dba3cf10 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov <bzz@apache.org>
Date: Fri, 24 Jan 2020 12:35:59 +0100
Subject: [PATCH 1/2] CLI helper for CodeSearchNet to OpenNMT

---
 notebooks/codesearchnet-opennmt.py | 98 ++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 notebooks/codesearchnet-opennmt.py

diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py
new file mode 100644
index 0000000..6f80d9e
--- /dev/null
+++ b/notebooks/codesearchnet-opennmt.py
@@ -0,0 +1,98 @@
+from argparse import ArgumentParser
+import os
+from pathlib import Path
+import time
+from typing import Dict, List, Tuple
+import logging
+
+import pandas as pd
+
+logging.basicConfig(level=logging.INFO)
+
+class CodeSearchNetRAM(object):
+    """Stores one split of CodeSearchNet data in memory
+
+    Usage example:
+        wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip'
+        unzip java.zip
+        python notebooks/codesearchnet-opennmt.py --data_dir='java/final/jsonl/valid' --newline='\\n'
+    """
+
+    def __init__(self, split_path: Path, newline_repl: str):
+        super().__init__()
+        self.pd = pd
+
+        files = sorted(split_path.glob('**/*.gz'))
+        logging.info(f'Total number of files: {len(files):,}')
+        assert len(files) != 0, "could not find files under %s" % split_path
+
+        columns_list = ['code', 'func_name']
+
+        start = time.time()
+        self.pd = self._jsonl_list_to_dataframe(files, columns_list)
+        logging.info(f"Loading took {time.time() - start:.2f}s for {len(self)} rows")
+
+    @staticmethod
+    def _jsonl_list_to_dataframe(file_list: List[Path],
+                                 columns: List[str]) -> pd.DataFrame:
+        """Load a list of jsonl.gz files into a pandas DataFrame."""
+        return pd.concat([pd.read_json(f,
+                                    orient='records',
+                                    compression='gzip',
+                                    lines=True)[columns]
+                        for f in file_list], sort=False)
+
+
+    def __getitem__(self, idx: int):
+        row = self.pd.iloc[idx]
+
+        # drop class name
+        fn_name = row["func_name"]
+        fn_name = fn_name.split('.')[-1]  # drop the class name
+        # fn_name_enc = self.enc.encode(fn_name)
+
+        # drop fn signature
+        code = row["code"]
+        fn_body = code[code.find("{") + 1:code.find("}")].lstrip().rstrip()
+        fn_body = fn_body.replace("\n", "\\n")
+        # fn_body_enc = self.enc.encode(fn_body)
+        return (fn_name, fn_body)
+
+    def __len__(self):
+        return len(self.pd)
+
+
+def main(args):
+    test_set = CodeSearchNetRAM(Path(args.data_dir), args.newline)
+    with open(args.src_file, mode="w", encoding="utf8") as s, open(args.tgt_file, mode="w", encoding="utf8") as t:
+        for fn_name, fn_body in test_set:
+            print(f"'{fn_name[:40]:40}' - '{fn_body[:40]:40}'")
+            print(fn_name, file=s)
+            print(fn_body, file=t)
+
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(add_help=False)
+    parser.add_argument('--data_dir',
+                        type=str,
+                        default="java/final/jsonl/test",
+                        help="Path to the unziped input data (CodeSearchNet)")
+
+    parser.add_argument('--newline',
+                    type=str,
+                    default="\\n",
+                    help="Replace newline with this")
+
+    parser.add_argument('--src_file',
+                    type=str,
+                    default="src-trian.txt",
+                    help="File with function bodies")
+
+    parser.add_argument('--tgt_file',
+                    type=str,
+                    default="tgt-trian.txt",
+                    help="File with function texts")
+
+    args = parser.parse_args()
+    main(args)

From f2521a739d4e22a05dc0eb10520068d9523ab14a Mon Sep 17 00:00:00 2001
From: m09 <142691+m09@users.noreply.github.com>
Date: Fri, 24 Jan 2020 17:20:52 +0100
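Subject: [PATCH 2/2] Tweak the codesearchnet opennmt helper a bit

- Cut the function body at the last closing brace (rfind) instead of the
  first one, so bodies containing nested blocks are not truncated.
- Write function bodies to the source file and function names to the
  target file (previously the other way around).
- Add --word-level-targets; by default the target name is emitted as
  space-separated characters.
- Skip samples with an empty name or an empty body.
- Derive output file names from the split directory name
  (src-%s.txt / tgt-%s.txt).
- Drop unused imports and reformat the file.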

Signed-off-by: m09 <142691+m09@users.noreply.github.com>
---
 notebooks/codesearchnet-opennmt.py | 109 ++++++++++++++++-------------
 1 file changed, 62 insertions(+), 47 deletions(-)

diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py
index 6f80d9e..b148061 100644
--- a/notebooks/codesearchnet-opennmt.py
+++ b/notebooks/codesearchnet-opennmt.py
@@ -1,59 +1,66 @@
 from argparse import ArgumentParser
-import os
-from pathlib import Path
-import time
-from typing import Dict, List, Tuple
 import logging
+from pathlib import Path
+from time import time
+from typing import List
 
 import pandas as pd
 
+
 logging.basicConfig(level=logging.INFO)
 
+
 class CodeSearchNetRAM(object):
     """Stores one split of CodeSearchNet data in memory
 
     Usage example:
         wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip'
         unzip java.zip
-        python notebooks/codesearchnet-opennmt.py --data_dir='java/final/jsonl/valid' --newline='\\n'
+        python notebooks/codesearchnet-opennmt.py \\
+            --data_dir='java/final/jsonl/valid' \\
+            --newline='\\n'
     """
 
     def __init__(self, split_path: Path, newline_repl: str):
         super().__init__()
         self.pd = pd
 
-        files = sorted(split_path.glob('**/*.gz'))
-        logging.info(f'Total number of files: {len(files):,}')
-        assert len(files) != 0, "could not find files under %s" % split_path
+        files = sorted(split_path.glob("**/*.gz"))
+        logging.info(f"Total number of files: {len(files):,}")
+        assert files, "could not find files under %s" % split_path
 
-        columns_list = ['code', 'func_name']
+        columns_list = ["code", "func_name"]
 
-        start = time.time()
+        start = time()
         self.pd = self._jsonl_list_to_dataframe(files, columns_list)
-        logging.info(f"Loading took {time.time() - start:.2f}s for {len(self)} rows")
+        logging.info(f"Loading took {time() - start:.2f}s for {len(self)} rows")
 
     @staticmethod
-    def _jsonl_list_to_dataframe(file_list: List[Path],
-                                 columns: List[str]) -> pd.DataFrame:
+    def _jsonl_list_to_dataframe(
+        file_list: List[Path], columns: List[str]
+    ) -> pd.DataFrame:
         """Load a list of jsonl.gz files into a pandas DataFrame."""
-        return pd.concat([pd.read_json(f,
-                                    orient='records',
-                                    compression='gzip',
-                                    lines=True)[columns]
-                        for f in file_list], sort=False)
-
+        return pd.concat(
+            [
+                pd.read_json(f, orient="records", compression="gzip", lines=True)[
+                    columns
+                ]
+                for f in file_list
+            ],
+            sort=False,
+        )
 
     def __getitem__(self, idx: int):
         row = self.pd.iloc[idx]
 
         # drop class name
         fn_name = row["func_name"]
-        fn_name = fn_name.split('.')[-1]  # drop the class name
+        fn_name = fn_name.split(".")[-1]  # drop the class name
         # fn_name_enc = self.enc.encode(fn_name)
 
         # drop fn signature
         code = row["code"]
-        fn_body = code[code.find("{") + 1:code.find("}")].lstrip().rstrip()
+        fn_body = code[code.find("{") + 1 : code.rfind("}")].lstrip().rstrip()
         fn_body = fn_body.replace("\n", "\\n")
         # fn_body_enc = self.enc.encode(fn_body)
         return (fn_name, fn_body)
@@ -63,36 +70,44 @@ def __len__(self):
 
 
 def main(args):
-    test_set = CodeSearchNetRAM(Path(args.data_dir), args.newline)
-    with open(args.src_file, mode="w", encoding="utf8") as s, open(args.tgt_file, mode="w", encoding="utf8") as t:
-        for fn_name, fn_body in test_set:
-            print(f"'{fn_name[:40]:40}' - '{fn_body[:40]:40}'")
-            print(fn_name, file=s)
-            print(fn_body, file=t)
-
+    dataset = CodeSearchNetRAM(Path(args.data_dir), args.newline)
+    split_name = Path(args.data_dir).name
+    with open(args.src_file % split_name, mode="w", encoding="utf8") as s, open(
+        args.tgt_file % split_name, mode="w", encoding="utf8"
+    ) as t:
+        for fn_name, fn_body in dataset:
+            if not fn_name or not fn_body:
+                continue
+            print(fn_body, file=s)
+            print(fn_name if args.word_level_targets else " ".join(fn_name), file=t)
 
 
 if __name__ == "__main__":
     parser = ArgumentParser(add_help=False)
-    parser.add_argument('--data_dir',
-                        type=str,
-                        default="java/final/jsonl/test",
-                        help="Path to the unziped input data (CodeSearchNet)")
-
-    parser.add_argument('--newline',
-                    type=str,
-                    default="\\n",
-                    help="Replace newline with this")
-
-    parser.add_argument('--src_file',
-                    type=str,
-                    default="src-trian.txt",
-                    help="File with function bodies")
-
-    parser.add_argument('--tgt_file',
-                    type=str,
-                    default="tgt-trian.txt",
-                    help="File with function texts")
+    parser.add_argument(
+        "--data_dir",
+        type=str,
+        default="java/final/jsonl/test",
+        help="Path to the unzipped input data (CodeSearchNet)",
+    )
+
+    parser.add_argument(
+        "--newline", type=str, default="\\n", help="Replace newline with this"
+    )
+
+    parser.add_argument(
+        "--word-level-targets",
+        action="store_true",
+        help="Use word level targets instead of char level ones",
+    )
+
+    parser.add_argument(
+        "--src_file", type=str, default="src-%s.txt", help="File with function bodies",
+    )
+
+    parser.add_argument(
+        "--tgt_file", type=str, default="tgt-%s.txt", help="File with function names"
+    )
 
     args = parser.parse_args()
     main(args)