From c1af38916a5a34a680c63d0c3368d189dba3cf10 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Fri, 24 Jan 2020 12:35:59 +0100 Subject: [PATCH 1/2] CLI helper for CodeSearchNet to OpenNMT --- notebooks/codesearchnet-opennmt.py | 98 ++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 notebooks/codesearchnet-opennmt.py diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py new file mode 100644 index 0000000..6f80d9e --- /dev/null +++ b/notebooks/codesearchnet-opennmt.py @@ -0,0 +1,98 @@ +from argparse import ArgumentParser +import os +from pathlib import Path +import time +from typing import Dict, List, Tuple +import logging + +import pandas as pd + +logging.basicConfig(level=logging.INFO) + +class CodeSearchNetRAM(object): + """Stores one split of CodeSearchNet data in memory + + Usage example: + wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' + unzip java.zip + python notebooks/codesearchnet-opennmt.py --data_dir='java/final/jsonl/valid' --newline='\\n' + """ + + def __init__(self, split_path: Path, newline_repl: str): + super().__init__() + self.pd = pd + + files = sorted(split_path.glob('**/*.gz')) + logging.info(f'Total number of files: {len(files):,}') + assert len(files) != 0, "could not find files under %s" % split_path + + columns_list = ['code', 'func_name'] + + start = time.time() + self.pd = self._jsonl_list_to_dataframe(files, columns_list) + logging.info(f"Loading took {time.time() - start:.2f}s for {len(self)} rows") + + @staticmethod + def _jsonl_list_to_dataframe(file_list: List[Path], + columns: List[str]) -> pd.DataFrame: + """Load a list of jsonl.gz files into a pandas DataFrame.""" + return pd.concat([pd.read_json(f, + orient='records', + compression='gzip', + lines=True)[columns] + for f in file_list], sort=False) + + + def __getitem__(self, idx: int): + row = self.pd.iloc[idx] + + # drop class name + fn_name = row["func_name"] + fn_name = fn_name.split('.')[-1] # drop the class name + # fn_name_enc = self.enc.encode(fn_name) + + # drop fn signature + code = row["code"] + fn_body = code[code.find("{") + 1:code.find("}")].lstrip().rstrip() + fn_body = fn_body.replace("\n", "\\n") + # fn_body_enc = self.enc.encode(fn_body) + return (fn_name, fn_body) + + def __len__(self): + return len(self.pd) + + +def main(args): + test_set = CodeSearchNetRAM(Path(args.data_dir), args.newline) + with open(args.src_file, mode="w", encoding="utf8") as s, open(args.tgt_file, mode="w", encoding="utf8") as t: + for fn_name, fn_body in test_set: + print(f"'{fn_name[:40]:40}' - '{fn_body[:40]:40}'") + print(fn_name, file=s) + print(fn_body, file=t) + + + +if __name__ == "__main__": + parser = ArgumentParser(add_help=False) + parser.add_argument('--data_dir', + type=str, + default="java/final/jsonl/test", + help="Path to the unziped input data (CodeSearchNet)") + + parser.add_argument('--newline', + type=str, + default="\\n", + help="Replace newline with this") + + parser.add_argument('--src_file', + type=str, + default="src-trian.txt", + help="File with function bodies") + + parser.add_argument('--tgt_file', + type=str, + default="tgt-trian.txt", + help="File with function texts") + + args = parser.parse_args() + main(args) From f2521a739d4e22a05dc0eb10520068d9523ab14a Mon Sep 17 00:00:00 2001 From: m09 <142691+m09@users.noreply.github.com> Date: Fri, 24 Jan 2020 17:20:52 +0100 Subject: [PATCH 2/2] Tweak the codesearchnet opennmt helper a bit Signed-off-by: m09 <142691+m09@users.noreply.github.com> --- notebooks/codesearchnet-opennmt.py | 109 ++++++++++++++++------------- 1 file changed, 62 insertions(+), 47 deletions(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 6f80d9e..b148061 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -1,59 +1,66 @@ from argparse import ArgumentParser -import os -from pathlib import Path -import time -from typing import Dict, List, Tuple import logging +from pathlib import Path +from time import time +from typing import List import pandas as pd + logging.basicConfig(level=logging.INFO) + class CodeSearchNetRAM(object): """Stores one split of CodeSearchNet data in memory Usage example: wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' unzip java.zip - python notebooks/codesearchnet-opennmt.py --data_dir='java/final/jsonl/valid' --newline='\\n' + python notebooks/codesearchnet-opennmt.py \ + --data_dir='java/final/jsonl/valid' \ + --newline='\\n' """ def __init__(self, split_path: Path, newline_repl: str): super().__init__() self.pd = pd - files = sorted(split_path.glob('**/*.gz')) - logging.info(f'Total number of files: {len(files):,}') - assert len(files) != 0, "could not find files under %s" % split_path + files = sorted(split_path.glob("**/*.gz")) + logging.info(f"Total number of files: {len(files):,}") + assert files, "could not find files under %s" % split_path - columns_list = ['code', 'func_name'] + columns_list = ["code", "func_name"] - start = time.time() + start = time() self.pd = self._jsonl_list_to_dataframe(files, columns_list) - logging.info(f"Loading took {time.time() - start:.2f}s for {len(self)} rows") + logging.info(f"Loading took {time() - start:.2f}s for {len(self)} rows") @staticmethod - def _jsonl_list_to_dataframe(file_list: List[Path], - columns: List[str]) -> pd.DataFrame: + def _jsonl_list_to_dataframe( + file_list: List[Path], columns: List[str] + ) -> pd.DataFrame: """Load a list of jsonl.gz files into a pandas DataFrame.""" - return pd.concat([pd.read_json(f, - orient='records', - compression='gzip', - lines=True)[columns] - for f in file_list], sort=False) - + return pd.concat( + [ + pd.read_json(f, orient="records", compression="gzip", lines=True)[ + columns + ] + for f in file_list + ], + sort=False, + ) def __getitem__(self, idx: int): row = self.pd.iloc[idx] # drop class name fn_name = row["func_name"] - fn_name = fn_name.split('.')[-1] # drop the class name + fn_name = fn_name.split(".")[-1] # drop the class name # fn_name_enc = self.enc.encode(fn_name) # drop fn signature code = row["code"] - fn_body = code[code.find("{") + 1:code.find("}")].lstrip().rstrip() + fn_body = code[code.find("{") + 1 : code.rfind("}")].lstrip().rstrip() fn_body = fn_body.replace("\n", "\\n") # fn_body_enc = self.enc.encode(fn_body) return (fn_name, fn_body) @@ -63,36 +70,44 @@ def __len__(self): def main(args): - test_set = CodeSearchNetRAM(Path(args.data_dir), args.newline) - with open(args.src_file, mode="w", encoding="utf8") as s, open(args.tgt_file, mode="w", encoding="utf8") as t: - for fn_name, fn_body in test_set: - print(f"'{fn_name[:40]:40}' - '{fn_body[:40]:40}'") - print(fn_name, file=s) - print(fn_body, file=t) - + dataset = CodeSearchNetRAM(Path(args.data_dir), args.newline) + split_name = Path(args.data_dir).name + with open(args.src_file % split_name, mode="w", encoding="utf8") as s, open( + args.tgt_file % split_name, mode="w", encoding="utf8" + ) as t: + for fn_name, fn_body in dataset: + if not fn_name or not fn_body: + continue + print(fn_body, file=s) + print(fn_name if args.word_level_targets else " ".join(fn_name), file=t) if __name__ == "__main__": parser = ArgumentParser(add_help=False) - parser.add_argument('--data_dir', - type=str, - default="java/final/jsonl/test", - help="Path to the unziped input data (CodeSearchNet)") - - parser.add_argument('--newline', - type=str, - default="\\n", - help="Replace newline with this") - - parser.add_argument('--src_file', - type=str, - default="src-trian.txt", - help="File with function bodies") - - parser.add_argument('--tgt_file', - type=str, - default="tgt-trian.txt", - help="File with function texts") + parser.add_argument( + "--data_dir", + type=str, + default="java/final/jsonl/test", + help="Path to the unziped input data (CodeSearchNet)", + ) + + parser.add_argument( + "--newline", type=str, default="\\n", help="Replace newline with this" + ) + + parser.add_argument( + "--word-level-targets", + action="store_true", + help="Use word level targets instead of char level ones", + ) + + parser.add_argument( + "--src_file", type=str, default="src-%s.txt", help="File with function bodies", + ) + + parser.add_argument( + "--tgt_file", type=str, default="tgt-%s.txt", help="File with function texts" + ) args = parser.parse_args() main(args)