# Очистка и фильтрация датасетов

In [None]:
# Install the third-party packages imported by the next cell (quiet mode).
!pip -q install datasets black datasketch unidecode

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.4/86.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.1/96.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Imports and environment setup
from google.colab import drive
# Mount Google Drive; later cells write their outputs under /content/drive.
drive.mount("/content/drive")
import os
import re
import ast
import hashlib
from typing import Optional, Dict, Any, List, Tuple
from datasets import load_dataset, Dataset, DatasetDict
import black
from unidecode import unidecode
from datasketch import MinHash, MinHashLSH
# NOTE(review): `os` is already imported above; this repeated import is redundant.
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import warnings
# Suppress every SyntaxWarning for the session.
# NOTE(review): presumably raised while parsing dataset snippets (e.g. invalid
# escape sequences) — TODO confirm.
warnings.filterwarnings(
    "ignore",
    category=SyntaxWarning
)

Mounted at /content/drive


In [None]:
# Pipeline configuration read by clean_example() and run_cleaning().
CFG = {
    # Filters (inclusive bounds, checked on the cleaned code)
    "min_loc": 3,
    "max_loc": 150,
    "max_args": 7,
    "max_ast_depth": 10,

    # Cleaning
    "remove_commented_out_code": True,
    "normalize_with_black": True,      # full run: enabled
    "black_line_length": 88,

    "strip_trailing_ws": True,
    "collapse_blank_lines": True,

    "ascii_in_comments_only": True,
    "escape_non_ascii_in_strings": True,

    "transliterate_non_ascii_identifiers": False,
    "exclude_self_cls_from_arg_count": True,

    # Deduplication
    "dedup_exact": True,
    # NOTE(review): the near-duplicate keys below are not read by any code
    # visible in this notebook — confirm whether near-dedup lives elsewhere.
    "dedup_near": False,
    "near_dup_threshold": 0.9,
    "near_dup_num_perm": 128,
    "near_dup_shingle_size": 5,
}
# Number of worker processes for datasets.map
NUM_PROC = max(1, (os.cpu_count() or 2))
print("NUM_PROC =", NUM_PROC)

NUM_PROC = 2


In [None]:
# Heuristic for "commented-out code": a line whose first non-blank character is
# `#` and whose comment text either
#   * starts with a Python statement keyword,
#   * starts with something that looks like an assignment target followed by `=`, or
#   * is a dotted name followed by a parenthesised call that ends the line.
# NOTE: the keyword branch also matches prose comments that merely begin with
# such a word (e.g. "# if the value is missing ...").
COMMENTED_CODE_RE = re.compile(
    r"^\s*#\s*("
    r"(?:def|class|for|while|if|elif|else|try|except|with|return|yield|import|from|raise|assert)\b"
    r"|[\w\]\)\}]+\s*=\s*.+"
    r"|[\w\.]+\(.+\)\s*$"
    r")"
)
# Three or more consecutive newlines, i.e. two or more blank lines in a row.
MULTI_BLANK_RE = re.compile(r"\n{3,}")
# Any single character outside the 7-bit ASCII range.
IDENT_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]")

In [None]:
# Вспомогательные функции
def compute_loc(code: str) -> int:
    """Return how many lines ``str.splitlines`` finds in ``code``.

    Blank lines and comment lines are counted like any other line.
    """
    return sum(1 for _ in code.splitlines())

def _ast_max_depth(node: ast.AST) -> int:
    """Return the number of nodes on the longest root-to-leaf path under ``node``.

    A node without children has depth 1; anything that is not an ``ast.AST``
    instance has depth 0.

    The traversal is iterative (explicit stack) so that very deep trees, such
    as long operator chains, are measured instead of raising
    ``RecursionError``.
    """
    if not isinstance(node, ast.AST):
        return 0

    deepest = 0
    pending = [(node, 1)]  # (node, depth of that node counted from the root)
    while pending:
        current, level = pending.pop()
        if level > deepest:
            deepest = level
        # iter_child_nodes yields only AST instances, so no type check is needed here.
        pending.extend((child, level + 1) for child in ast.iter_child_nodes(current))
    return deepest

def extract_first_function_node(tree: ast.AST) -> Optional[ast.AST]:
    """Find the first function definition in ``tree``'s top-level body.

    Top-level statements are scanned in order. A (sync or async) function is
    returned as soon as it is seen; for a class, its first directly contained
    method is returned if it has one. Objects without a ``body`` attribute,
    and bodies with no match, yield ``None``.
    """
    function_types = (ast.FunctionDef, ast.AsyncFunctionDef)
    for stmt in getattr(tree, "body", []):
        if isinstance(stmt, function_types):
            return stmt
        if not isinstance(stmt, ast.ClassDef):
            continue
        first_method = next(
            (member for member in stmt.body if isinstance(member, function_types)),
            None,
        )
        if first_method is not None:
            return first_method
    return None

In [None]:
def count_args(fn: ast.AST, exclude_self_cls: bool = True) -> int:
    """Count the parameters declared by a function definition node.

    Positional-only, regular and keyword-only parameters each count once, as
    do ``*args`` and ``**kwargs`` when present.

    Args:
        fn: An ``ast.FunctionDef`` / ``ast.AsyncFunctionDef`` node. Any other
            value (including ``None``) gives 0.
        exclude_self_cls: When true, a leading ``self`` or ``cls`` parameter
            is not counted.

    Returns:
        The number of parameters.
    """
    if not isinstance(fn, (ast.FunctionDef, ast.AsyncFunctionDef)):
        return 0

    spec = fn.args
    # Positional-only parameters precede regular ones in the signature, so the
    # receiver (`self`/`cls`) is the first element of this combined list even
    # for `def f(self, /, x)`.
    positional = [*spec.posonlyargs, *spec.args]

    total = len(positional) + len(spec.kwonlyargs)
    if spec.vararg is not None:
        total += 1
    if spec.kwarg is not None:
        total += 1

    if exclude_self_cls and positional and positional[0].arg in ("self", "cls"):
        total -= 1
    return total

In [None]:
def normalize_newlines(s: str) -> str:
    """Convert Windows (CRLF) and bare CR line endings to ``\\n``.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not s:
        return ""
    # CRLF must be handled first so it does not turn into two newlines.
    without_crlf = s.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")

def strip_trailing_whitespace(s: str) -> str:
    """Right-strip every ``\\n``-separated line of ``s`` and re-join them."""
    return "\n".join(map(str.rstrip, s.split("\n")))

def collapse_blank_lines(s: str) -> str:
    """Squeeze runs of blank lines and normalise the text's outer whitespace.

    Every match of ``MULTI_BLANK_RE`` is replaced by exactly two newlines
    (one blank line), leading/trailing whitespace is stripped, and a single
    trailing newline is appended.
    """
    squeezed = MULTI_BLANK_RE.sub("\n\n", s)
    return f"{squeezed.strip()}\n"

In [None]:
def remove_commented_out_code_lines(s: str) -> str:
    """Drop whole-line comments that look like commented-out code.

    A line is removed when it matches ``COMMENTED_CODE_RE`` *and* the
    tokenizer confirms that the line starts with a real comment token. The
    second condition keeps lines that only look like comments because they
    sit inside a multi-line string or docstring (for example a shell snippet
    in a docstring).

    If ``s`` cannot be tokenized, the function falls back to the purely
    regex-based, line-by-line decision.

    Args:
        s: Source text with ``\\n`` line separators.

    Returns:
        The text without the removed lines, joined by ``\\n``.
    """
    import io
    import tokenize

    lines = s.split("\n")

    try:
        # 1-based rows on which a COMMENT token is the first non-blank thing.
        comment_rows = {
            tok.start[0]
            for tok in tokenize.generate_tokens(io.StringIO(s).readline)
            if tok.type == tokenize.COMMENT
            and not lines[tok.start[0] - 1][: tok.start[1]].strip()
        }
    except Exception:
        # Best effort, matching the other token-based helpers in this file:
        # untokenizable input is handled by the regex alone.
        comment_rows = None

    kept = []
    for row, line in enumerate(lines, start=1):
        looks_like_code = COMMENTED_CODE_RE.match(line) is not None
        is_real_comment = comment_rows is None or row in comment_rows
        if looks_like_code and is_real_comment:
            continue
        kept.append(line)
    return "\n".join(kept)

def black_format(code: str, line_length: int = 88) -> str:
    """Format ``code`` with black, returning it unchanged if formatting fails.

    Args:
        code: Source text to format.
        line_length: Maximum line length passed to black's mode.

    Returns:
        The formatted source, or the original ``code`` when black raises.
    """
    try:
        formatted = black.format_str(
            code,
            mode=black.FileMode(line_length=line_length),
        )
    except Exception:
        # Best effort: input black cannot handle is passed through as-is.
        return code
    return formatted

In [None]:
def stable_hash(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are skipped
    rather than raising.
    """
    payload = text.encode("utf-8", errors="ignore")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()

def ascii_clean_comment_text(comment: str) -> str:
    """Return ``comment`` with every non-ASCII character removed."""
    return "".join(ch for ch in comment if ord(ch) < 128)

def escape_non_ascii_string_literal(token_text: str) -> str:
    """Rewrite a string-literal token as an ASCII-only literal of equal value.

    The token is returned unchanged when it does not look like a quoted
    literal, when its prefix contains ``f`` (f-strings are not evaluated),
    when it cannot be evaluated, or when it evaluates to something other
    than ``str`` (e.g. bytes). Otherwise the result is ``ascii(value)``, i.e.
    a single-line literal with non-ASCII characters written as escapes.
    """
    shape = re.match(r"^([rubfRUBF]*)(['\"]{1,3})(.*)\2$", token_text, flags=re.DOTALL)
    if shape is None or "f" in shape.group(1).lower():
        return token_text

    try:
        value = ast.literal_eval(token_text)
    except Exception:
        return token_text

    # ascii() yields \x / \u / \U escapes, so the output is pure ASCII.
    return ascii(value) if isinstance(value, str) else token_text

In [None]:
def clean_non_ascii_in_comments_and_strings(code: str,
                                           ascii_in_comments_only: bool = True,
                                           escape_non_ascii_in_strings: bool = True) -> str:
    """Make comments and string literals ASCII-only, leaving other text untouched.

    Comment tokens are passed through ``ascii_clean_comment_text`` and string
    tokens through ``escape_non_ascii_string_literal``. Only tokens whose text
    actually changes are rewritten, and they are spliced into the original
    source at their recorded positions. This avoids
    ``tokenize.untokenize`` on ``(type, string)`` pairs, which does not
    preserve the spacing between tokens and therefore re-spaces every sample,
    including those that need no change.

    Args:
        code: Source text to clean.
        ascii_in_comments_only: Strip non-ASCII characters from comments.
        escape_non_ascii_in_strings: Escape non-ASCII characters in string
            literals.

    Returns:
        The cleaned source. ``code`` is returned unchanged if it cannot be
        tokenized, if nothing needs rewriting, or if a token position does
        not line up with the source text.
    """
    import io
    import tokenize

    try:
        tokens = list(tokenize.generate_tokens(io.StringIO(code).readline))
    except Exception:
        # Best effort: untokenizable snippets are left for later stages.
        return code

    # Each entry: (start, end, original token text, replacement text).
    edits = []
    for tok in tokens:
        original = tok.string
        if original.isascii():
            continue
        if tok.type == tokenize.COMMENT and ascii_in_comments_only:
            replacement = ascii_clean_comment_text(original)
        elif tok.type == tokenize.STRING and escape_non_ascii_in_strings:
            replacement = escape_non_ascii_string_literal(original)
        else:
            continue
        if replacement != original:
            edits.append((tok.start, tok.end, original, replacement))

    if not edits:
        return code

    # Split exactly as the tokenizer's readline did so rows line up.
    lines = io.StringIO(code).readlines()

    # Apply from the end of the file backwards so that positions of tokens
    # not yet processed stay valid.
    for (start_row, start_col), (end_row, end_col), original, replacement in reversed(edits):
        span = "".join(lines[start_row - 1:end_row])
        # Characters after the token on its last line.
        tail_len = len(lines[end_row - 1]) - end_col
        stop = len(span) - tail_len
        if span[start_col:stop] != original:
            # Positions disagree with the text; do not risk corrupting it.
            return code
        lines[start_row - 1:end_row] = [span[:start_col] + replacement + span[stop:]]

    return "".join(lines)

In [None]:
def transliterate_identifiers(code: str) -> str:
    """Replace non-ASCII identifiers with their ``unidecode`` transliteration.

    Only NAME tokens containing non-ASCII characters are rewritten, and they
    are spliced into the original text at their recorded positions so the
    rest of the source keeps its formatting (``tokenize.untokenize`` on
    ``(type, string)`` pairs would re-space the whole snippet).

    A transliteration is skipped when it is not a valid identifier or is a
    Python keyword, because writing it back would break the code.

    NOTE: distinct names can transliterate to the same ASCII spelling; such
    collisions are not detected.

    Returns:
        The rewritten source, or ``code`` unchanged when it cannot be
        tokenized, nothing needs rewriting, or a token position does not
        line up with the source text.
    """
    import io
    import keyword
    import tokenize

    try:
        toks = list(tokenize.generate_tokens(io.StringIO(code).readline))
    except Exception:
        # Best effort: untokenizable snippets are returned as-is.
        return code

    # Each entry: (start, end, original name, ASCII name).
    edits = []
    for tok in toks:
        if tok.type != tokenize.NAME or tok.string.isascii():
            continue
        ascii_name = unidecode(tok.string)
        if not ascii_name.isidentifier() or keyword.iskeyword(ascii_name):
            continue
        edits.append((tok.start, tok.end, tok.string, ascii_name))

    if not edits:
        return code

    # Split exactly as the tokenizer's readline did so rows line up.
    lines = io.StringIO(code).readlines()

    # NAME tokens never span lines; go right-to-left so earlier columns stay valid.
    for (row, start_col), (_, end_col), old_name, new_name in reversed(edits):
        line = lines[row - 1]
        if line[start_col:end_col] != old_name:
            # Positions disagree with the text; do not risk corrupting it.
            return code
        lines[row - 1] = line[:start_col] + new_name + line[end_col:]

    return "".join(lines)

In [None]:
def clean_example(code: str, doc: Optional[str] = None, cfg: Dict[str, Any] = None) -> Dict[str, Any]:
    """Clean one code sample and compute its filter metrics.

    Cleaning steps run in a fixed order: newline normalisation, trailing
    whitespace removal, commented-out-code removal, non-ASCII clean-up in
    comments/strings, optional identifier transliteration, black formatting,
    and blank-line collapsing. Each optional step is switched by ``cfg``.

    Args:
        code: Raw source of the sample.
        doc: Documentation string, passed through untouched.
        cfg: Settings dict; the module-level ``CFG`` is used when falsy.

    Returns:
        A dict with the cleaned code (``code_clean``), ``docstring``, the
        metrics ``loc`` / ``n_args`` / ``ast_depth``, the ``parsable`` and
        ``keep`` flags, and the ``sha256`` of the cleaned code.
    """
    settings = cfg or CFG
    text = normalize_newlines(code)

    if settings["strip_trailing_ws"]:
        text = strip_trailing_whitespace(text)
    if settings["remove_commented_out_code"]:
        text = remove_commented_out_code_lines(text)

    text = clean_non_ascii_in_comments_and_strings(
        text,
        ascii_in_comments_only=settings["ascii_in_comments_only"],
        escape_non_ascii_in_strings=settings["escape_non_ascii_in_strings"],
    )

    if settings["transliterate_non_ascii_identifiers"]:
        text = transliterate_identifiers(text)
    if settings["normalize_with_black"]:
        text = black_format(text, line_length=settings["black_line_length"])
    if settings["collapse_blank_lines"]:
        text = collapse_blank_lines(text)

    line_count = compute_loc(text)

    # Metrics are taken from the cleaned text; any failure marks the sample
    # as unparsable and zeroes both AST-derived metrics.
    try:
        module = ast.parse(text)
        first_fn = extract_first_function_node(module)
        arg_count = count_args(
            first_fn,
            exclude_self_cls=settings["exclude_self_cls_from_arg_count"],
        )
        depth = _ast_max_depth(module)
        parsed_ok = True
    except Exception:
        arg_count, depth, parsed_ok = 0, 0, False

    # Limits are consulted only for parsable samples.
    keep = False
    if parsed_ok:
        keep = (
            settings["min_loc"] <= line_count <= settings["max_loc"]
            and arg_count <= settings["max_args"]
            and depth <= settings["max_ast_depth"]
        )

    # Key order determines the column order of the resulting dataset.
    return {
        "code_clean": text,
        "docstring": doc,
        "loc": line_count,
        "n_args": arg_count,
        "ast_depth": depth,
        "parsable": parsed_ok,
        "keep": keep,
        "sha256": stable_hash(text),
    }

def run_cleaning(ds: Dataset, code_col: str, doc_col: Optional[str], cfg: Dict[str, Any], num_proc: int) -> Dataset:
    """Clean a dataset split, drop rejected rows and exact duplicates.

    Args:
        ds: Split to process.
        code_col: Column holding the source code.
        doc_col: Column holding the documentation string, or a falsy value
            when there is none.
        cfg: Settings forwarded to ``clean_example``; ``dedup_exact``
            (default true) toggles the duplicate filter.
        num_proc: Worker processes for the cleaning ``map`` step.

    Returns:
        The split with the columns produced by ``clean_example`` added,
        restricted to rows with ``keep == True`` and, when enabled, to the
        first occurrence of every ``sha256`` value within this split.
    """
    def _map_fn(ex):
        code = ex.get(code_col, "")
        doc = ex.get(doc_col) if doc_col else None
        return clean_example(code=code, doc=doc, cfg=cfg)

    cleaned = ds.map(_map_fn, desc=f"Cleaning {code_col}", num_proc=num_proc)

    # input_columns hands the predicate just the one value it needs instead
    # of a dict of every column (including the large code strings).
    cleaned = cleaned.filter(
        lambda keep: keep,
        input_columns=["keep"],
        desc="Filtering keep==True",
    )

    if cfg.get("dedup_exact", True):
        seen = set()

        def _is_first_occurrence(digest):
            if digest in seen:
                return False
            seen.add(digest)
            return True

        # This predicate is stateful (shared `seen` set), so the filter must
        # stay single-process: do not pass num_proc here.
        cleaned = cleaned.filter(
            _is_first_occurrence,
            input_columns=["sha256"],
            desc="Exact dedup (sha256)",
        )

    return cleaned


In [None]:
# Upgrade (-U) the packages installed earlier and additionally pyarrow.
# NOTE(review): these modules were already imported above, so the upgraded
# versions may only take effect after a runtime restart — TODO confirm.
!pip -q install -U datasets pyarrow black datasketch unidecode

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h

## Загрузка и очистка CodeSearchNet

In [None]:
# Parquet version of CodeSearchNet ("python" configuration)
csn = load_dataset("claudios/code_search_net", "python")

# Show the available splits and the column names of the train split.
print(csn)
print(csn["train"].column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

python/train-00000-of-00003.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

python/train-00001-of-00003.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

python/train-00002-of-00003.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

python/test-00000-of-00001.parquet:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

python/validation-00000-of-00001.parquet:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url'],
        num_rows: 23107
    })
})
['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url']


In [None]:
# Columns of CodeSearchNet that hold the function source and its documentation.
CODE_COL = "func_code_string"
DOC_COL  = "func_documentation_string"

# Each split is cleaned and deduplicated independently of the others.
csn_train_clean = run_cleaning(csn["train"], code_col=CODE_COL, doc_col=DOC_COL, cfg=CFG, num_proc=NUM_PROC)
csn_valid_clean = run_cleaning(csn["validation"], code_col=CODE_COL, doc_col=DOC_COL, cfg=CFG, num_proc=NUM_PROC)
csn_test_clean  = run_cleaning(csn["test"],  code_col=CODE_COL, doc_col=DOC_COL, cfg=CFG, num_proc=NUM_PROC)

csn_clean = DatasetDict({"train": csn_train_clean, "validation": csn_valid_clean, "test": csn_test_clean})
# Last expression of the cell: displays the resulting DatasetDict.
csn_clean

Cleaning func_code_string (num_proc=2):   0%|          | 0/412178 [00:00<?, ? examples/s]

Filtering keep==True:   0%|          | 0/412178 [00:00<?, ? examples/s]

Exact dedup (sha256):   0%|          | 0/284003 [00:00<?, ? examples/s]

Cleaning func_code_string (num_proc=2):   0%|          | 0/23107 [00:00<?, ? examples/s]

Filtering keep==True:   0%|          | 0/23107 [00:00<?, ? examples/s]

Exact dedup (sha256):   0%|          | 0/15960 [00:00<?, ? examples/s]

Cleaning func_code_string (num_proc=2):   0%|          | 0/22176 [00:00<?, ? examples/s]

Filtering keep==True:   0%|          | 0/22176 [00:00<?, ? examples/s]

Exact dedup (sha256):   0%|          | 0/15219 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url', 'code_clean', 'docstring', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 283995
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url', 'code_clean', 'docstring', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 15960
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url', 'code_clean', 'docstring', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 15219
    })
})

In [None]:
# Persist the cleaned CodeSearchNet splits to Google Drive.
OUT_DIR = "/content/drive/MyDrive/project_2263_clean/csn_python_clean_full"
os.makedirs(OUT_DIR, exist_ok=True)

csn_clean.save_to_disk(OUT_DIR)
print("Saved to:", OUT_DIR)


Saving the dataset (0/3 shards):   0%|          | 0/283995 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15960 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15219 [00:00<?, ? examples/s]

Saved to: /content/drive/MyDrive/project_2263_clean/csn_python_clean_full


In [None]:
# Export to JSONL (convenient for further preparation)
OUT_JSONL_DIR = "/content/drive/MyDrive/project_2263_clean/csn_python_jsonl_full"
os.makedirs(OUT_JSONL_DIR, exist_ok=True)

# One file per split: csn_<split>_clean.jsonl
for split in ["train", "validation", "test"]:
    csn_clean[split].to_json(f"{OUT_JSONL_DIR}/csn_{split}_clean.jsonl")

print("JSONL saved to:", OUT_JSONL_DIR)


Creating json from Arrow format:   0%|          | 0/284 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

JSONL saved to: /content/drive/MyDrive/project_2263_clean/csn_python_jsonl_full


#### Блок проверки

In [None]:
from datasets import load_from_disk

# Reload the saved copy; this rebinds `csn_clean` to the on-disk dataset so
# the checks below run against what was actually written.
csn_clean = load_from_disk(
    "/content/drive/MyDrive/project_2263_clean/csn_python_clean_full"
)

In [None]:

# Show splits, columns and row counts of the reloaded dataset.
print(csn_clean)

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url', 'code_clean', 'docstring', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 283995
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url', 'code_clean', 'docstring', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 15960
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url', 'code_clean', 'docstring', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 15219
    })
})


In [None]:
# Row counts before and after cleaning for the train split.
# Requires the raw `csn` dataset to still be loaded in this session.
orig = len(csn["train"])
clean = len(csn_clean["train"])

print(f"Train было: {orig}")
print(f"Train стало: {clean}")
print(f"Удалено: {orig - clean} ({(orig-clean)/orig:.2%})")

Train было: 412178
Train стало: 283995
Удалено: 128183 (31.10%)


In [None]:
# Inspect the first cleaned train example: columns, metrics and the first
# 500 characters of the cleaned code.
ex = csn_clean["train"][0]
print(ex.keys())
print(ex["loc"], ex["n_args"], ex["ast_depth"])
print(ex["code_clean"][:500])

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_documentation_string', 'func_code_url', 'code_clean', 'docstring', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'])
10 2 10
def addidsuffix(self, idsuffix, recursive=True):
    """Appends a suffix to this element's ID, and optionally to all child IDs as well. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""
    if self.id:
        self.id += idsuffix
    if recursive:
        for e in self:
            try:
                e.addidsuffix(idsuffix, recursive)
            except Exception:
                pass



In [None]:
import numpy as np

# Metric columns of the cleaned train split as arrays.
locs = np.array(csn_clean["train"]["loc"])
args = np.array(csn_clean["train"]["n_args"])
depths = np.array(csn_clean["train"]["ast_depth"])

# The extremes should stay within the filter bounds configured in CFG.
print("LOC min/max:", locs.min(), locs.max())
print("Args max:", args.max())
print("AST depth max:", depths.max())

LOC min/max: 3 150
Args max: 7
AST depth max: 10


In [None]:
# Every remaining train row must be flagged as parsable.
assert all(csn_clean["train"]["parsable"]), "Есть непарсящийся код!"

In [None]:
# Exact-dedup check: the number of distinct hashes should equal the row count.
hashes = csn_clean["train"]["sha256"]
print("Уникальных хэшей:", len(set(hashes)))
print("Всего:", len(hashes))

Уникальных хэшей: 283995
Всего: 283995


In [None]:
# Cross-split leakage check. run_cleaning deduplicates each split on its own,
# so exact duplicates shared between splits would only show up here.
h_train = set(csn_clean["train"]["sha256"])
h_val   = set(csn_clean["validation"]["sha256"])
h_test  = set(csn_clean["test"]["sha256"])

print("train ∩ val:", len(h_train & h_val))
print("train ∩ test:", len(h_train & h_test))
print("val ∩ test:", len(h_val & h_test))

train ∩ val: 0
train ∩ test: 0
val ∩ test: 0


In [None]:
# Spot check of a single example (index 100) for non-ASCII characters.
# NOTE: this does not verify the rest of the split.
sample = csn_clean["train"][100]["code_clean"]
print(any(ord(c) > 127 for c in sample))

False


## Загрузка и очистка CodeXGLUE

In [None]:
# Column names used by the CodeXGLUE code-to-text dataset.
# NOTE: this rebinds CODE_COL / DOC_COL set for CodeSearchNet earlier.
CODE_COL = "code"
DOC_COL = "docstring"

# Try the canonical dataset id first; on RuntimeError fall back to the
# "claudios/" copy of the same dataset.
try:
    cx = load_dataset("code_x_glue_ct_code_to_text", "python")
    print("Загружено: code_x_glue_ct_code_to_text / python")
except RuntimeError as e:
    msg = str(e)
    # Only the first 300 characters of the error message are shown.
    print("Не получилось загрузить стандартный CodeXGLUE:", msg[:300], "...\n")
    cx = load_dataset("claudios/code_x_glue_ct_code_to_text", "python")
    print("Загружено: claudios/code_x_glue_ct_code_to_text / python")

print(cx)
print(cx["train"].column_names)

README.md: 0.00B [00:00, ?B/s]

python/train-00000-of-00002.parquet:   0%|          | 0.00/144M [00:00<?, ?B/s]

python/train-00001-of-00002.parquet:   0%|          | 0.00/147M [00:00<?, ?B/s]

python/validation-00000-of-00001.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

python/test-00000-of-00001.parquet:   0%|          | 0.00/18.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/251820 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13914 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/14918 [00:00<?, ? examples/s]

Загружено: code_x_glue_ct_code_to_text / python
DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 251820
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 13914
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 14918
    })
})
['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url']


In [None]:
# Resolve split names: the validation split may be called "validation",
# "valid" or "dev" depending on the dataset copy that was loaded.
train_split = "train"
valid_split = "validation" if "validation" in cx else ("valid" if "valid" in cx else ("dev" if "dev" in cx else None))
test_split  = "test" if "test" in cx else None

if valid_split is None or test_split is None:
    raise ValueError(f"Не нашёл нужные split'ы. Доступные: {list(cx.keys())}")

# Each split is cleaned and deduplicated independently of the others.
cx_train_clean = run_cleaning(cx[train_split], code_col=CODE_COL, doc_col=DOC_COL, cfg=CFG, num_proc=NUM_PROC)
cx_valid_clean = run_cleaning(cx[valid_split], code_col=CODE_COL, doc_col=DOC_COL, cfg=CFG, num_proc=NUM_PROC)
cx_test_clean  = run_cleaning(cx[test_split],  code_col=CODE_COL, doc_col=DOC_COL, cfg=CFG, num_proc=NUM_PROC)

# Results are stored under canonical split names regardless of the source names.
codexglue_clean = DatasetDict({
    "train": cx_train_clean,
    "validation": cx_valid_clean,
    "test": cx_test_clean
})

# Last expression of the cell: displays the resulting DatasetDict.
codexglue_clean

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'code_clean', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 177695
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'code_clean', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 9843
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'code_clean', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'],
        num_rows: 10490
    })
})

In [None]:
# Persist the cleaned CodeXGLUE splits to Google Drive.
# NOTE: this rebinds OUT_DIR, previously used for the CodeSearchNet output.
OUT_DIR = "/content/drive/MyDrive/project_2263_clean/codexglue_python_clean_full"
os.makedirs(OUT_DIR, exist_ok=True)

codexglue_clean.save_to_disk(OUT_DIR)
print("Saved to:", OUT_DIR)

Saving the dataset (0/2 shards):   0%|          | 0/177695 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9843 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10490 [00:00<?, ? examples/s]

Saved to: /content/drive/MyDrive/project_2263_clean/codexglue_python_clean_full


#### Блок проверки

In [None]:
# Train row counts before/after cleaning and the fraction of rows kept.
print("Исходный train:", len(cx["train"]))
print("Очищенный train:", len(codexglue_clean["train"]))
print("Доля оставшихся:",
      len(codexglue_clean["train"]) / len(cx["train"]))


Исходный train: 251820
Очищенный train: 177695
Доля оставшихся: 0.7056429195457072


In [None]:
# Inspect the first cleaned train example.
ex = codexglue_clean["train"][0]

print(ex.keys())
print("LOC:", ex["loc"],
      "Args:", ex["n_args"],
      "AST:", ex["ast_depth"],
      "Parsable:", ex["parsable"])

# Truncated previews: 400 characters of code, 200 of the docstring
# (None is printed when the docstring is empty or missing).
print("\nCODE:\n", ex["code_clean"][:400])
print("\nDOC:\n", ex["docstring"][:200] if ex["docstring"] else None)

dict_keys(['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'code_clean', 'loc', 'n_args', 'ast_depth', 'parsable', 'keep', 'sha256'])
LOC: 8 Args: 2 AST: 7 Parsable: True

CODE:
 def settext(self, text, cls="current"):
    """Set the text for this element.

    Arguments:
        text (str): The text
        cls (str): The class of the text, defaults to ``current`` (leave this unless you know what you are doing). There may be only one text content element of each class associated with the element.
    """
    self.replace(TextContent, value=text, cls=cls)


DOC:
 Set the text for this element.

        Arguments:
            text (str): The text
            cls (str): The class of the text, defaults to ``current`` (leave this unless you know what you are doing


In [None]:

# Metric columns of the cleaned CodeXGLUE train split as arrays.
# Relies on `np` imported in an earlier cell; rebinds locs/args/depths.
locs   = np.array(codexglue_clean["train"]["loc"])
args   = np.array(codexglue_clean["train"]["n_args"])
depths = np.array(codexglue_clean["train"]["ast_depth"])

# The extremes should stay within the filter bounds configured in CFG.
print("LOC min/max:", locs.min(), locs.max())
print("Args max:", args.max())
print("AST depth max:", depths.max())


LOC min/max: 3 150
Args max: 7
AST depth max: 10


In [None]:
# Every remaining train row must be flagged as parsable.
assert all(codexglue_clean["train"]["parsable"])

In [None]:
# Exact-dedup check: the number of distinct hashes should equal the row count.
hashes = codexglue_clean["train"]["sha256"]

print("Всего:", len(hashes))
print("Уникальных:", len(set(hashes)))

Всего: 177695
Уникальных: 177695


In [None]:
# Cross-split leakage check. run_cleaning deduplicates each split on its own,
# so exact duplicates shared between splits would only show up here.
h_train = set(codexglue_clean["train"]["sha256"])
h_val   = set(codexglue_clean["validation"]["sha256"])
h_test  = set(codexglue_clean["test"]["sha256"])

print("train ∩ val:", len(h_train & h_val))
print("train ∩ test:", len(h_train & h_test))
print("val ∩ test:", len(h_val & h_test))

train ∩ val: 0
train ∩ test: 0
val ∩ test: 0


In [None]:
# Spot check of a single example (index 100) for non-ASCII characters.
# NOTE: this does not verify the rest of the split.
sample = codexglue_clean["train"][100]["code_clean"]
print(any(ord(c) > 127 for c in sample))

False


In [None]:
# Percentiles of the metric arrays; when the notebook is run top to bottom,
# locs/args/depths hold the CodeXGLUE train values assigned a few cells above.
print("LOC p50 / p95:", np.percentile(locs, [50, 95]))
print("Args p95:", np.percentile(args, 95))
print("AST depth p95:", np.percentile(depths, 95))

LOC p50 / p95: [15. 46.]
Args p95: 5.0
AST depth p95: 10.0


## Будем использовать CodeSearchNet для обучения, а CodeXGLUE для валидации