In [2]:
from pathlib import Path

import csv
import pandas as pd

# Input CSV, relative to the notebook's working directory.
csv_path = Path(r"../../artifacts_full/patent_quality_output.csv")
# Number of data rows to load for the preview (passed to pd.read_csv as nrows).
max_rows = 20

# Encodings tried in order by _detect_encoding. "utf-8-sig" comes first so a
# leading BOM is stripped; it also decodes BOM-less UTF-8. "gb18030" is the
# fallback when the bytes are not valid UTF-8.
encoding_candidates = ["utf-8-sig", "utf-8", "gb18030"]
# Delimiters that _detect_delimiter allows csv.Sniffer to choose from.
delimiter_candidates = [",", "\t", ";", "|"]


def _detect_encoding(
    path: Path,
    candidates: "list[str] | None" = None,
    sample_size: int = 256 * 1024,
) -> str:
    """Return the first candidate encoding that strictly decodes the head of ``path``.

    Only the first ``sample_size`` bytes are inspected. Because a fixed-size
    byte sample can end in the middle of a multi-byte character, the sample is
    decoded incrementally and an incomplete trailing sequence is tolerated
    whenever more bytes follow the sample. Without this, valid UTF-8 text could
    be rejected and misdetected as a later candidate.

    Args:
        path: File to inspect.
        candidates: Encodings to try, in priority order. Defaults to the
            module-level ``encoding_candidates``.
        sample_size: Number of leading bytes to examine.

    Returns:
        The name of the first encoding that decodes the sample without error,
        or ``"utf-8"`` when ``candidates`` is empty.

    Raises:
        UnicodeDecodeError: The error from the last candidate when none fits.
    """
    import codecs  # local import so this definition is self-contained

    if candidates is None:
        candidates = encoding_candidates

    with open(path, "rb") as bf:
        raw = bf.read(sample_size)
        # If anything follows the sample, its last character may be cut in half.
        truncated = bool(bf.read(1))

    last_error: Exception | None = None
    for enc in candidates:
        decoder = codecs.getincrementaldecoder(enc)(errors="strict")
        try:
            # final=False lets the decoder buffer an incomplete trailing
            # sequence instead of raising; a complete file is still checked
            # in full with final=True.
            decoder.decode(raw, final=not truncated)
            return enc
        except UnicodeDecodeError as e:
            last_error = e
    if last_error is not None:
        raise last_error
    return "utf-8"


def _detect_delimiter(
    path: Path,
    encoding: str,
    candidates: "list[str] | None" = None,
    sample_size: int = 1024 * 1024,
) -> str:
    """Guess the field delimiter of ``path`` from a leading text sample.

    Args:
        path: CSV-like file to inspect.
        encoding: Text encoding used to read the sample (decoded strictly).
        candidates: Single-character delimiters the sniffer may choose from.
            Defaults to the module-level ``delimiter_candidates``.
        sample_size: Number of leading characters to examine.

    Returns:
        The delimiter chosen by ``csv.Sniffer``, or ``","`` when the sniffer
        cannot decide.
    """
    if candidates is None:
        candidates = delimiter_candidates

    with open(path, "r", encoding=encoding, errors="strict", newline="") as f:
        sample = f.read(sample_size)

    # A full-size sample most likely ends mid-line; drop that partial line so
    # it cannot distort the sniffer's per-line frequency statistics.
    if 0 < sample_size <= len(sample):
        cut = sample.rfind("\n")
        if cut != -1:
            sample = sample[: cut + 1]

    try:
        # Sniffer.sniff documents ``delimiters`` as a string of candidate
        # characters, so the candidate list is joined before being passed.
        return csv.Sniffer().sniff(sample, delimiters="".join(candidates)).delimiter
    except csv.Error:
        return ","

# Work out how the file is encoded and delimited before parsing it.
enc = _detect_encoding(csv_path)
sep = _detect_delimiter(csv_path, enc)

# Read only the first max_rows rows, keeping every column as text.
read_kwargs = {
    "filepath_or_buffer": csv_path,
    "nrows": max_rows,
    "sep": sep,
    "encoding": enc,
    "dtype": str,
}

try:
    df = pd.read_csv(**read_kwargs)
except pd.errors.ParserError:
    # Retry with the python engine when the first parse attempt fails.
    df = pd.read_csv(engine="python", **read_kwargs)

df


Unnamed: 0,申请号,申请年份,专利名称,BS,FS,Quality_q,申请人,申请人类型,申请人地址,申请人城市
0,CN85100468.7,1985,四氟乙烯、乙烯三元共聚物的生产方法,0.0,0.0,0.0,中国科学院上海有机化学研究所,科研单位,上海市零陵路345号,上海市
1,CN85100490.3,1985,氟塑料合金、制造及应用,0.0,0.0,0.0,中国科学院上海有机化学研究所,科研单位,上海市零陵路345号,上海市
2,CN85102316.9,1985,冲吸漂染辊筒,0.0,0.0,0.0,林良忠; 林理象,个人,上海市打蒲路60弄5号,上海市
3,CN85102493.9,1985,双曲肋式波纹管及其制造方法,0.0,0.0,0.0,北京市丰源机械研究所,科研单位,北京市9200信箱11分箱,北京市
4,CN85100840.2,1985,旋转冲击凿岩机的冲击装置,0.0,0.0,0.0,张启风,个人,北京市朝阳区定福庄水电二局修造厂,北京市
5,CN85100912.3,1985,微波经络穴位治疗仪,0.0,0.0,0.0,华北光电技术研究所,科研单位,北京8511信箱,北京市
6,CN85100673.6,1985,牙科用高铜白银合金粉制造方法,0.0,0.0,0.0,冶金工业部钢铁研究总院,科研单位,北京市海淀区学院南路24号,北京市
7,CN85100037.1,1985,焊接电源外特性控制法，弧焊机及其电路,0.0,0.0,0.0,清华大学,学校,北京市海淀区清华园,北京市
8,CN85100285.4,1985,高分辨率汉字字形发生器,0.0,0.0,0.0,北京大学; 潍坊电子计算机公司,"企业,学校",北京市海淀区中关村,北京市
9,CN85100820.8,1985,搅刀——拨轮式排肥、排种器,0.0,0.0,0.0,北京农业机械化学院,学校,北京市海淀区清华东路,北京市


In [3]:
from pathlib import Path

import csv
import pandas as pd

# Input CSV, relative to the notebook's working directory.
csv_path = Path(r"../../artifacts_full/patent_quality_output.csv")

# Encodings tried in order by _detect_encoding. "utf-8-sig" comes first so a
# leading BOM is stripped; it also decodes BOM-less UTF-8. "gb18030" is the
# fallback when the bytes are not valid UTF-8.
encoding_candidates = ["utf-8-sig", "utf-8", "gb18030"]
# Delimiters that _detect_delimiter allows csv.Sniffer to choose from.
delimiter_candidates = [",", "\t", ";", "|"]


def _detect_encoding(
    path: Path,
    candidates: "list[str] | None" = None,
    sample_size: int = 256 * 1024,
) -> str:
    """Return the first candidate encoding that strictly decodes the head of ``path``.

    Only the first ``sample_size`` bytes are inspected. Because a fixed-size
    byte sample can end in the middle of a multi-byte character, the sample is
    decoded incrementally and an incomplete trailing sequence is tolerated
    whenever more bytes follow the sample. Without this, valid UTF-8 text could
    be rejected and misdetected as a later candidate.

    Args:
        path: File to inspect.
        candidates: Encodings to try, in priority order. Defaults to the
            module-level ``encoding_candidates``.
        sample_size: Number of leading bytes to examine.

    Returns:
        The name of the first encoding that decodes the sample without error,
        or ``"utf-8"`` when ``candidates`` is empty.

    Raises:
        UnicodeDecodeError: The error from the last candidate when none fits.
    """
    import codecs  # local import so this definition is self-contained

    if candidates is None:
        candidates = encoding_candidates

    with open(path, "rb") as bf:
        raw = bf.read(sample_size)
        # If anything follows the sample, its last character may be cut in half.
        truncated = bool(bf.read(1))

    last_error: Exception | None = None
    for enc in candidates:
        decoder = codecs.getincrementaldecoder(enc)(errors="strict")
        try:
            # final=False lets the decoder buffer an incomplete trailing
            # sequence instead of raising; a complete file is still checked
            # in full with final=True.
            decoder.decode(raw, final=not truncated)
            return enc
        except UnicodeDecodeError as e:
            last_error = e
    if last_error is not None:
        raise last_error
    return "utf-8"


def _detect_delimiter(
    path: Path,
    encoding: str,
    candidates: "list[str] | None" = None,
    sample_size: int = 1024 * 1024,
) -> str:
    """Guess the field delimiter of ``path`` from a leading text sample.

    Args:
        path: CSV-like file to inspect.
        encoding: Text encoding used to read the sample (decoded strictly).
        candidates: Single-character delimiters the sniffer may choose from.
            Defaults to the module-level ``delimiter_candidates``.
        sample_size: Number of leading characters to examine.

    Returns:
        The delimiter chosen by ``csv.Sniffer``, or ``","`` when the sniffer
        cannot decide.
    """
    if candidates is None:
        candidates = delimiter_candidates

    with open(path, "r", encoding=encoding, errors="strict", newline="") as f:
        sample = f.read(sample_size)

    # A full-size sample most likely ends mid-line; drop that partial line so
    # it cannot distort the sniffer's per-line frequency statistics.
    if 0 < sample_size <= len(sample):
        cut = sample.rfind("\n")
        if cut != -1:
            sample = sample[: cut + 1]

    try:
        # Sniffer.sniff documents ``delimiters`` as a string of candidate
        # characters, so the candidate list is joined before being passed.
        return csv.Sniffer().sniff(sample, delimiters="".join(candidates)).delimiter
    except csv.Error:
        return ","


def _has_header(path: Path, encoding: str) -> bool:
    """Report whether the file appears to begin with a header row.

    Reads up to 1 MiB of text from ``path`` using ``encoding`` (strict
    decoding) and applies the ``csv.Sniffer.has_header`` heuristic to it.

    Returns:
        The sniffer's verdict, or ``True`` when the sniffer raises
        ``csv.Error`` (a header is assumed if it cannot analyse the sample).
    """
    sample_chars = 1024 * 1024
    with open(path, "r", encoding=encoding, errors="strict", newline="") as handle:
        head = handle.read(sample_chars)

    sniffer = csv.Sniffer()
    try:
        verdict = sniffer.has_header(head)
    except csv.Error:
        # Undecidable sample: default to treating the first line as a header.
        verdict = True
    return verdict


def count_lines_fast(path: Path, chunk_size: int = 64 * 1024 * 1024) -> int:
    """Count the physical lines in ``path`` by scanning its raw bytes.

    The file is read in binary chunks and ``b"\\n"`` bytes are counted. A final
    line without a trailing newline still counts as a line, including the case
    where the file holds a single unterminated line. An empty file has 0 lines.

    Note that this counts physical lines, not CSV records: a quoted field that
    contains a newline spans several physical lines.

    Args:
        path: File to scan.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The number of physical lines in the file.
    """
    total_newlines = 0
    last_byte = b""
    with open(path, "rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            total_newlines += chunk.count(b"\n")
            last_byte = chunk[-1:]

    if not last_byte:
        # Nothing was read: the file is empty.
        return 0
    if last_byte != b"\n":
        # The last line is unterminated, so no newline byte accounts for it.
        return total_newlines + 1
    return total_newlines


def count_rows_pandas_chunks(
    path: Path,
    sep: str,
    encoding: str,
    chunksize: int = 200_000,
) -> int:
    """Count the data rows of a delimited file as parsed by pandas.

    The file is read in chunks of ``chunksize`` rows with every column loaded
    as ``str``, and the chunk lengths are summed, so the whole table is never
    held in memory at once.

    Args:
        path: File to parse.
        sep: Field delimiter passed to ``pd.read_csv``.
        encoding: Text encoding passed to ``pd.read_csv``.
        chunksize: Number of rows per chunk.

    Returns:
        Total number of rows across all chunks.
    """
    reader = pd.read_csv(
        path,
        sep=sep,
        encoding=encoding,
        chunksize=chunksize,
        dtype=str,
    )
    return sum(len(part) for part in reader)


# Detect the file's format: encoding, delimiter, and whether it has a header row.
enc = _detect_encoding(csv_path)
sep = _detect_delimiter(csv_path, enc)
header = _has_header(csv_path, enc)

physical_lines = count_lines_fast(csv_path)
# A header takes up one physical line; clamp at zero so the estimate is never negative.
data_rows_estimate = max(physical_lines - int(header), 0)

# One-row summary table of what was detected.
result = pd.DataFrame(
    [
        dict(
            file=str(csv_path),
            encoding=enc,
            delimiter=sep,
            has_header=header,
            physical_lines=physical_lines,
            data_rows_estimate=data_rows_estimate,
        )
    ]
)

result


Unnamed: 0,file,encoding,delimiter,has_header,physical_lines,data_rows_estimate
0,..\..\artifacts_full\patent_quality_output.csv,utf-8-sig,",",True,5092684,5092683
