In [12]:
import io
import os
import re
import tokenize


In [13]:
def _strip_comments(source):
    """Remove ``#`` comments from Python source without touching string literals.

    The source is tokenized so that a ``#`` inside a string (for example
    ``"a#b"`` or a docstring) is left alone. Whitespace that preceded a
    comment is kept. If the text cannot be tokenized (for example because an
    earlier substitution left it syntactically invalid), fall back to the
    plain ``#.*`` regular expression.
    """
    # The tokenizer reads lines split on "\n" only, so split the same way to
    # keep its row numbers aligned with our list indices.
    lines = source.split("\n")
    try:
        for token in tokenize.generate_tokens(io.StringIO(source).readline):
            if token.type != tokenize.COMMENT:
                continue
            row, col = token.start
            # A comment always runs to the end of its line.
            lines[row - 1] = lines[row - 1][:col]
    except (tokenize.TokenError, SyntaxError):
        return re.sub(r"#.*", "", source)
    return "\n".join(lines)


def process_py_file(file_path):
    """Read a Python file and return its code stripped down for concatenation.

    The following are removed, in this order: top-level ``import`` /
    ``from ... import`` lines, top-level ``sys.path.append`` lines, comments,
    triple-quoted strings, everything from the ``if __name__ == "__main__":``
    guard to the end of the file, and blank lines.

    Args:
        file_path: Path of the UTF-8 encoded Python file to read.

    Returns:
        The remaining source text with surrounding whitespace stripped.
    """
    with open(file_path, "r", encoding="utf8") as f:
        file_content = f.read()

    # Remove import statements. The word boundary keeps identifiers that merely
    # start with "import" (e.g. ``import_lines = ...``) from being deleted.
    file_content = re.sub(r"^import\b.*$|^from .* import\b.*$", "", file_content, flags=re.MULTILINE)

    # Dots are escaped so only a literal ``sys.path.append`` matches.
    file_content = re.sub(r"^sys\.path\.append.*\n", "", file_content, flags=re.MULTILINE)

    # Remove comments (string literals containing "#" are preserved)
    file_content = _strip_comments(file_content)

    # Remove function descriptions and docstrings (any triple-quoted string)
    triple_double_quoted = r'(\"\"\")(.*?)(\"\"\")'
    triple_single_quoted = r'(\'\'\')(.*?)(\'\'\')'
    any_triple_quoted = f"{triple_double_quoted}|{triple_single_quoted}"

    file_content = re.sub(any_triple_quoted, '', file_content, flags=re.DOTALL)

    # Remove the main guard and everything after it; either quote style is accepted.
    file_content = re.sub(r"""if\s+__name__\s*==\s*["']__main__["']\s*:[\s\S]*""", "", file_content)

    # Remove any empty lines and extra whitespaces
    file_content = re.sub(r"^\s*\n", "", file_content, flags=re.MULTILINE)
    file_content = re.sub(r"\n\s*\n", "\n", file_content)
    file_content = file_content.strip()

    return file_content


In [14]:
def get_import_lines(file_path):
    """Return the top-level import statements found in a Python file.

    Only lines that start at column 0 with ``import`` or ``from ... import``
    are reported; indented imports are ignored.

    Args:
        file_path: Path of the UTF-8 encoded Python file to scan.

    Returns:
        A list with the text of each matching line, in file order.
    """
    with open(file_path, "r", encoding="utf8") as f:
        file_content = f.read()

    # The word boundary after "import" avoids reporting ordinary statements
    # whose first identifier merely starts with it (e.g. ``import_x = 1``).
    import_lines = re.findall(r"^import\b.*$|^from .* import\b.*$", file_content, re.MULTILINE)

    return import_lines



In [15]:
# Preview: show what the stripped-down Bernoulli module looks like.
input_file_path = "../../discrete/distributions/bernoulli.py"
processed_content = process_py_file(file_path=input_file_path)
print(processed_content, end="\n")

class BERNOULLI:
    def __init__(self, measurements):
        self.parameters = self.get_parameters(measurements)
        self.p = self.parameters["p"]
    def cdf(self, x: float) -> float:
        if ( x < 0 ):
            result = 0
        elif( x >= 0 and x < 1 ):
            result = 1 - self.p
        else:
            result = 1
        return result
    def pmf(self, x: int) -> float:
        result = (self.p ** x) * (1 - self.p) ** (1 - x)
        return result
    def get_num_parameters(self) -> int:
        return len(self.parameters)
    def parameter_restrictions(self) -> bool:
        v1 = self.p > 0 and self.p < 1
        return v1
    def get_parameters(self, measurements) -> dict[str, float | int]:
        p = measurements.mean
        parameters = {"p": p}
        return parameters


In [16]:
# Collect the top-level import statements of the Bernoulli module.
input_file_path = "../../discrete/distributions/bernoulli.py"
import_lines = get_import_lines(file_path=input_file_path)

In [17]:
# Gather the top-level import statements of every distribution module and of
# the measurements module, so they can be emitted once at the top of the output.
# NOTE(review): the two test_*_discrete.py files appended in a later cell are
# not scanned here; confirm their imports are covered by these modules.
DISTRIBUTIONS_DIR = "../../discrete/distributions"

IMPORTS = []
for file in sorted(os.listdir(DISTRIBUTIONS_DIR)):
    # endswith() selects real modules only; a substring test would also pick
    # up names such as "x.pyc" or "x.py.bak".
    if file.endswith(".py"):
        import_lines = get_import_lines(f"{DISTRIBUTIONS_DIR}/{file}")
        IMPORTS.extend(import_lines)

input_file_path = "../../discrete/measurements/measurements_discrete.py"
import_lines = get_import_lines(input_file_path)
IMPORTS.extend(import_lines)

In [18]:
# Start the generated source with the de-duplicated, sorted import lines, then
# append the stripped code of every distribution module.
DISTRIBUTIONS_DIR = "../../discrete/distributions"

CODE = "\n".join(sorted(set(IMPORTS))) + "\n\n"
# os.listdir() returns entries in arbitrary order; sorting makes the generated
# file identical from run to run and across platforms.
for file in sorted(os.listdir(DISTRIBUTIONS_DIR)):
    # endswith() selects real modules only; a substring test would also pick
    # up names such as "x.pyc" or "x.py.bak".
    if file.endswith(".py"):
        processed_content = process_py_file(f"{DISTRIBUTIONS_DIR}/{file}")
        CODE += processed_content + "\n\n"

In [19]:
# Append, in order: the measurements class, the chi-square test and the
# Kolmogorov-Smirnov test. Each section is followed by one blank line.
input_file_path = "../../discrete/measurements/measurements_discrete.py"
measuerements_code = process_py_file(input_file_path)
CODE = f"{CODE}{measuerements_code}\n\n"

input_file_path = "../../discrete/test_chi_square_discrete.py"
test_chi_square_discrete_code = process_py_file(input_file_path)
CODE = f"{CODE}{test_chi_square_discrete_code}\n\n"

input_file_path = "../../discrete/test_kolmogorov_smirnov_discrete.py"
test_kolmogorov_smirnov_discrete_code = process_py_file(input_file_path)
CODE = f"{CODE}{test_kolmogorov_smirnov_discrete_code}\n\n"

In [20]:
# Source text appended verbatim to the generated file: the `phitter_discrete`
# entry point plus a small `__main__` demo. It relies on names provided by the
# code concatenated before it (distribution classes, MEASUREMENTS_DISCRETE, the
# two test functions, `numpy` and `math`).
if_name = """
def phitter_discrete(data):
    _all_distributions = [BERNOULLI, BINOMIAL, GEOMETRIC, HYPERGEOMETRIC, LOGARITHMIC, NEGATIVE_BINOMIAL, POISSON, UNIFORM]
    measurements = MEASUREMENTS_DISCRETE(data)

    NONE_RESULTS = {
        "test_statistic": None,
        "critical_value": None,
        "p_value": None,
        "rejected": None,
    }

    RESPONSE = {}
    for distribution_class in _all_distributions:
        distribution_name = distribution_class.__name__.lower()

        validate_estimation = True
        sse = 0
        try:
            distribution = distribution_class(measurements)
            pmf_values = [distribution.pmf(d) for d in measurements.domain]
            sse = numpy.sum(numpy.power(numpy.array(pmf_values) - numpy.array(measurements.frequencies_pmf), 2.0))
        except Exception:
            validate_estimation = False

        DISTRIBUTION_RESULTS = {}
        v1, v2 = False, False
        if validate_estimation and not math.isnan(sse) and not math.isinf(sse):
            try:
                chi2_test = test_chi_square_discrete(data, distribution, measurements)
                if numpy.isnan(chi2_test["test_statistic"]) == False and math.isinf(chi2_test["test_statistic"]) == False and chi2_test["test_statistic"] > 0:
                    DISTRIBUTION_RESULTS["chi_square"] = {
                        "test_statistic": chi2_test["test_statistic"],
                        "critical_value": chi2_test["critical_value"],
                        "p_value": chi2_test["p-value"],
                        "rejected": chi2_test["rejected"],
                    }
                    v1 = True
                else:
                    DISTRIBUTION_RESULTS["chi_square"] = NONE_RESULTS
            except Exception:
                DISTRIBUTION_RESULTS["chi_square"] = NONE_RESULTS

            try:
                ks_test = test_kolmogorov_smirnov_discrete(data, distribution, measurements)
                if numpy.isnan(ks_test["test_statistic"]) == False and math.isinf(ks_test["test_statistic"]) == False and ks_test["test_statistic"] > 0:
                    DISTRIBUTION_RESULTS["kolmogorov_smirnov"] = {
                        "test_statistic": ks_test["test_statistic"],
                        "critical_value": ks_test["critical_value"],
                        "p_value": ks_test["p-value"],
                        "rejected": ks_test["rejected"],
                    }
                    v2 = True
                else:
                    DISTRIBUTION_RESULTS["kolmogorov_smirnov"] = NONE_RESULTS
            except Exception:
                DISTRIBUTION_RESULTS["kolmogorov_smirnov"] = NONE_RESULTS

            if v1 or v2:
                DISTRIBUTION_RESULTS["sse"] = sse
                DISTRIBUTION_RESULTS["parameters"] = str(distribution.parameters)
                DISTRIBUTION_RESULTS["n_test_passed"] = int(DISTRIBUTION_RESULTS["chi_square"]["rejected"] == False) + int(DISTRIBUTION_RESULTS["kolmogorov_smirnov"]["rejected"] == False)
                DISTRIBUTION_RESULTS["n_test_null"] = int(DISTRIBUTION_RESULTS["chi_square"]["rejected"] == None) + int(DISTRIBUTION_RESULTS["kolmogorov_smirnov"]["rejected"] == None)

                RESPONSE[distribution_name] = DISTRIBUTION_RESULTS

    sorted_results_sse = {distribution: results for distribution, results in sorted(RESPONSE.items(), key=lambda x: x[1]["sse"])}
    aproved_results = {distribution: results for distribution, results in sorted_results_sse.items() if results["n_test_passed"] > 0}

    return sorted_results_sse, aproved_results


if __name__ == "__main__":
    path = "../../discrete/data/data_binomial.txt"
    with open(path, "r") as sample_distribution_file:
        data = [float(x.replace(",", ".")) for x in sample_distribution_file.read().splitlines()]

    sorted_results_sse, aproved_results = phitter_discrete(data)

    for distribution, results in aproved_results.items():
        print(f"Distribution: {distribution}, SSE: {results['sse']}, Aprobados: {results['n_test_passed']}")
"""


In [21]:
# Finish the generated source with the entry-point template.
CODE = CODE + if_name

In [22]:
# Write the assembled source to disk. The context manager closes the file even
# if the write fails; plain "w" is enough because the file is never read back.
with open("./code_discrete.py", "w", encoding="utf8") as code_file:
    code_file.write(CODE)