In [1]:
import io
import os
import re
import tokenize


In [2]:
def _strip_comments(source):
    """Remove ``#`` comments from Python source without touching string literals.

    The source is tokenized so that a ``#`` inside a string (for example
    ``"#fff"`` or a docstring mentioning ``# of items``) is preserved. When the
    text cannot be tokenized, the function falls back to the plain
    ``#.*`` regular expression.

    Args:
        source: Python source code as a single string.

    Returns:
        str: The source with every comment token removed; whitespace that
        preceded a comment is left in place.
    """

    def _regex_fallback():
        return re.sub(r"#.*", "", source)

    try:
        # Materialise all tokens first so a late tokenizer error cannot leave
        # the text half processed.
        tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
    except (tokenize.TokenError, SyntaxError):
        return _regex_fallback()

    # StringIO.readline splits on "\n" only, so split the same way to keep
    # token row numbers aligned with the list indices.
    lines = source.split("\n")
    for token in tokens:
        if token.type != tokenize.COMMENT:
            continue
        row, col = token.start
        line = lines[row - 1]
        end = col + len(token.string)
        if line[col:end] != token.string:
            # Reported offsets do not line up with the text; do not guess.
            return _regex_fallback()
        lines[row - 1] = line[:col] + line[end:]
    return "\n".join(lines)


def process_py_file(file_path):
    """Return the source of a Python file reduced to its definitions.

    The file is read as UTF-8 and the following are removed, in this order:
    top-level ``import`` / ``from ... import`` lines, top-level
    ``sys.path.append`` lines, comments, every triple-quoted string, the
    ``if __name__ == "__main__":`` guard together with everything after it,
    and blank lines.

    Args:
        file_path: Path of the source file to process.

    Returns:
        str: The cleaned source, stripped of leading and trailing whitespace.
    """
    with open(file_path, "r", encoding="utf8") as f:
        file_content = f.read()

    # Remove import statements. Whitespace is required after the keyword so a
    # line such as `importance = 3` is not mistaken for an import.
    file_content = re.sub(
        r"^import[ \t].*$|^from[ \t].*[ \t]import\b.*$",
        "",
        file_content,
        flags=re.MULTILINE,
    )

    # Remove sys.path manipulation (dots escaped so they only match a literal ".")
    file_content = re.sub(r"^sys\.path\.append.*\n", "", file_content, flags=re.MULTILINE)

    # Remove comments; must happen before the docstring pass so that a "#"
    # inside a docstring cannot cut off its closing quotes.
    file_content = _strip_comments(file_content)

    # Remove function descriptions and docstrings.
    # NOTE: the pattern matches every triple-quoted string, not only docstrings.
    triple_double_quoted = r'(\"\"\")(.*?)(\"\"\")'
    triple_single_quoted = r'(\'\'\')(.*?)(\'\'\')'
    any_triple_quoted = f"{triple_double_quoted}|{triple_single_quoted}"

    file_content = re.sub(any_triple_quoted, '', file_content, flags=re.DOTALL)

    # Remove the `if __name__ == "__main__":` block and everything after it,
    # accepting either quote style and flexible spacing.
    file_content = re.sub(r"if\s+__name__\s*==\s*(['\"])__main__\1\s*:[\s\S]*", "", file_content)

    # Remove any empty lines and surrounding whitespace
    file_content = re.sub(r"^\s*\n", "", file_content, flags=re.MULTILINE)
    file_content = re.sub(r"\n\s*\n", "\n", file_content)
    file_content = file_content.strip()

    return file_content



In [3]:
def get_import_lines(file_path):
    """Return the top-level import statements found in a Python source file.

    Args:
        file_path: Path of the UTF-8 encoded source file to scan.

    Returns:
        list[str]: Every line that starts at column 0 with ``import ...`` or
        ``from ... import ...``, in file order. Matching is line based, so for
        a parenthesised multi-line import only its first line is returned.
    """
    with open(file_path, "r", encoding="utf8") as f:
        file_content = f.read()

    # Whitespace is required after the keyword so that ordinary statements
    # whose name merely starts with "import" (e.g. `importance = 3`) are not
    # reported as imports.
    import_pattern = r"^import[ \t].*$|^from[ \t].*[ \t]import\b.*$"
    import_lines = re.findall(import_pattern, file_content, flags=re.MULTILINE)

    return import_lines


In [4]:
# Preview: clean a single distribution module and print the resulting source.
input_file_path = "../../continuous/distributions/fatigue_life.py"
processed_content = process_py_file(file_path=input_file_path)
print(processed_content)

class FATIGUE_LIFE:
    def __init__(self, measurements):
        self.parameters = self.get_parameters(measurements)
        self.gamma = self.parameters["gamma"]
        self.loc = self.parameters["loc"]
        self.scale = self.parameters["scale"]
    def cdf(self, x: float) -> float:
        z = lambda t: math.sqrt((t - self.loc) / self.scale)
        result = scipy.stats.norm.cdf((z(x) - 1 / z(x)) / (self.gamma))
        return result
    def pdf(self, x: float) -> float:
        z = lambda t: math.sqrt((t - self.loc) / self.scale)
        result = (z(x) + 1 / z(x)) / (2 * self.gamma * (x - self.loc)) * scipy.stats.norm.pdf((z(x) - 1 / z(x)) / (self.gamma))
        return result
    def get_num_parameters(self) -> int:
        return len(self.parameters)
    def parameter_restrictions(self) -> bool:
        v1 = self.scale > 0
        v2 = self.gamma > 0
        return v1 and v2
    def get_parameters(self, measurements) -> dict[str, float | int]:
        scipy_params = scipy.sta

In [5]:
# Extract the top-level import lines of the fatigue_life distribution module.
input_file_path = "../../continuous/distributions/fatigue_life.py"
import_lines = get_import_lines(file_path=input_file_path)

In [6]:
# Gather the import lines of every distribution module plus the measurements
# module. IMPORTS may contain duplicates at this point.
IMPORTS = []
# sorted() makes the traversal independent of the filesystem's listing order.
for file in sorted(os.listdir("../../continuous/distributions")):
    # endswith() skips names that merely contain ".py" (".pyc", ".py.bak", ...),
    # which the substring test would have tried to read as UTF-8 source.
    if file.endswith(".py"):
        import_lines = get_import_lines(f"../../continuous/distributions/{file}")
        IMPORTS.extend(import_lines)

input_file_path = "../../continuous/measurements/measurements_continuous.py"
import_lines = get_import_lines(input_file_path)
IMPORTS.extend(import_lines)

In [7]:
# Start the generated module with the de-duplicated, alphabetically sorted
# import lines, then append the cleaned source of each distribution module.
CODE = "\n".join(sorted(set(IMPORTS))) + "\n\n"
# sorted() keeps the generated file identical across runs and platforms;
# os.listdir() alone returns entries in arbitrary order.
for file in sorted(os.listdir("../../continuous/distributions")):
    # Only real Python sources: a substring test would also pick up ".pyc" etc.
    if file.endswith(".py"):
        processed_content = process_py_file(f"../../continuous/distributions/{file}")
        CODE += processed_content + "\n\n"

In [8]:
# Append the cleaned source of the measurements module, the goodness-of-fit
# tests and the Anderson-Darling helper to CODE.
# NOTE: this cell extends CODE in place, so re-running it without re-running
# the cell that initialises CODE appends these modules a second time.
input_file_path = "../../continuous/measurements/measurements_continuous.py"
measuerements_code = process_py_file(input_file_path)
CODE += measuerements_code + "\n\n"

input_file_path = "../../continuous/test_chi_square_continuous.py"
test_chi_square_continuous_code = process_py_file(input_file_path)
CODE += test_chi_square_continuous_code + "\n\n"

input_file_path = "../../continuous/test_kolmogorov_smirnov_continuous.py"
test_kolmogorov_smirnov_continuous_code = process_py_file(input_file_path)
CODE += test_kolmogorov_smirnov_continuous_code + "\n\n"

input_file_path = "../../utilities/ad_marsaglia.py"
anderson_darling_code = process_py_file(input_file_path)
CODE += anderson_darling_code + "\n\n"

input_file_path = "../../continuous/test_anderson_darling_continuous.py"
test_anderson_darling_continuous_code = process_py_file(input_file_path)
# Drop the `ad.` qualifier (presumably the alias under which ad_marsaglia is
# imported in that module - confirm) because everything lives in one file now.
# The word boundary keeps identifiers that merely end in "ad" (`thread.`,
# `load.`, ...) intact, which a plain str.replace("ad.", "") would mangle.
test_anderson_darling_continuous_code = re.sub(r"\bad\.", "", test_anderson_darling_continuous_code)
CODE += test_anderson_darling_continuous_code + "\n\n"

In [9]:
# Source of the driver appended at the end of the generated module: the
# phitter_continuous() entry point plus a small command-line demo.
# The three goodness-of-fit tests are run through a single loop so that each
# result (or its placeholder) is always stored under the test's own key.
if_name = """
def phitter_continuous(data, num_bins=None, confidence_level=0.95):
    '''Fit every continuous distribution to data and test the goodness of fit.

    Returns (sorted_results_sse, aproved_results): every fitted distribution
    ordered by ascending SSE, and the subset that passed at least one test.
    '''
    _all_distributions = [
        ALPHA,
        ARCSINE,
        ARGUS,
        BETA,
        BETA_PRIME,
        BETA_PRIME_4P,
        BRADFORD,
        BURR,
        BURR_4P,
        CAUCHY,
        CHI_SQUARE,
        CHI_SQUARE_3P,
        DAGUM,
        DAGUM_4P,
        ERLANG,
        ERLANG_3P,
        ERROR_FUNCTION,
        EXPONENTIAL,
        EXPONENTIAL_2P,
        F,
        FATIGUE_LIFE,
        FOLDED_NORMAL,
        FRECHET,
        F_4P,
        GAMMA,
        GAMMA_3P,
        GENERALIZED_EXTREME_VALUE,
        GENERALIZED_GAMMA,
        GENERALIZED_GAMMA_4P,
        GENERALIZED_LOGISTIC,
        GENERALIZED_NORMAL,
        GENERALIZED_PARETO,
        GIBRAT,
        GUMBEL_LEFT,
        GUMBEL_RIGHT,
        HALF_NORMAL,
        HYPERBOLIC_SECANT,
        INVERSE_GAMMA,
        INVERSE_GAMMA_3P,
        INVERSE_GAUSSIAN,
        INVERSE_GAUSSIAN_3P,
        JOHNSON_SB,
        JOHNSON_SU,
        KUMARASWAMY,
        LAPLACE,
        LEVY,
        LOGGAMMA,
        LOGISTIC,
        LOGLOGISTIC,
        LOGLOGISTIC_3P,
        LOGNORMAL,
        MAXWELL,
        MOYAL,
        NAKAGAMI,
        NON_CENTRAL_CHI_SQUARE,
        NON_CENTRAL_F,
        NON_CENTRAL_T_STUDENT,
        NORMAL,
        PARETO_FIRST_KIND,
        PARETO_SECOND_KIND,
        PERT,
        POWER_FUNCTION,
        RAYLEIGH,
        RECIPROCAL,
        RICE,
        SEMICIRCULAR,
        TRAPEZOIDAL,
        TRIANGULAR,
        T_STUDENT,
        T_STUDENT_3P,
        UNIFORM,
        WEIBULL,
        WEIBULL_3P,
    ]
    # Test functions keyed by the name under which their results are stored.
    goodness_of_fit_tests = {
        "chi_square": test_chi_square_continuous,
        "kolmogorov_smirnov": test_kolmogorov_smirnov_continuous,
        "anderson_darling": test_anderson_darling_continuous,
    }
    measurements = MEASUREMENTS_CONTINUOUS(data, num_bins)

    ## Calculate Histogram
    num_bins = measurements.num_bins
    frequencies, bin_edges = numpy.histogram(data, num_bins, density=True)
    central_values = [(bin_edges[i] + bin_edges[i + 1]) / 2 for i in range(len(bin_edges) - 1)]

    # Placeholder stored for a test that failed or produced an unusable statistic.
    NONE_RESULTS = {
        "test_statistic": None,
        "critical_value": None,
        "p_value": None,
        "rejected": None,
    }

    RESPONSE = {}
    for distribution_class in _all_distributions:
        distribution_name = distribution_class.__name__.lower()

        validate_estimation = True
        sse = 0
        try:
            distribution = distribution_class(measurements)
            pdf_values = [distribution.pdf(c) for c in central_values]
            sse = numpy.sum(numpy.power(frequencies - pdf_values, 2.0))
        except Exception:
            # Best effort: a distribution that cannot be fitted is skipped.
            validate_estimation = False

        DISTRIBUTION_RESULTS = {}
        any_valid_test = False
        if validate_estimation and not math.isnan(sse) and not math.isinf(sse):
            for test_name, test_function in goodness_of_fit_tests.items():
                try:
                    test_result = test_function(data, distribution, measurements, confidence_level=confidence_level)
                    test_statistic = test_result["test_statistic"]
                    if not numpy.isnan(test_statistic) and not math.isinf(test_statistic) and test_statistic > 0:
                        DISTRIBUTION_RESULTS[test_name] = {
                            "test_statistic": test_statistic,
                            "critical_value": test_result["critical_value"],
                            "p_value": test_result["p-value"],
                            "rejected": test_result["rejected"],
                        }
                        any_valid_test = True
                    else:
                        DISTRIBUTION_RESULTS[test_name] = NONE_RESULTS
                except Exception:
                    DISTRIBUTION_RESULTS[test_name] = NONE_RESULTS

            if any_valid_test:
                DISTRIBUTION_RESULTS["sse"] = sse
                DISTRIBUTION_RESULTS["parameters"] = str(distribution.parameters)
                DISTRIBUTION_RESULTS["n_test_passed"] = (
                    int(DISTRIBUTION_RESULTS["chi_square"]["rejected"] == False)
                    + int(DISTRIBUTION_RESULTS["kolmogorov_smirnov"]["rejected"] == False)
                    + int(DISTRIBUTION_RESULTS["anderson_darling"]["rejected"] == False)
                )
                DISTRIBUTION_RESULTS["n_test_null"] = (
                    int(DISTRIBUTION_RESULTS["chi_square"]["rejected"] == None)
                    + int(DISTRIBUTION_RESULTS["kolmogorov_smirnov"]["rejected"] == None)
                    + int(DISTRIBUTION_RESULTS["anderson_darling"]["rejected"] == None)
                )

                RESPONSE[distribution_name] = DISTRIBUTION_RESULTS

    sorted_results_sse = {distribution: results for distribution, results in sorted(RESPONSE.items(), key=lambda x: x[1]["sse"])}
    aproved_results = {distribution: results for distribution, results in sorted_results_sse.items() if results["n_test_passed"] > 0}

    return sorted_results_sse, aproved_results


if __name__ == "__main__":
    path = "../../continuous/data/data_beta.txt"
    with open(path, "r") as sample_distribution_file:
        data = [float(x.replace(",", ".")) for x in sample_distribution_file.read().splitlines()]

    sorted_results_sse, aproved_results = phitter_continuous(data, 20, confidence_level=0.99)

    for distribution, results in aproved_results.items():
        print(f"Distribution: {distribution}, SSE: {results['sse']}, Aprobados: {results['n_test_passed']}")
"""


In [10]:
# Append the driver exactly once: re-running this cell must not leave a second
# copy of phitter_continuous() at the end of the generated module.
if not CODE.endswith(if_name):
    CODE += if_name

In [11]:
# Write the assembled module next to this notebook. The context manager closes
# the file even if the write fails; the file is only written, so plain "w"
# replaces the read/write "+w" mode.
with open("./code_continuous.py", "w", encoding="utf8") as code_file:
    code_file.write(CODE)