In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import pearsonr


In [2]:
srcfld = Path("C:/git/py/tlak")
dstfld = Path("C:/git/py/tlak")

srcfile = "tlak.xlsx"
dstfile = "tlak_out.xlsx"

srcpath = srcfld / srcfile
dstpath = dstfld / dstfile

In [3]:
xl = pd.ExcelFile(srcpath)
df = xl.parse('morning', usecols = [0, 1, ])
xl.close()
original_data = df[df.columns[0:2]].to_numpy()


In [4]:
# Scale and shift original data to the range of 0 to 1
min_value = np.min(original_data, axis=0)
max_value = np.max(original_data, axis=0)
range_value = max_value - min_value
scaled_data = (original_data - min_value) / range_value

In [5]:
# Calculate correlation coefficient
corr_coefficient, _ = pearsonr(scaled_data[:, 0], scaled_data[:, 1])

# Fit a Gaussian copula to the data
copula_data = pd.DataFrame(scaled_data, columns=['x1', 'x2'])



In [6]:
# Transform the data into uniform space using rank transformation
copula_data['u1'] = norm.cdf(copula_data['x1'])
copula_data['u2'] = norm.cdf(copula_data['x2'])

# Generate new samples in uniform space
n_new_samples = len(original_data)
u1_new = np.random.uniform(0, 1, size=n_new_samples)
u2_new = np.random.uniform(0, 1, size=n_new_samples)

#  Apply the inverse CDF of the copula to obtain new samples
x1_new = norm.ppf(u1_new)
x2_new = norm.ppf(u2_new)

# Calculate the correlation coefficient of the generated samples
corr_coefficient_new, _ = pearsonr(x1_new, x2_new)

print("Original correlation coefficient:", corr_coefficient)
print("Generated correlation coefficient:", corr_coefficient_new)

Original correlation coefficient: 0.5704726278903476
Generated correlation coefficient: 0.21044768303298925


In [7]:
# Scale and shift the generated samples back to the original range of values
new_data = np.zeros_like(scaled_data)
new_data[:, 0] = u1_new * range_value[0] + min_value[0]
new_data[:, 1] = u2_new * range_value[1] + min_value[1]
# new_data = np.clip(new_data, 80, 150)  # Apply range constraints

In [8]:
min_dia = 80
max_dia = 105

out_data  = new_data[((new_data[:,1] >= min_dia) & (new_data[:,1] <= max_dia))]

In [9]:
print(new_data.min(axis=0) , new_data.max(axis=0), new_data.mean(axis=0))
print(out_data.min(axis=0) , out_data.max(axis=0), out_data.mean(axis=0))

[112.53940909  81.42093755] [149.61977262 112.51686083] [130.61470681  95.28225253]
[112.53940909  81.42093755] [149.61977262 104.39178852] [129.33198797  92.98178466]


In [10]:
out_data = out_data.round(0)