In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.datasets import make_spd_matrix
from sklearn.preprocessing import MinMaxScaler

<h3>Generate Random but Correlated data </h3>
<p>Using randomly generated covariance matrices, this section generates correlated data</p>
<p>Noise is added to simulate real-world data</p>
<p>Data is scaled, though this is not strictly necessary</p>

In [2]:
def mergeJoin(data, id='x'):
    """ Helper function to join many dataframes on a shared key column
        Inspired by merge sort: the list is split in half, each half is
        merged recursively, and the two partial results are joined.
        Every pairwise join is an outer merge followed by dropna(), so any
        row that contains a missing value after a join is discarded.
        @param data: Non-empty list of dataframes that all contain the key column
        @param id: Name of the key column to join on (default 'x')
        @retval: single wide dataframe (the input itself when len(data) == 1)
        @raises ValueError: if data is empty
    """
    if len(data) == 0:
        # An empty list used to fall through and return None, which only
        # failed later at the caller's first DataFrame method call.
        raise ValueError("mergeJoin requires at least one dataframe")

    if len(data) == 1:
        return data[0]

    mid = len(data) // 2
    # Recursive call on each half; both halves are non-empty because len(data) >= 2
    left = mergeJoin(data[:mid], id)
    right = mergeJoin(data[mid:], id)
    return left.merge(right, on=id, how='outer').dropna()

In [3]:
size = 100000
results = []
for i in tqdm(range(0, 10)):
    # draw a random dimension, a random mean vector and an SPD covariance matrix
    dim = np.random.randint(2, 20)
    mean = np.random.rand(dim)
    cov = make_spd_matrix(dim, random_state=0)
    # for debugging, a covariance matrix of 1s can be used instead:
    #     cov = np.ones((dim, dim))

    # sample correlated data from the given mean/covariance matrix;
    # only the first two components are kept as the (x, y) pair
    samples = np.random.multivariate_normal(mean, cov, size=size)
    x = samples[:, 0]
    # second component plus unit-variance Gaussian noise
    y = samples[:, 1] + np.random.normal(0, 1, size)

    # min-max scale x, then round it to 5 decimals
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x.reshape(-1, 1))[:, 0]
    x = np.around(x, 5)

    # optional visual check:
    #     plt.plot(x, y, '.')
    #     plt.xlim(-1, 2)
    #     plt.grid()
    #     plt.show()

    # one frame per iteration: shared key column 'x' plus a uniquely named y column;
    # only the first row for each distinct x value is kept
    pair_df = pd.DataFrame({'x': x, 'y' + str(i): y})
    pair_df = pair_df.drop_duplicates('x')
    results.append(pair_df)

100%|██████████| 10/10 [00:01<00:00,  5.67it/s]


<h3>Merging</h3>
<p>This is to merge multiple columns together to simulate cases where multiple columns correlate with the response</p>

In [4]:
# Join all generated frames on 'x' and order the rows by ascending x.
df = mergeJoin(results)
df = df.sort_values('x')
# re-scaling to balance out x: drop rows whose x is exactly 0 or 1, then
# min-max scale the remaining x values again.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below does not write into a slice
# (which can raise pandas' SettingWithCopyWarning).
df = df[(df['x'] != 0) & (df['x'] != 1)].copy()
scaler = MinMaxScaler()
df['x'] = scaler.fit_transform(df['x'].values.reshape(-1, 1)).T[0]
df

Unnamed: 0,x,y0,y1,y2,y3,y4,y5,y6,y7,y8,y9
19122,0.000000,1.601674,-2.497253,3.869277,-1.546199,2.519426,0.309541,1.822421,0.181262,-1.363866,1.454431
18222,0.056126,0.181749,0.967411,1.697782,0.390432,0.909817,2.173639,1.158485,2.997593,-0.355411,0.988750
5048,0.065156,-0.440005,-1.148300,2.201194,2.022221,-0.632905,8.123266,2.510213,0.003809,-4.116726,1.417378
16575,0.065391,-0.089294,-2.923234,1.565552,-0.640890,1.358834,-0.391381,3.393750,-0.540405,-0.272921,1.498444
12937,0.084389,1.334249,0.349714,1.285557,1.808610,2.000044,4.097830,1.606913,2.469520,-2.071107,0.562204
...,...,...,...,...,...,...,...,...,...,...,...
18113,0.957782,2.392594,1.854972,0.601992,-0.750567,-1.595369,-1.312557,-2.615930,1.918432,5.654639,-0.450359
18693,0.967070,1.012244,0.656648,2.563703,2.161564,-1.449783,-3.751036,-0.886663,-1.261222,5.066416,-1.996538
19356,0.980838,0.549781,-0.538857,2.424301,-0.093876,-2.005777,-2.220367,0.435805,1.151949,2.234996,-1.481924
19899,0.989750,-1.941979,1.737906,1.555522,0.447909,-3.137797,-0.363446,-1.405843,-0.358637,5.740815,-0.636199


<h3>Generate Random Bi-modal Data</h3>
<p>Generate 2 sets of normally distributed data</p>
<p>Concatenate them, and join the result with the original (sorted) dataset</p>

In [5]:
# Row count of df; adding columns does not change it, so it is computed once.
N = df.shape[0]
for i in range(0, 2):
    # Continue the y-column numbering from the current last column (e.g. 'y9' -> 9).
    last_col_index = int(df.columns[-1].split('y')[1])
    # First mode: mean in [40, 100), std in [1, 5).
    mu, sigma = np.random.randint(40, 100), np.random.randint(1, 5)
    # Second mode: small mean derived from mu, wider spread derived from sigma.
    mu2, sigma2 = int(mu % 5) + np.random.randint(1, 5), sigma**2 + 10
    bm1 = np.random.normal(mu, sigma, N)
    bm2 = np.random.normal(mu2, sigma2, N)
    bm = np.concatenate([bm1, bm2])
    # bm holds 2*N values, so taking every second element always yields exactly
    # N values and the assignment cannot fail on a length mismatch.
    # NOTE: the values are assigned positionally, so roughly the first half of
    # the rows comes from bm1 and the second half from bm2.
    df['y' + str(last_col_index + 1)] = bm[::2]
    # plt.hist(bm[::2])

<h3>Generate Random Data</h3>
<p>Generate 100 noise columns: x plus Gaussian noise minus a cubed Gaussian term, each min-max scaled to [0, 1]</p>

In [6]:
# x does not change inside the loop, so it is read once.
x = df['x'].values
# Continue the y-column numbering from the current last column (e.g. 'y11' -> 11).
last_col_index = int(df.columns[-1].split('y')[1])
# New columns are collected here and attached in one step; inserting 100 columns
# one at a time fragments the frame and triggers pandas' PerformanceWarning.
noise_cols = {}
for i in range(0, 100):
    # x plus Gaussian noise, minus a cubed Gaussian term (heavy-tailed noise)
    y = x + np.random.normal(0.5, 3, len(x))
    y = y - np.random.normal(-1, 20, len(x))**3

    # scale each column independently with MinMaxScaler
    scaler = MinMaxScaler()
    y = scaler.fit_transform(y.reshape(-1, 1)).T[0]
#     plt.plot(x, y, '.')
#     plt.xlim(-1, 2)
#     plt.grid()
#     plt.show()
    last_col_index += 1
    noise_cols['y' + str(last_col_index)] = y

# Attach all new columns at once; index=df.index keeps the rows aligned positionally.
# Note: this rebinds df to a new frame instead of mutating the old one.
df = pd.concat([df, pd.DataFrame(noise_cols, index=df.index)], axis=1)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of df
df.describe()

Unnamed: 0,x,y0,y1,y2,y3,y4,y5,y6,y7,y8,...,y102,y103,y104,y105,y106,y107,y108,y109,y110,y111
count,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,...,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0,13422.0
mean,0.516802,0.965742,0.25891,0.838398,0.026631,0.171669,0.178848,0.751405,0.724899,0.659114,...,0.432079,0.500772,0.519426,0.292533,0.442904,0.329378,0.452236,0.468533,0.397439,0.386484
std,0.158498,1.277788,1.394971,1.609465,2.074468,1.47426,1.73982,1.37091,1.576565,1.8495,...,0.038249,0.035335,0.040972,0.023874,0.03181,0.027104,0.034785,0.030247,0.033598,0.024668
min,0.0,-3.774285,-5.165432,-5.591468,-7.445485,-5.188493,-7.012611,-4.770026,-5.182501,-6.248515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.39941,0.103568,-0.676807,-0.253916,-1.379242,-0.826276,-1.005409,-0.167151,-0.327267,-0.587349,...,0.428298,0.497146,0.514832,0.29007,0.439457,0.326299,0.448439,0.465808,0.394092,0.383832
50%,0.516688,0.966674,0.256321,0.821981,0.013877,0.183888,0.191913,0.73839,0.71982,0.646203,...,0.430492,0.499585,0.517759,0.291556,0.441502,0.327907,0.450936,0.467646,0.396435,0.385414
75%,0.633449,1.824849,1.190767,1.91706,1.43268,1.160164,1.352951,1.662316,1.775248,1.907659,...,0.434036,0.502791,0.521854,0.293923,0.444627,0.330583,0.454368,0.47044,0.399264,0.38763
max,1.0,5.67907,5.465839,7.181892,7.824974,5.876405,8.123266,6.05857,6.517971,7.392887,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
