# Generate artificial data sets
Here, we generate data sets to explore the behavior of scores describing differences between high-dimensional point clouds in the presence of noise and outliers.  
The data is generated by sampling the morphological embedding space learnt on actual cellular measurements made on cytotoxic lymphocytes from healthy donors and ARPC1B-deficient patients (described in German, Vulliard et al.), then using the inverse transformation to approximate high-dimensional points with *more realistic* properties than points sampled from usual statistical distributions.

This script outputs the following datasets:

* `Data/matR_ARPC1B.csv` - Reference dataset, `NR` points sampled uniformly from a reference box of the 2D UMAP embedding space and mapped back to the original dimensions
* `Data/matN_ARPC1B.csv` - Negative control dataset, `N` points using the same generator used for the reference dataset
* `Data/matPS_ARPC1B.csv` - Positive control dataset, shifted (the sampling box is translated by `posScaling` × 20 units toward negative values along both embedding axes) while its size is kept identical to the reference box
* `Data/matPR_ARPC1B.csv` - Positive control dataset, reshaped (the sampling box keeps the same center as the reference box but is enlarged by a factor `1 + posScaling` along x and `1 + 0.5 × posScaling` along y)

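It also writes `Data/matRo_ARPC1B.csv`, `Data/matNo_ARPC1B.csv`, `Data/matPSo_ARPC1B.csv` and `Data/matPRo_ARPC1B.csv`, which follow the same rules but have a fraction `pOutliers` of their points drawn from a distinct outlier box of the embedding space.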

Approximate running time: **25 min**.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import plotly.express as px

%matplotlib inline

In [None]:
# Plain white seaborn theme; every figure defaults to 10 x 10 inches.
figure_defaults = {'figure.figsize': (10, 10)}
sns.set(style='white', rc=figure_defaults)

The dataset was obtained as processed in German, Vulliard et al., adding the following block to `LT_ARPC1B.ipynb`:

    write.csv(file = "Tab/Export_ARPC1B_filtered.csv",
    x = data.frame(Row = as.factor(LT$Metadata_Row[fieldToKeep]), 
                   Col = as.factor(LT$Metadata_Column[fieldToKeep]),
                   URL = as.factor(LT$URL_Actin[fieldToKeep]),
                   Coating = as.factor(LT$Coating[fieldToKeep]),
                   Donor = as.factor(LT$Donor[fieldToKeep]),
                   transformedLT))

In [None]:
# Number of points in the reference datasets (R and Ro)
NR = 3000
# Number of points in every other dataset (N, PS, PR and their outlier variants)
N = 1000
# Fraction of the points of each outlier-containing dataset (Ro, No, PSo, PRo)
# that is drawn from the outlier box instead of the dataset's own box
pOutliers = 1/3
# Scaling of the transformation for positive controls:
# multiplies the box shift (PS) and the box enlargement (PR)
posScaling = 1.0
# Weirdness of the outliers: multiplies the offset between the reference box
# and the outlier box
outScaling = 1.0

## Load and visualize original dataset

In [None]:
# Seed NumPy's global random number generator before loading and embedding the data
np.random.seed(0)

In [None]:
# Load the exported measurements; the first six CSV columns form the
# (multi-level) row index, the remaining columns are the data.
index_columns = list(range(6))
X = pd.read_csv("Export_ARPC1B_filtered.csv", index_col=index_columns)

In [None]:
# Display the (rows, columns) dimensions of the loaded table
X.shape

In [None]:
# Configure the UMAP model, then fit it on the measurements.
# The fitted model is reused below for its embedding and its inverse transform.
umap_model = umap.UMAP(random_state=42, min_dist=1, spread=10, n_epochs=200)
plane_mapper = umap_model.fit(X)

In [None]:
# Colour every embedded point by the integer category code of its donor.
donor_labels = X.index.get_level_values('Donor')
y = donor_labels.astype('category').values.codes
embedding_x = plane_mapper.embedding_.T[0]
embedding_y = plane_mapper.embedding_.T[1]
plt.scatter(embedding_x, embedding_y, c=y, cmap='Spectral')

In [None]:
# Interactive version of the embedding plot, coloured by donor label.
embedding_columns = {
    'x': plane_mapper.embedding_.T[0],
    'y': plane_mapper.embedding_.T[1],
    'c': X.index.get_level_values('Donor'),
}
dfX = pd.DataFrame(embedding_columns)
fig = px.scatter(dfX, x='x', y='y', color='c')
fig.show()

## Dataset 1 - Reference R_ARPC1B

In [None]:
# Seed NumPy's global random number generator before sampling the reference dataset
np.random.seed(1)

In [None]:
# [min, max] bounds of the reference sampling box along the first embedding axis
rangeRefX = [-10,10]
# [min, max] bounds of the reference sampling box along the second embedding axis
rangeRefY = [20,40]

In [None]:
# Draw NR points uniformly inside the reference box of the embedding plane
box_low = [rangeRefX[0], rangeRefY[0]]
box_high = [rangeRefX[1], rangeRefY[1]]
refPts = np.random.uniform(low=box_low, high=box_high, size=(NR, 2))

In [None]:
# Map the sampled embedding points back to the original dimensions with the
# fitted UMAP model's inverse transform, then display the resulting shape
trRefPts = plane_mapper.inverse_transform(refPts)
trRefPts.shape

In [None]:
# Wrap the reference points in a DataFrame and write them as a bare CSV
# (no header row, no index column).
trRefPts = pd.DataFrame(data=trRefPts)
trRefPts.to_csv(
    path_or_buf="Data/matR_ARPC1B.csv",
    sep=',',
    header=False,
    index=False,
)

## Dataset 2 - Negative control N

In [None]:
# Seed NumPy's global random number generator before sampling the negative control
np.random.seed(2)

In [None]:
# Draw N points uniformly inside the same reference box as the reference dataset
box_low = [rangeRefX[0], rangeRefY[0]]
box_high = [rangeRefX[1], rangeRefY[1]]
nPts = np.random.uniform(low=box_low, high=box_high, size=(N, 2))

In [None]:
# Map the sampled embedding points back to the original dimensions with the
# fitted UMAP model's inverse transform, then display the resulting shape
trNPts = plane_mapper.inverse_transform(nPts)
trNPts.shape

In [None]:
# Wrap the negative-control points in a DataFrame and write them as a bare CSV
# (no header row, no index column).
trNPts = pd.DataFrame(data=trNPts)
trNPts.to_csv(
    path_or_buf="Data/matN_ARPC1B.csv",
    sep=',',
    header=False,
    index=False,
)

## Dataset 3 - Positive control (shifted) PS

In [None]:
# Seed NumPy's global random number generator before sampling the shifted positive control
np.random.seed(3)

In [None]:
# Translate both bounds of the reference box by the same offset on each axis,
# so the box moves without changing size.
box_shift = posScaling * np.array([-20, -20])
shiftedRangeX = np.array(rangeRefX) + box_shift
shiftedRangeY = np.array(rangeRefY) + box_shift

In [None]:
# Draw N points uniformly inside the shifted box of the morphological space
box_low = [shiftedRangeX[0], shiftedRangeY[0]]
box_high = [shiftedRangeX[1], shiftedRangeY[1]]
psPts = np.random.uniform(low=box_low, high=box_high, size=(N, 2))

In [None]:
# Map the sampled embedding points back to the original dimensions with the
# fitted UMAP model's inverse transform, then display the resulting shape
trPsPts = plane_mapper.inverse_transform(psPts)
trPsPts.shape

In [None]:
# Wrap the shifted positive-control points in a DataFrame and write them as a
# bare CSV (no header row, no index column).
trPsPts = pd.DataFrame(data=trPsPts)
trPsPts.to_csv(
    path_or_buf="Data/matPS_ARPC1B.csv",
    sep=',',
    header=False,
    index=False,
)

## Dataset 4 - Positive control (reshaped) PR


In [None]:
# Seed NumPy's global random number generator before sampling the reshaped positive control
np.random.seed(4)

In [None]:
# Enlarge the reference box about its own centre: the x extent is multiplied
# by (1 + posScaling) and the y extent by (1 + 0.5 * posScaling).
# The centre is derived from the reference ranges rather than hard-coded, so
# the box stays centred on the reference box if rangeRefX / rangeRefY change.
centerRefX = (rangeRefX[0] + rangeRefX[1]) / 2
centerRefY = (rangeRefY[0] + rangeRefY[1]) / 2
rescaledX = (np.array(rangeRefX) - centerRefX) * (1 + posScaling) + centerRefX
rescaledY = (np.array(rangeRefY) - centerRefY) * (1 + .5 * posScaling) + centerRefY

In [None]:
# Draw N points uniformly inside the rescaled box of the morphological space
box_low = [rescaledX[0], rescaledY[0]]
box_high = [rescaledX[1], rescaledY[1]]
prPts = np.random.uniform(low=box_low, high=box_high, size=(N, 2))

In [None]:
# Map the sampled embedding points back to the original dimensions with the
# fitted UMAP model's inverse transform, then display the resulting shape
trPrPts = plane_mapper.inverse_transform(prPts)
trPrPts.shape

In [None]:
# Wrap the reshaped positive-control points in a DataFrame and write them as a
# bare CSV (no header row, no index column).
trPrPts = pd.DataFrame(data=trPrPts)
trPrPts.to_csv(
    path_or_buf="Data/matPR_ARPC1B.csv",
    sep=',',
    header=False,
    index=False,
)

## Dataset 5 - Reference with outliers Ro

In [None]:
# Seed NumPy's global random number generator before sampling the reference dataset with outliers
np.random.seed(5)

In [None]:
# Outlier box: the reference box translated by +30 along x and -40 along y,
# both offsets being multiplied by outScaling.
outlier_shift_x = outScaling * np.array([30, 30])
outlier_shift_y = outScaling * np.array([-40, -40])
outlierX = np.array(rangeRefX) + outlier_shift_x
outlierY = np.array(rangeRefY) + outlier_shift_y

In [None]:
# Most points are generated as previously described.
# The clean count is the complement of the outlier count, round(NR*pOutliers),
# so clean + outlier points always add up to exactly NR; rounding both parts
# independently can be off by one because round() rounds halves to even.
nCleanRo = NR - round(NR*pOutliers)
cleanRo = np.random.uniform([rangeRefX[0], rangeRefY[0]], 
                            [rangeRefX[1], rangeRefY[1]], [nCleanRo,2])
# We then convert back these points to the original dimensions
trCRo = plane_mapper.inverse_transform(cleanRo)

In [None]:
# The remaining round(NR*pOutliers) points come from the distinct outlier box
# of the UMAP space
n_outliers = round(NR*pOutliers)
box_low = [outlierX[0], outlierY[0]]
box_high = [outlierX[1], outlierY[1]]
outlierRo = np.random.uniform(low=box_low, high=box_high, size=(n_outliers, 2))
# Map the outliers back to the original dimensions
trORo = plane_mapper.inverse_transform(outlierRo)

In [None]:
# Stack clean points first and outliers last, then write a bare CSV
# (no header row, no index column).
stacked_points = np.vstack((trCRo, trORo))
matRo = pd.DataFrame(data=stacked_points)
matRo.to_csv(
    path_or_buf="Data/matRo_ARPC1B.csv",
    sep=',',
    header=False,
    index=False,
)

## Dataset 6 - Negative control with outliers No

In [None]:
# Seed NumPy's global random number generator before sampling the negative control with outliers
np.random.seed(6)

In [None]:
# Most points are generated as previously described.
# The clean count is the complement of the outlier count, round(N*pOutliers),
# so clean + outlier points always add up to exactly N; rounding both parts
# independently can be off by one because round() rounds halves to even.
nCleanNo = N - round(N*pOutliers)
cleanNo = np.random.uniform([rangeRefX[0], rangeRefY[0]], 
                            [rangeRefX[1], rangeRefY[1]], [nCleanNo,2])
# We then convert back these points to the original dimensions
trCNo = plane_mapper.inverse_transform(cleanNo)

In [None]:
# The remaining round(N*pOutliers) points come from the distinct outlier box
# of the UMAP space
n_outliers = round(N*pOutliers)
box_low = [outlierX[0], outlierY[0]]
box_high = [outlierX[1], outlierY[1]]
outlierNo = np.random.uniform(low=box_low, high=box_high, size=(n_outliers, 2))
# Map the outliers back to the original dimensions
trONo = plane_mapper.inverse_transform(outlierNo)

In [None]:
# Stack clean points first and outliers last, then write a bare CSV
# (no header row, no index column).
stacked_points = np.vstack((trCNo, trONo))
matNo = pd.DataFrame(data=stacked_points)
matNo.to_csv(
    path_or_buf="Data/matNo_ARPC1B.csv",
    sep=',',
    header=False,
    index=False,
)

## Dataset 7 - Positive control (shifted) with outliers PSo

In [None]:
# Seed NumPy's global random number generator before sampling the shifted positive control with outliers
np.random.seed(7)

In [None]:
# Most points are generated as previously described (shifted box).
# The clean count is the complement of the outlier count, round(N*pOutliers),
# so clean + outlier points always add up to exactly N; rounding both parts
# independently can be off by one because round() rounds halves to even.
nCleanPSo = N - round(N*pOutliers)
cleanPSo = np.random.uniform([shiftedRangeX[0], shiftedRangeY[0]], 
                            [shiftedRangeX[1], shiftedRangeY[1]], [nCleanPSo,2])
# We then convert back these points to the original dimensions
trCPSo = plane_mapper.inverse_transform(cleanPSo)

In [None]:
# The remaining round(N*pOutliers) points come from the distinct outlier box
# of the UMAP space
n_outliers = round(N*pOutliers)
box_low = [outlierX[0], outlierY[0]]
box_high = [outlierX[1], outlierY[1]]
outlierPSo = np.random.uniform(low=box_low, high=box_high, size=(n_outliers, 2))
# Map the outliers back to the original dimensions
trOPSo = plane_mapper.inverse_transform(outlierPSo)

In [None]:
# Stack clean points first and outliers last, then write a bare CSV
# (no header row, no index column).
stacked_points = np.vstack((trCPSo, trOPSo))
matPSo = pd.DataFrame(data=stacked_points)
matPSo.to_csv(
    path_or_buf="Data/matPSo_ARPC1B.csv",
    sep=',',
    header=False,
    index=False,
)

## Dataset 8 - Positive control (reshaped) with outliers PRo

In [None]:
# Seed NumPy's global random number generator before sampling the reshaped positive control with outliers
np.random.seed(8)

In [None]:
# Most points are generated as previously described (rescaled box).
# The clean count is the complement of the outlier count, round(N*pOutliers),
# so clean + outlier points always add up to exactly N; rounding both parts
# independently can be off by one because round() rounds halves to even.
nCleanPRo = N - round(N*pOutliers)
cleanPRo = np.random.uniform([rescaledX[0], rescaledY[0]], 
                            [rescaledX[1], rescaledY[1]], [nCleanPRo,2])
# We then convert back these points to the original dimensions
trCPRo = plane_mapper.inverse_transform(cleanPRo)

In [None]:
# The remaining round(N*pOutliers) points come from the distinct outlier box
# of the UMAP space
n_outliers = round(N*pOutliers)
box_low = [outlierX[0], outlierY[0]]
box_high = [outlierX[1], outlierY[1]]
outlierPRo = np.random.uniform(low=box_low, high=box_high, size=(n_outliers, 2))
# Map the outliers back to the original dimensions
trOPRo = plane_mapper.inverse_transform(outlierPRo)

In [None]:
# Stack clean points first and outliers last, then write a bare CSV
# (no header row, no index column).
stacked_points = np.vstack((trCPRo, trOPRo))
matPRo = pd.DataFrame(data=stacked_points)
matPRo.to_csv(
    path_or_buf="Data/matPRo_ARPC1B.csv",
    sep=',',
    header=False,
    index=False,
)