# Feature Report

## Read in data

In [None]:
import numpy as np
import pandas as pd

# Columns that identify a row; they become the index of the combined frame.
index_cols = ["chrom", "start", "end", "cell_id", "donor_id"]

# Load every input parquet file and stack them into one indexed frame.
df = pd.concat([pd.read_parquet(path) for path in snakemake.input]).set_index(
    index_cols
)

# Feature columns are all remaining columns whose name does not end with one
# of the identifier / metadata suffixes below.
non_feature_suffixes = (*index_cols, "label", "build", "db")
features = [col for col in df.columns if not col.endswith(non_feature_suffixes)]

# Missing feature values are replaced with 0.
df[features] = df[features].fillna(0)

# Cap every feature at 4e9 to avoid an overflow error.
df[features] = np.minimum(df[features], 4e9)
print(df.shape)
print(df.columns)

# Distinct label values; used later as the hue order of the plots.
labels = df["label"].unique()
print(labels)

## Downsample every class to the smallest class size (at most 10000) for easier comparison

In [None]:
# Undersample every class to a common size: the smallest class count,
# but never more than 10000 rows per class.
from imblearn.under_sampling import RandomUnderSampler

max_per_class = 10000
smallest_class = df["label"].value_counts().min()
down_to = min(max_per_class, smallest_class)

# Same target count for each label.
sample_dict = dict.fromkeys(labels, down_to)

# Fixed random_state keeps the drawn subsample reproducible.
sampler = RandomUnderSampler(sampling_strategy=sample_dict, random_state=42)
df, _ = sampler.fit_resample(df, df["label"])

## PCA

TODO: try PCA with different min_read thresholds

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the feature columns before running PCA.
pca_df = StandardScaler().fit_transform(df[features])

# The ARPACK solver requires 0 < n_components < min(n_samples, n_features).
# After downsampling the data can be small, so cap the requested 50
# components at the largest value the solver accepts.
n_components = min(50, min(pca_df.shape) - 1)

# Fit PCA and project the standardized data onto the principal components.
# Note: `pca` holds the projected array, not the fitted PCA estimator.
pca = PCA(n_components=n_components, svd_solver="arpack").fit_transform(pca_df)

# Prepare for plotting: one column per component (PC1, PC2, ...) plus the label.
plot_df = pd.DataFrame(
    pca, columns=["PC{}".format(i) for i in range(1, pca.shape[1] + 1)]
)
# to_numpy() assigns labels by position, ignoring df's index.
plot_df["label"] = df["label"].to_numpy()

In [None]:
import seaborn as sns

# Pairwise plots of the first 5 principal components, colored by label.
sns.pairplot(
    plot_df[["PC1", "PC2", "PC3", "PC4", "PC5", "label"]],  # first 5 PCs
    hue="label",
    hue_order=labels,
    # "s" sets the marker size of the scatter points; "size" would instead be
    # taken by seaborn's scatterplot as a semantic size variable.
    plot_kws={"alpha": 0.5, "s": 2},
)

## tSNE

In [None]:
# tSNE embedding computed on the principal components
# TODO: color by donor, other covariates
# TODO: try different resolutions
from sklearn.manifold import TSNE

# Use every PC column as input; the label column is not part of the embedding.
embedding_input = plot_df.drop(columns="label")
tsne = TSNE(random_state=42, init="random").fit_transform(embedding_input)

# Collect the two embedding axes together with the labels for plotting.
tsne_df = pd.DataFrame(tsne, columns=["tSNE1", "tSNE2"])
tsne_df["label"] = plot_df["label"]

In [None]:
# Scatter plot of the tSNE embedding, colored by label.
sns.scatterplot(
    data=tsne_df,
    x="tSNE1",
    y="tSNE2",
    hue="label",
    hue_order=labels,
    alpha=0.5,
    s=3,
)