In [None]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

Load data from the first replicate (it is plotted with kdeplot further below).

In [None]:
# Replicate 1: one saved counts array per library, keyed by the library label.
rep1_files = {
    'liquid': '../../data/20220901_final_counts/counts.1_counts.npy',
    'semisolid': '../../data/20220901_final_counts/counts.2_counts.npy',
    'small spread': '../../data/20220901_final_counts/counts.3_counts.npy',
    'bead spread': '../../data/20220901_final_counts/counts.4_counts.npy',
    'large spread': '../../data/20220901_final_counts/counts.5_counts.npy',
}
# Dicts preserve insertion order, so labels and arrays stay aligned.
rep1_labels = list(rep1_files)
rep1 = [np.load(path) for path in rep1_files.values()]
# Also expose each array under its own per-library name.
liquid1, semisolid1, small_spread1, bead_spread1, large_spread1 = rep1

Look at summary statistics

In [None]:
# Print summary statistics for every replicate-1 library.
for sample, label in zip(rep1, rep1_labels):
  print(f'{label}: {stats.describe(sample)}')
  # Use np.median: the arrays are not guaranteed to be sorted, so taking the
  # middle element of the raw array does not give the median.
  print(f'{label} median: {np.median(sample)}')
  print(f'{label} std: {np.std(sample)}')

Try plotting with kdeplot:

In [None]:
# Overlay a kernel density estimate for each replicate-1 library on one axes.
for values, name in zip(rep1, rep1_labels):
  sns.kdeplot(values, label=name, bw_adjust=5)
plt.legend()
# Restrict the x-axis to the 0-100 range.
plt.xlim([0, 100])

Semisolid looks a bit weird; try plotting as histograms instead.

In [None]:
# Overlay semi-transparent histograms of all replicate-1 libraries,
# using integer bin edges 0..99.
unit_bins = range(0, 100)
for values, name in zip(rep1, rep1_labels):
  plt.hist(values, bins=unit_bins, alpha=0.5, label=name)
plt.legend()

Find the number of unique sequences seen in each library.

In [None]:
# Number of entries in each replicate-1 library.
counts = list(map(len, rep1))
for name, n_entries in zip(rep1_labels, counts):
  print(f'{name}: {n_entries}')

Find the five largest per-sequence counts in each library.

In [None]:
# Sort every replicate-1 library in ascending order, then print the last
# five (i.e. largest) values of each.
rep1_sorted = list(map(sorted, rep1))
for name, ordered in zip(rep1_labels, rep1_sorted):
  print(f'{name}: {ordered[-5:]}')

Repeat the same analysis for replicate 2.

In [None]:
# Replicate 2: one saved counts array per library, keyed by the library label.
rep2_files = {
    'liquid': '../../data/20220901_final_counts/counts.6_counts.npy',
    'semisolid': '../../data/20220901_final_counts/counts.7_counts.npy',
    'small spread': '../../data/20220901_final_counts/counts.8_counts.npy',
    'bead spread': '../../data/20220901_final_counts/counts.9_counts.npy',
    'large spread': '../../data/20220901_final_counts/counts.10_counts.npy',
}
# Dicts preserve insertion order, so labels and arrays stay aligned.
rep2_labels = list(rep2_files)
rep2 = [np.load(path) for path in rep2_files.values()]
# Also expose each array under its own per-library name.
liquid2, semisolid2, small_spread2, bead_spread2, large_spread2 = rep2

In [None]:
# Print summary statistics for every replicate-2 library.
for sample, label in zip(rep2, rep2_labels):
  print(f'{label}: {stats.describe(sample)}')
  # Use np.median: the arrays are not guaranteed to be sorted, so taking the
  # middle element of the raw array does not give the median.
  print(f'{label} median: {np.median(sample)}')
  print(f'{label} std: {np.std(sample)}')

In [None]:
# Print the scipy description and the median for every replicate-2 library.
for sample, label in zip(rep2, rep2_labels):
  print(f'{label}: {stats.describe(sample)}')
  # Use np.median: the arrays are not guaranteed to be sorted, so taking the
  # middle element of the raw array does not give the median.
  print(f'{label} median: {np.median(sample)}')

In [None]:
# Overlay semi-transparent histograms of all replicate-2 libraries,
# using integer bin edges 0..199.
wide_bins = range(0, 200)
for values, name in zip(rep2, rep2_labels):
  plt.hist(values, bins=wide_bins, alpha=0.5, label=name)
plt.legend()

In [None]:
# Draw one separate histogram figure per replicate-1 library;
# plt.show() inside the loop ends each figure before the next one starts.
for idx, name in enumerate(rep1_labels):
  plt.hist(rep1[idx], bins=range(0, 100), alpha=0.5, label=name)
  plt.legend()
  plt.show()

In [None]:
# Draw one separate histogram figure per replicate-2 library;
# plt.show() inside the loop ends each figure before the next one starts.
for idx, name in enumerate(rep2_labels):
  plt.hist(rep2[idx], bins=range(0, 200), alpha=0.5, label=name)
  plt.legend()
  plt.show()

In [None]:
# Number of entries in each replicate-2 library.
counts = list(map(len, rep2))
for name, n_entries in zip(rep2_labels, counts):
  print(f'{name}: {n_entries}')

In [None]:
# Sort every replicate-2 library in ascending order, then print the last
# five (i.e. largest) values of each.
rep2_sorted = list(map(sorted, rep2))
for name, ordered in zip(rep2_labels, rep2_sorted):
  print(f'{name}: {ordered[-5:]}')

In [None]:
# Display the raw replicate-1 list of arrays as the cell output.
rep1

Try plotting as box plots

In [None]:
# Box plot with one box per replicate-1 library (in list order).
sns.boxplot(data=rep1)
# Limit the y-axis to 0-500.
plt.ylim(bottom=0, top=500)

In [None]:
# Box plot with one box per replicate-2 library (in list order).
sns.boxplot(data=rep2)
# Limit the y-axis to 0-500.
plt.ylim(bottom=0, top=500)

In [None]:
def lorenz_curve(X):
    """Plot the Lorenz curve of ``X`` and return its points.

    The values are sorted in ascending order before being accumulated; a
    Lorenz curve is defined on the ordered values, so accumulating them in
    their stored order would give a curve that depends on that order.

    Parameters
    ----------
    X : array-like
        Values whose distribution is being described (flattened if
        multi-dimensional).

    Returns
    -------
    numpy.ndarray
        Cumulative share of the total, with a leading 0 so the curve starts
        at the origin; has ``X.size + 1`` entries and ends at 1.

    Raises
    ------
    ValueError
        If ``X`` is empty or its values do not sum to a positive number.
    """
    values = np.sort(np.asarray(X, dtype=float), axis=None)
    if values.size == 0:
        raise ValueError('lorenz_curve() requires at least one value')
    total = values.sum()
    if total <= 0:
        raise ValueError('lorenz_curve() requires values with a positive sum')

    # Prepend 0 so the curve passes through (0, 0).
    X_lorenz = np.insert(values.cumsum() / total, 0, 0)

    _, ax = plt.subplots(figsize=[6,6])
    ## scatter plot of Lorenz curve
    ax.scatter(np.arange(X_lorenz.size)/(X_lorenz.size-1), X_lorenz)
    ## line plot of equality
    ax.plot([0,1], [0,1], color='k')
    return X_lorenz

# Plot a Lorenz curve for every replicate-1 library. The final call is kept
# as the cell's last expression so its returned array is still displayed.
for sample in rep1[:-1]:
    lorenz_curve(sample)
lorenz_curve(rep1[-1])

In [None]:
def gini(arr):
    """Return the Gini coefficient of the values in ``arr``.

    Uses the sorted-rank formula
    ``G = 2 * sum(i * y_i) / (n * sum(y)) - (n + 1) / n`` with ``y`` sorted in
    ascending order and ranks ``i = 1..n``.

    The computation is vectorised and done in float64, so it avoids a
    per-element Python loop and does not depend on the integer width of the
    input dtype.

    Parameters
    ----------
    arr : array-like
        Values to summarise (flattened if multi-dimensional). The input is
        not modified.

    Returns
    -------
    float
        The Gini coefficient, or ``nan`` when the values sum to zero.

    Raises
    ------
    ValueError
        If ``arr`` is empty.
    """
    ## first sort (np.sort returns a new array, the caller's data is untouched)
    sorted_arr = np.sort(np.asarray(arr, dtype=np.float64), axis=None)
    n = sorted_arr.size
    if n == 0:
        raise ValueError('gini() requires at least one value')
    total = sorted_arr.sum()
    if total == 0:
        # The coefficient is undefined when every value is zero.
        return float('nan')
    coef_ = 2. / n
    const_ = (n + 1.) / n
    ranks = np.arange(1, n + 1, dtype=np.float64)
    weighted_sum = np.dot(ranks, sorted_arr)
    return coef_*weighted_sum/total - const_

# Gini coefficient of every library, printed and collected per replicate.
# gini() is evaluated once per library and the value reused for both the
# printout and the list.
gini_rep1 = []
for sample, label in zip(rep1, rep1_labels):
  value = gini(sample)
  print(f'{label :>20}1: {value}')
  gini_rep1.append(value)

gini_rep2 = []
for sample, label in zip(rep2, rep2_labels):
  value = gini(sample)
  print(f'{label :>20}2: {value}')
  gini_rep2.append(value)

In [None]:
# Area under the Lorenz curve of every library in both replicates.
# np.trapz was renamed to np.trapezoid in NumPy 2; use whichever exists.
_trapezoid = getattr(np, 'trapezoid', None) or np.trapz

for sample, label in zip(rep1, rep1_labels):
  # A Lorenz curve is built from the values in ascending order and starts
  # at the origin, so sort first and prepend 0.
  ordered = np.sort(sample, axis=None)
  lorenz = np.insert(ordered.cumsum() / ordered.sum(), 0, 0)
  auc = _trapezoid(y=lorenz, x=np.arange(lorenz.size)/(lorenz.size-1))
  print(f'{label} AUC: {auc}')

for sample, label in zip(rep2, rep2_labels):
  # Same construction as for replicate 1: sorted values, leading 0.
  ordered = np.sort(sample, axis=None)
  lorenz = np.insert(ordered.cumsum() / ordered.sum(), 0, 0)
  auc = _trapezoid(y=lorenz, x=np.arange(lorenz.size)/(lorenz.size-1))
  print(f'{label} AUC: {auc}')
