### Import libraries

In [2]:
from scipy.io import mmread
import pandas as pd
import numpy as np
import gzip
from joblib import dump, load
from utils.slice_data_HVG import slice_data_HVG
from utils.data_pretreatment import preprocess_sparse_matrix
from utils.sample_row_ind import sampled_ind_matrix

### Load raw gene expression data, transpose and row-compress. Load metadata.

In [3]:
%%time
# Read the Matrix Market file, swap its axes, and keep it in CSR
# (row-compressed) form so that row slicing further down is cheap.
# NOTE(review): this is an absolute, machine-specific path, while the metadata
# below is read from a relative path -- confirm where the matrix should live.
data_path = '/home/luana/workspace/data/matrix.mtx'
data_sparse = mmread(data_path).T.tocsr()

# Load the accompanying metadata table (gzip-compressed CSV).
metadata = pd.read_csv("data/2097-Lungcancer_metadata.csv.gz")

# Quick sanity check: matrix dimensions, metadata preview, and cell-type labels.
print('Shape of the sparse data matrix is', data_sparse.shape)
print(metadata)
print('Unique Cell Types in metadata:', metadata['CellType'].unique())

Shape of the sparse data matrix is (93575, 33694)
                             Cell  nGene  nUMI  CellFromTumor  PatientNumber  \
0           BT1238_AAATCAACTGCCTC    897  3227           True              1   
1           BT1238_AACATTGACCTAAG    509   731           True              1   
2           BT1238_AACCAGTGCTTAGG    642  2958           True              1   
3           BT1238_AACCTACTCGCTAA    925  2781           True              1   
4           BT1238_AACTCTTGCTGTAG    713  3000           True              1   
...                           ...    ...   ...            ...            ...   
93570  scrBT1432_TTTGGTTCATTCTCAT   1419  5192           True              8   
93571  scrBT1432_TTTGGTTGTTGGTGGA    398   585           True              8   
93572  scrBT1432_TTTGTCACACATGTGT    625  1760           True              8   
93573  scrBT1432_TTTGTCAGTACGAAAT    284   491           True              8   
93574  scrBT1432_TTTGTCATCGCGGATC    766  6131           True         

### Downsample gene expression data by rows 

In [4]:
# Generate the row indices used to downsample both the expression matrix and
# the metadata. 40000 samples are requested from the project helper.
# NOTE(review): the downsampled shapes printed further down report 39970 rows,
# not 40000 -- presumably the helper rounds per group; confirm in
# utils/sample_row_ind.
# NOTE(review): no seed is passed here; if the helper samples randomly, the
# selected rows may differ between runs -- TODO confirm.
ind_rows_downsample = sampled_ind_matrix(metadata = metadata, nbr_samples=40000)
# Preview only the first ten indices rather than the whole list.
print(ind_rows_downsample[:10])

[944, 1018, 1407, 131, 1019, 202, 854, 691, 940, 973]


In [5]:
# Apply the generated row indices to both the raw count matrix and the
# metadata so that the two stay aligned row-for-row.
downsampled_sparse = data_sparse[ind_rows_downsample]
metadata_sampled = metadata.iloc[ind_rows_downsample]

# Both shapes should report the same number of rows.
print('Shape of Downsampled gene expression data: ', downsampled_sparse.shape)
print('Shape of Metadata: ', metadata_sampled.shape)

Shape of Downsampled gene expression data:  (39970, 33694)
Shape of Metadata:  (39970, 8)


In [15]:
# Persist the downsampled outputs so later steps can start from disk:
#   - the sparse matrix as a joblib dump, gzip-compressed at level 3
#   - the matching metadata rows as a CSV without the index column
dump(
    value=downsampled_sparse,
    filename='data/downsampled_40000_sparse_gzip',
    compress=('gzip', 3),
)
metadata_sampled.to_csv(path_or_buf='data/metadata_sampled.csv', index=False)

In [7]:
# Load back
#downsampled_sparse_loaded = load('data/downsampled_40000_sparse_gzip')
#print(downsampled_sparse_loaded.shape)

### Slice sparse data by top HVG

In [8]:
# Keep only the top 10% of genes (perc_top_genes=0.1) as selected by the
# project helper slice_data_HVG.
data_sp_csr_HVG = slice_data_HVG(downsampled_sparse, perc_top_genes=0.1)

# Report the reduced shape and confirm the container type of the result.
print(data_sp_csr_HVG.shape)
print(type(data_sp_csr_HVG))

(39970, 3369)
<class 'scipy.sparse._csr.csr_matrix'>


### Data pre-treatment

In [9]:
%%time
# Preprocess the downsampled data after it has been sliced to the top 10% HVG
# (the slice above uses perc_top_genes=0.1).
# NOTE(review): the variable name says "4000" although 40000 samples were
# requested during downsampling -- the name is kept because later cells use it.
data_preprocessed_4000_10HVG = preprocess_sparse_matrix(data_sp_csr_HVG)
# The shape should be unchanged from the HVG-sliced input.
print(data_preprocessed_4000_10HVG.shape)

(39970, 3369)
CPU times: user 2.13 s, sys: 896 ms, total: 3.03 s
Wall time: 3.04 s


In [10]:
#%%time
# Preprocess downsampled data, all genes
#data_preprocessed_4000 = preprocess_sparse_matrix(downsampled_sparse)
#print(data_preprocessed_4000.shape)

In [13]:
# Save the preprocessed array as a gzip-compressed .npy stream.
# The context manager guarantees the gzip stream is flushed and closed even if
# np.save raises part-way through; opening and closing by hand would leak the
# handle and leave a truncated archive in that case.
with gzip.GzipFile("data/data_preprocessed_4000_10HVG", "w") as f:
    np.save(file=f, arr=data_preprocessed_4000_10HVG)

In [12]:
# Load back
#f = gzip.GzipFile('data/data_preprocessed_4000_10HVG', "r"); a = np.load(f)