In [1]:
import MATES
from MATES import bam_processor
from MATES import data_processor
from MATES import MATES_model
from MATES import TE_quantifier

## Smart-seq2 Data

#### Example of the required input files

In [3]:
%%bash
# Print a label, then the first five lines of the given input file.
preview() { echo -e "$1"; head -n 5 "$2"; }
preview 'sample_list_file:' sample_list_subset.txt
preview '\n\nbam_path_file:' bam_path_file_subset.txt

sample_list_file:
SRR3934350
SRR3934360
SRR3934371
SRR3934376
SRR3934387


bam_path_file:
../smart_seq/STAR_Solo_subset/SRR3934350/SRR3934350_Aligned.sortedByCoord.out.bam
../smart_seq/STAR_Solo_subset/SRR3934360/SRR3934360_Aligned.sortedByCoord.out.bam
../smart_seq/STAR_Solo_subset/SRR3934371/SRR3934371_Aligned.sortedByCoord.out.bam
../smart_seq/STAR_Solo_subset/SRR3934376/SRR3934376_Aligned.sortedByCoord.out.bam
../smart_seq/STAR_Solo_subset/SRR3934387/SRR3934387_Aligned.sortedByCoord.out.bam


#### Get TE reference

In [None]:
python3 build_reference.py --species Human

### Step 1 Splitting BAM files

In [None]:
# Split the BAM files listed in bam_path_file_subset.txt for the Smart-seq samples.
bam_processor.split_bam_files(
    'Smart_seq',
    20,  # presumably the number of threads -- confirm against the MATES docs
    'sample_list_subset.txt',
    'bam_path_file_subset.txt',
)

### Step 2 Counting coverage vectors

In [None]:
# Count coverage vectors for every sample in the Smart-seq sample list.
bam_processor.count_coverage_vec(
    'exclusive',
    'Smart_seq',
    'sample_list_subset.txt',
)

### Step 3 Generating training and prediction samples

In [None]:
# Calculate the U/M regions for the Smart-seq samples. bin_size and proportion
# are named explicitly, mirroring the keyword form used by the 10X cells.
data_processor.calculate_UM_region(
    'exclusive',
    'Smart_seq',
    'sample_list_subset.txt',
    bin_size=5,
    proportion=80,
)

In [None]:
# Generate the training samples, using the same bin_size / proportion as the
# U/M region calculation.
data_processor.generate_training_sample(
    'Smart_seq',
    'sample_list_subset.txt',
    bin_size=5,
    proportion=80,
)

In [None]:
# Generate the prediction samples, using the same bin_size / proportion as the
# U/M region calculation.
data_processor.generate_prediction_sample(
    'exclusive',
    'Smart_seq',
    'sample_list_subset.txt',
    bin_size=5,
    proportion=80,
)


### Step 4 Training the model and making predictions

In [None]:
# Train the MATES model on the Smart-seq samples.
# NOTE(review): AE_EPOCHS / MLP_EPOCHS presumably need to agree with the
# *_trained_epochs values passed to the prediction calls -- confirm.
MATES_model.train(
    'Smart_seq',
    'sample_list_subset.txt',
    bin_size=5,
    proportion=80,
    BATCH_SIZE=512,
    AE_LR=1e-4,
    MLP_LR=1e-6,
    AE_EPOCHS=200,
    MLP_EPOCHS=200,
    DEVICE='cuda:0',
)

In [None]:
# Run prediction for the Smart-seq samples with the model trained for
# 200 AE epochs and 200 MLP epochs.
MATES_model.prediction(
    'exclusive',
    'Smart_seq',
    'sample_list_subset.txt',
    bin_size=5,
    proportion=80,
    AE_trained_epochs=200,
    MLP_trained_epochs=200,
    DEVICE='cuda:0',
)

### Step 5 Quantifying TEs

In [None]:
# Build the unique-read TE matrix for the Smart-seq samples.
TE_quantifier.unique_TE_MTX(
    'exclusive',
    'Smart_seq',
    'sample_list_subset.txt',
    20,  # presumably the number of threads -- confirm against the MATES docs
)

In [None]:
# Finalize the TE matrix for the Smart-seq run. Unlike the 10X sections of this
# notebook, no sample list file is passed here.
TE_quantifier.finalize_TE_MTX('Smart_seq')

### Step 6 Quantifying locus-level TEs

In [None]:
# Locus-level prediction for the Smart-seq samples. It uses the same sample list
# as every earlier Smart-seq step, so it only targets samples that were actually
# split, counted and used for training.
MATES_model.prediction_locus(
    'exclusive',
    'Smart_seq',
    'sample_list_subset.txt',
    bin_size=5,
    proportion=80,
    AE_trained_epochs=200,
    MLP_trained_epochs=200,
    DEVICE='cuda:0',
)

In [None]:
# Quantify locus-level TEs for the Smart-seq samples, using the same sample list
# as every earlier Smart-seq step in this section.
TE_quantifier.quantify_locus_TE_MTX(
    'exclusive',
    'Smart_seq',
    'sample_list_subset.txt',
)

## 10X Data

#### Example of the required input files

In [2]:
%%bash
# Print a label, then the full contents of the given input file.
show() { echo -e "$1"; cat "$2"; }
show 'sample_list_file:' sample_list_file.txt
show '\n\nbam_path_file:' bam_path_file.txt
show '\n\nbc_path_file:' bc_path_file.txt

sample_list_file:
mouse_brain

bam_path_file:
/home/roxanne/scratch/MATES/mouse_brain_Aligned.sortedByCoord.out.bam

bc_path_file:
STAR_Solo/mouse_brain/mouse_brain_Solo.out/Gene/filtered/barcodes.tsv

#### Get TE reference

In [None]:
python3 build_reference.py --species Mouse

### Step 1 Splitting and counting 10X data

In [None]:
# Split and count the 10X data, given the sample list, BAM path list and
# barcode path list files.
bam_processor.split_count_10X_data(
    'exclusive',
    'sample_list_file.txt',
    'bam_path_file.txt',
    'bc_path_file.txt',
)

### Step 2 Generating training and prediction samples

In [None]:
# Calculate the U/M regions for the 10X sample; the barcode path file is passed
# in addition to the sample list.
data_processor.calculate_UM_region(
    'exclusive',
    '10X',
    'sample_list_file.txt',
    bin_size=5,
    proportion=80,
    bc_path_file='bc_path_file.txt',
)

In [None]:
# Generate the training samples for the 10X sample.
data_processor.generate_training_sample(
    '10X',
    'sample_list_file.txt',
    bin_size=5,
    proportion=80,
)

In [None]:
# Generate the prediction samples for the 10X sample; the barcode path file is
# passed in addition to the sample list.
data_processor.generate_prediction_sample(
    'exclusive',
    '10X',
    'sample_list_file.txt',
    bin_size=5,
    proportion=80,
    bc_path_file='bc_path_file.txt',
)

### Step 3 Training the model and making predictions

In [None]:
# Train the MATES model on the 10X sample.
# NOTE(review): AE_EPOCHS / MLP_EPOCHS presumably need to agree with the
# *_trained_epochs values passed to the prediction calls -- confirm.
MATES_model.train(
    '10X',
    'sample_list_file.txt',
    bin_size=5,
    proportion=80,
    BATCH_SIZE=256,
    AE_LR=1e-4,
    MLP_LR=1e-6,
    AE_EPOCHS=200,
    MLP_EPOCHS=200,
    DEVICE='cuda:0',
)

In [None]:
# Run prediction for the 10X sample with the model trained for
# 200 AE epochs and 200 MLP epochs.
MATES_model.prediction(
    'exclusive',
    '10X',
    'sample_list_file.txt',
    bin_size=5,
    proportion=80,
    AE_trained_epochs=200,
    MLP_trained_epochs=200,
    DEVICE='cuda:0',
)

### Step 4 Quantifying TEs

In [None]:
# Build the unique-read TE matrix for the 10X sample.
TE_quantifier.unique_TE_MTX(
    'exclusive',
    '10X',
    'sample_list_file.txt',
    20,  # presumably the number of threads -- confirm against the MATES docs
    bc_path_file='bc_path_file.txt',
)

In [None]:
# Finalize the TE matrix for the 10X run; the 10X mode is given the sample list.
TE_quantifier.finalize_TE_MTX(
    '10X',
    'sample_list_file.txt',
)

### Step 5 Quantifying locus-level TEs

In [None]:
# Locus-level prediction for the 10X sample, with the same settings as the
# prediction call in Step 3.
MATES_model.prediction_locus(
    'exclusive',
    '10X',
    'sample_list_file.txt',
    bin_size=5,
    proportion=80,
    AE_trained_epochs=200,
    MLP_trained_epochs=200,
    DEVICE='cuda:0',
)

In [None]:
# Quantify locus-level TEs for the 10X sample.
TE_quantifier.quantify_locus_TE_MTX(
    'exclusive',
    '10X',
    'sample_list_file.txt',
)

## 10X Multi-omics Data

#### Example of the required input files

In [4]:
%%bash
# Print a label, then the full contents of the given input file.
show() { echo -e "$1"; cat "$2"; }
show 'sample_list_file:' sample_MultiOmics.txt
show '\n\nbam_path_file:' bam_path_MultiOmics.txt
show '\n\nbc_path_file:' barcode_path_MultiOmics.txt

sample_list_file:
pbmc_scRNA
pbmc_scATAC

bam_path_file:
/home/roxanne/scratch/pbmc_granulocyte_sorted_10k/gex/STAR_Solo/S1/S1_Aligned.sortedByCoord.out.bam
/home/roxanne/scratch/pbmc_granulocyte_sorted_10k/atac/STAR_Solo/S16/S16_Aligned.sortedByCoord.out.bam

bc_path_file:
/home/roxanne/scratch/pbmc_granulocyte_sorted_10k/gex/STAR_Solo/S1/S1_Solo.out/Gene/filtered/barcodes.tsv
/home/roxanne/scratch/pbmc_granulocyte_sorted_10k/atac/STAR_Solo/S16/S16_Solo.out/Gene/filtered/barcodes.tsv

#### Get TE reference

In [None]:
python3 build_reference.py --species Human

### Step 1 Splitting and counting 10X data

In [None]:
# Split and count the multi-omics data (one scRNA and one scATAC sample), given
# the sample list, BAM path list and barcode path list files.
bam_processor.split_count_10X_data(
    'exclusive',
    'sample_MultiOmics.txt',
    'bam_path_MultiOmics.txt',
    'barcode_path_MultiOmics.txt',
)

### Step 2 Generating training and prediction samples

In [None]:
# Calculate the U/M regions for the multi-omics samples; the barcode path file
# is passed in addition to the sample list.
data_processor.calculate_UM_region(
    'exclusive',
    '10X',
    'sample_MultiOmics.txt',
    bin_size=5,
    proportion=80,
    bc_path_file='barcode_path_MultiOmics.txt',
)

In [None]:
# Generate the training samples for the multi-omics samples.
data_processor.generate_training_sample(
    '10X',
    'sample_MultiOmics.txt',
    bin_size=5,
    proportion=80,
)

In [None]:
# Generate the prediction samples for the multi-omics samples; the barcode path
# file is passed in addition to the sample list.
data_processor.generate_prediction_sample(
    'exclusive',
    '10X',
    'sample_MultiOmics.txt',
    bin_size=5,
    proportion=80,
    bc_path_file='barcode_path_MultiOmics.txt',
)

### Step 3 Training the model and making predictions

In [None]:
# Train the MATES model on the multi-omics samples.
# NOTE(review): AE_EPOCHS / MLP_EPOCHS presumably need to agree with the
# *_trained_epochs values passed to the prediction calls -- confirm.
MATES_model.train(
    '10X',
    'sample_MultiOmics.txt',
    bin_size=5,
    proportion=80,
    BATCH_SIZE=4096,
    AE_LR=1e-4,
    MLP_LR=1e-6,
    AE_EPOCHS=200,
    MLP_EPOCHS=200,
    DEVICE='cuda:0',
)

In [None]:
# Run prediction for the multi-omics samples with the model trained for
# 200 AE epochs and 200 MLP epochs.
MATES_model.prediction(
    'exclusive',
    '10X',
    'sample_MultiOmics.txt',
    bin_size=5,
    proportion=80,
    AE_trained_epochs=200,
    MLP_trained_epochs=200,
    DEVICE='cuda:0',
)

### Step 4 Quantifying TEs

In [None]:
# Build the unique-read TE matrix for the multi-omics samples.
TE_quantifier.unique_TE_MTX(
    'exclusive',
    '10X',
    'sample_MultiOmics.txt',
    20,  # presumably the number of threads -- confirm against the MATES docs
    bc_path_file='barcode_path_MultiOmics.txt',
)

In [None]:
# Finalize the TE matrix for the multi-omics run; the 10X mode is given the
# sample list.
TE_quantifier.finalize_TE_MTX(
    '10X',
    'sample_MultiOmics.txt',
)

### Step 5 Quantifying locus-level TEs

In [None]:
# Locus-level prediction for the multi-omics samples, with the same settings as
# the prediction call in Step 3.
MATES_model.prediction_locus(
    'exclusive',
    '10X',
    'sample_MultiOmics.txt',
    bin_size=5,
    proportion=80,
    AE_trained_epochs=200,
    MLP_trained_epochs=200,
    DEVICE='cuda:0',
)

In [None]:
# Quantify locus-level TEs for the multi-omics samples.
TE_quantifier.quantify_locus_TE_MTX(
    'exclusive',
    '10X',
    'sample_MultiOmics.txt',
)