In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
from pathlib import Path

In [8]:
"""
Notebook 03: Single-Cell Data Loading & Quality Control
========================================================

This notebook loads 10X Genomics MTX format data from the University cluster,
performs quality control analysis, and generates summary statistics for each sample.

The pipeline is designed to be:
- User-friendly: Clear parameters, informative output
- Modular: Functions can be reused in downstream analyses
- Scalable: Works with single sample or batch processing
- Production-ready: Error handling, logging, and validation

This version uses your TheBigBoss_enhanced.csv metadata file to properly
map GEO IDs to actual filenames on the cluster.

Author: Alexandra Rolya
"""

# ============================================================================
# IMPORTS & SETUP
# ============================================================================

from pathlib import Path
from typing import Tuple, Dict
import pandas as pd
import numpy as np
import scanpy as sc
from dataclasses import dataclass
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

In [9]:

# ============================================================================
# CONFIGURATION
# ============================================================================

@dataclass
class Config:
    """Configuration for data loading and QC pipeline"""
    
    # Cluster paths
    DATA_FOLDER: Path = Path("/triumvirate/home/alexarol/breast_cancer_analysis/data/GSE161529_RAW")
    FEATURES_FILE: Path = Path("/triumvirate/home/alexarol/breast_cancer_analysis/data/GSE161529_features.tsv")
    RESULTS_FOLDER: Path = Path("/triumvirate/home/alexarol/breast_cancer_analysis/results")
    METADATA_FILE: Path = Path("/triumvirate/home/alexarol/breast_cancer_analysis/results/TheBigBoss_enhanced.csv")
    
    # Pipeline parameters
    TEST_MODE: bool = True  # ← SET TO False TO RUN ALL SAMPLES
    N_TEST_SAMPLES: int = 3  # Number of samples to process in TEST_MODE
    COMPRESSION: str = 'gzip'  # For saving h5ad files
    VERBOSE: bool = True
    
    def __post_init__(self):
        """Validate and create output directories"""
        if not self.DATA_FOLDER.exists():
            raise FileNotFoundError(f"Data folder not found: {self.DATA_FOLDER}")
        if not self.METADATA_FILE.exists():
            raise FileNotFoundError(f"Metadata file not found: {self.METADATA_FILE}")
        
        self.RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)
    
    def summary(self) -> str:
        """Print configuration summary"""
        mode = "TEST MODE" if self.TEST_MODE else "FULL BATCH MODE"
        return f"""
{'='*70}
PIPELINE CONFIGURATION
{'='*70}
Mode:              {mode}
Data folder:       {self.DATA_FOLDER}
Results folder:    {self.RESULTS_FOLDER}
Metadata file:     {self.METADATA_FILE}
Features file:     {self.FEATURES_FILE}
Compression:       {self.COMPRESSION}
{'='*70}
"""

In [3]:

# ============================================================================
# DATA DISCOVERY
# ============================================================================

def discover_samples(data_folder: Path) -> list:
    """
    Discover all available 10X MTX samples in data folder.
    
    Parameters
    ----------
    data_folder : Path
        Path to folder containing MTX files
    
    Returns
    -------
    list
        Sorted list of sample names
    
    Example
    -------
    >>> samples = discover_samples(Path("/triumvirate/home/alexarol/data"))
    >>> print(f"Found {len(samples)} samples")
    """
    matrix_files = sorted(data_folder.glob("*-matrix.mtx.gz"))
    
    if not matrix_files:
        raise FileNotFoundError(f"No MTX files found in {data_folder}")
    
    # Extract unique sample names (everything before "-matrix.mtx.gz")
    samples = sorted(set([f.name.rsplit('-', 2)[0] for f in matrix_files]))
    
    return samples

In [10]:
# ============================================================================
# METADATA HANDLING
# ============================================================================

def load_metadata(metadata_file: Path) -> pd.DataFrame:
    """
    Load metadata file containing sample information.
    
    Parameters
    ----------
    metadata_file : Path
        Path to TheBigBoss_enhanced.csv
    
    Returns
    -------
    pd.DataFrame
        Metadata with columns: GEO_ID, MatrixFile, BarcodesFile, SampleName, etc.
    """
    metadata = pd.read_csv(metadata_file)
    return metadata

def get_samples_from_metadata(metadata: pd.DataFrame) -> list:
    """
    Extract list of GEO IDs from metadata.
    
    Parameters
    ----------
    metadata : pd.DataFrame
        Loaded metadata
    
    Returns
    -------
    list
        List of GEO_ID values
    """
    return metadata['GEO_ID'].tolist()

def get_file_paths(geo_id: str, metadata: pd.DataFrame, data_folder: Path) -> Tuple[Path, Path]:
    """
    Get matrix and barcodes file paths for a given GEO ID.
    
    Parameters
    ----------
    geo_id : str
        GEO ID (e.g., 'GSM4909253')
    metadata : pd.DataFrame
        Loaded metadata
    data_folder : Path
        Path to data folder
    
    Returns
    -------
    tuple
        (matrix_file_path, barcodes_file_path)
    
    Raises
    ------
    ValueError
        If GEO ID not found in metadata
    FileNotFoundError
        If files don't exist
    """
    # Find row in metadata
    row = metadata[metadata['GEO_ID'] == geo_id]
    if row.empty:
        raise ValueError(f"GEO ID {geo_id} not found in metadata")
    
    # Get filenames
    matrix_filename = row.iloc[0]['MatrixFile']
    barcodes_filename = row.iloc[0]['BarcodesFile']
    
    # Construct full paths
    matrix_file = data_folder / matrix_filename
    barcodes_file = data_folder / barcodes_filename
    
    # Validate files exist
    if not matrix_file.exists():
        raise FileNotFoundError(f"Matrix file not found: {matrix_file}")
    if not barcodes_file.exists():
        raise FileNotFoundError(f"Barcodes file not found: {barcodes_file}")
    
    return matrix_file, barcodes_file

In [None]:
# ============================================================================
# DATA LOADING
# ============================================================================

def load_sample(
    geo_id: str,
    metadata: pd.DataFrame,
    data_folder: Path,
    features_file: Path,
    verbose: bool = True
) -> sc.AnnData:
    """
    Load a single 10X MTX sample into AnnData format.
    
    Parameters
    ----------
    geo_id : str
        GEO ID of sample (e.g., 'GSM4909253')
    metadata : pd.DataFrame
        Loaded metadata dataframe
    data_folder : Path
        Path to folder containing MTX files
    features_file : Path
        Path to features.tsv file
    verbose : bool, default=True
        Print progress messages
    
    Returns
    -------
    sc.AnnData
        Loaded single-cell expression matrix with metadata
    """
    
    if verbose:
        print(f"  Loading matrix and barcodes...")
    
    # Get file paths from metadata
    matrix_file, barcodes_file = get_file_paths(geo_id, metadata, data_folder)
    
    # Load sparse matrix and transpose
    adata = sc.read_mtx(matrix_file).T
    
    # Load cell barcodes
    adata.obs_names = pd.read_csv(barcodes_file, header=None, compression='gzip')[0]
    
    # Load gene features
    features_df = pd.read_csv(features_file, sep='\t', header=None)
    adata.var_names = features_df[0]  # Gene symbols
    adata.var['gene_id'] = features_df[1]  # Ensembl IDs
    
    # Add metadata
    sample_info = metadata[metadata['GEO_ID'] == geo_id].iloc[0]
    adata.obs['sample_name'] = sample_info['SampleName']
    adata.obs['sample_type'] = sample_info['SampleType']
    
    if verbose:
        print(f"  ✓ Loaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")
    
    return adata

In [None]:
# ============================================================================
# QUALITY CONTROL METRICS
# ============================================================================

def calculate_qc_metrics(adata: sc.AnnData) -> Dict[str, any]:
    """Calculate comprehensive quality control metrics for a sample."""
    
    # Per-cell metrics
    genes_per_cell = np.asarray((adata.X > 0).sum(axis=1)).flatten()
    umis_per_cell = np.asarray(adata.X.sum(axis=1)).flatten()
    
    # Per-gene metrics
    cells_per_gene = np.asarray((adata.X > 0).sum(axis=0)).flatten()
    gene_totals = np.asarray(adata.X.sum(axis=0)).flatten()
    
    # Sparsity
    sparsity = 1 - adata.X.nnz / (adata.n_obs * adata.n_vars)
    
    metrics = {
        'n_cells': adata.n_obs,
        'n_genes': adata.n_vars,
        'total_umis': int(adata.X.sum()),
        'genes_per_cell_mean': genes_per_cell.mean(),
        'genes_per_cell_median': np.median(genes_per_cell),
        'genes_per_cell_std': genes_per_cell.std(),
        'umis_per_cell_mean': umis_per_cell.mean(),
        'umis_per_cell_median': np.median(umis_per_cell),
        'umis_per_cell_std': umis_per_cell.std(),
        'genes_expressed': (cells_per_gene > 0).sum(),
        'sparsity_pct': 100 * sparsity,
        'genes_per_cell': genes_per_cell,
        'umis_per_cell': umis_per_cell,
        'cells_per_gene': cells_per_gene,
        'gene_totals': gene_totals
    }
    
    return metrics

def format_qc_table(geo_id: str, metrics: Dict) -> pd.DataFrame:
    """Format QC metrics into a readable table."""
    
    qc_table = pd.DataFrame({
        'Metric': [
            'Cells',
            'Genes',
            'Total UMIs',
            'Mean genes/cell',
            'Median genes/cell',
            'Mean UMIs/cell',
            'Median UMIs/cell',
            'Genes expressed',
            'Sparsity (%)'
        ],
        'Value': [
            f"{metrics['n_cells']:,}",
            f"{metrics['n_genes']:,}",
            f"{metrics['total_umis']:,}",
            f"{metrics['genes_per_cell_mean']:.1f}",
            f"{metrics['genes_per_cell_median']:.1f}",
            f"{metrics['umis_per_cell_mean']:.1f}",
            f"{metrics['umis_per_cell_median']:.1f}",
            f"{metrics['genes_expressed']:,} / {metrics['n_genes']:,}",
            f"{metrics['sparsity_pct']:.1f}"
        ]
    })
    
    return qc_table

def get_top_genes(adata: sc.AnnData, metrics: Dict, top_n: int = 20) -> pd.DataFrame:
    """Get top N most expressed genes in sample."""
    
    gene_totals = metrics['gene_totals']
    cells_per_gene = metrics['cells_per_gene']
    
    top_idx = np.argsort(gene_totals)[::-1][:top_n]
    
    top_genes_df = pd.DataFrame({
        'Gene': adata.var_names[top_idx],
        'UMIs': gene_totals[top_idx].astype(int),
        'Cells': cells_per_gene[top_idx].astype(int),
        '% Cells': (100 * cells_per_gene[top_idx] / adata.n_obs).round(1)
    })
    
    return top_genes_df

In [13]:
# ============================================================================
# FILE OUTPUT
# ============================================================================

def save_results(
    geo_id: str,
    adata: sc.AnnData,
    qc_table: pd.DataFrame,
    top_genes: pd.DataFrame,
    results_folder: Path,
    compression: str = 'gzip',
    verbose: bool = True
) -> Dict[str, Path]:
    """Save analysis results to disk."""
    
    output_files = {
        'qc_summary': results_folder / f"{geo_id}_summary.csv",
        'top_genes': results_folder / f"{geo_id}_top_genes.csv",
        'anndata': results_folder / f"{geo_id}.h5ad"
    }
    
    qc_table.to_csv(output_files['qc_summary'], index=False)
    top_genes.to_csv(output_files['top_genes'], index=False)
    adata.write_h5ad(output_files['anndata'], compression=compression)
    
    if verbose:
        print(f"  ✓ QC summary: {output_files['qc_summary'].name}")
        print(f"  ✓ Top genes: {output_files['top_genes'].name}")
        print(f"  ✓ AnnData object: {output_files['anndata'].name}")
    
    return output_files

In [None]:
# ============================================================================
# MAIN PIPELINE
# ============================================================================

def process_sample(
    geo_id: str,
    metadata: pd.DataFrame,
    config: Config
) -> Tuple[bool, Dict]:
    """Complete pipeline for a single sample."""
    
    sample_name = metadata[metadata['GEO_ID'] == geo_id].iloc[0]['SampleName']
    print(f"\n  Processing: {geo_id} ({sample_name})")
    
    try:
        # Load data
        adata = load_sample(
            geo_id,
            metadata,
            config.DATA_FOLDER,
            config.FEATURES_FILE,
            verbose=config.VERBOSE
        )
        
        # Calculate QC metrics
        metrics = calculate_qc_metrics(adata)
        qc_table = format_qc_table(geo_id, metrics)
        top_genes = get_top_genes(adata, metrics, top_n=20)
        
        # Save results
        output_files = save_results(
            geo_id,
            adata,
            qc_table,
            top_genes,
            config.RESULTS_FOLDER,
            config.COMPRESSION,
            verbose=config.VERBOSE
        )
        
        return True, {
            'geo_id': geo_id,
            'sample_name': sample_name,
            'adata': adata,
            'metrics': metrics,
            'qc_table': qc_table,
            'top_genes': top_genes,
            'output_files': output_files
        }
    
    except Exception as e:
        print(f"  ✗ ERROR: {str(e)}")
        return False, {'geo_id': geo_id, 'error': str(e)}

def main(config: Config = None):
    """Main pipeline execution."""
    
    if config is None:
        config = Config()
    
    # Print header
    print(config.summary())
    
    # Load metadata
    print("Loading metadata...")
    metadata = load_metadata(config.METADATA_FILE)
    all_samples = get_samples_from_metadata(metadata)
    print(f"✓ Found {len(all_samples)} samples in metadata\n")
    
    # Subset if in TEST_MODE
    if config.TEST_MODE:
        samples_to_process = all_samples[:config.N_TEST_SAMPLES]
        print(f"  TEST MODE: Processing {len(samples_to_process)} of {len(all_samples)} samples\n")
    else:
        samples_to_process = all_samples
        print(f"Processing all {len(samples_to_process)} samples...\n")
    
    # Process samples
    print(f"{'='*70}")
    print(f"PROCESSING SAMPLES")
    print(f"{'='*70}")
    
    all_results = []
    success_count = 0
    
    for idx, geo_id in enumerate(samples_to_process, 1):
        print(f"\n[{idx}/{len(samples_to_process)}]")
        
        success, result = process_sample(geo_id, metadata, config)
        all_results.append(result)
        
        if success:
            success_count += 1
    
    # Summary
    print(f"\n{'='*70}")
    print(f"PIPELINE COMPLETE")
    print(f"{'='*70}")
    print(f"Successfully processed: {success_count}/{len(samples_to_process)} samples")
    
    # Combined summary table
    if success_count > 0:
        summary_data = []
        for result in all_results:
            if 'error' not in result:
                summary_data.append({
                    'GEO_ID': result['geo_id'],
                    'Sample': result['sample_name'],
                    'Cells': result['metrics']['n_cells'],
                    'Genes': result['metrics']['n_genes'],
                    'Total UMIs': result['metrics']['total_umis'],
                    'Mean genes/cell': result['metrics']['genes_per_cell_mean'],
                    'Sparsity (%)': result['metrics']['sparsity_pct']
                })
        
        combined_summary = pd.DataFrame(summary_data)
        combined_summary.to_csv(
            config.RESULTS_FOLDER / "all_samples_summary.csv",
            index=False
        )
        
        print(f"\nCombined Summary:\n")
        print(combined_summary.to_string(index=False))
        print(f"\n✓ Combined summary saved to: {config.RESULTS_FOLDER / 'all_samples_summary.csv'}")
    
    print(f"{'='*70}\n")
    

In [15]:
# ============================================================================
# EXECUTION
# ============================================================================

if __name__ == "__main__":
    # Initialize configuration
    config = Config(
        TEST_MODE=True,  # ← CHANGE TO False TO RUN ALL SAMPLES
        N_TEST_SAMPLES=3,
        VERBOSE=True
    )
    
    # Run pipeline
    main(config)


PIPELINE CONFIGURATION
Mode:              TEST MODE
Data folder:       /triumvirate/home/alexarol/breast_cancer_analysis/data/GSE161529_RAW
Results folder:    /triumvirate/home/alexarol/breast_cancer_analysis/results
Metadata file:     /triumvirate/home/alexarol/breast_cancer_analysis/results/TheBigBoss_enhanced.csv
Features file:     /triumvirate/home/alexarol/breast_cancer_analysis/data/GSE161529_features.tsv
Compression:       gzip

Loading metadata...
✓ Found 69 samples in metadata

  TEST MODE: Processing 3 of 69 samples

PROCESSING SAMPLES

[1/3]

  Processing: GSM4909253 (N-0092-total)
  Loading matrix and barcodes...
  ✓ Loaded: 4,966 cells × 33,538 genes
  ✗ ERROR: Can't implicitly convert non-string objects to strings

[2/3]

  Processing: GSM4909254 (N-0019-total)
  Loading matrix and barcodes...
  ✓ Loaded: 7,130 cells × 33,538 genes
  ✗ ERROR: Can't implicitly convert non-string objects to strings

[3/3]

  Processing: GSM4909255 (N-0280-epi)
  Loading matrix and barcodes

In [None]:
DATA_FOLDER = Path("/Users/alex/Desktop/breast_cancer_analysis/data/GSE161529_RAW")
FEATURES_FILE = Path("/Users/alex/Desktop/breast_cancer_analysis/data/GSE161529_features.tsv")
SAMPLE_NAME = "GSM4909276_N-PM0372-Total"  # Just the sample name

print(f"\n{'='*70}")
print(f"SAMPLE: {SAMPLE_NAME}")
print(f"{'='*70}\n")

matrix_file = DATA_FOLDER / f"{SAMPLE_NAME}-matrix.mtx.gz"
barcodes_file = DATA_FOLDER / f"{SAMPLE_NAME}-barcodes.tsv.gz"

if not matrix_file.exists() or not barcodes_file.exists():
    print(f"Files not found. Available samples:")
    for f in sorted(set([f.name.rsplit('-', 2)[0] for f in DATA_FOLDER.glob("*-matrix.mtx.gz")]))[:10]:
        print(f"  - {f}")
    exit()

adata = sc.read_mtx(matrix_file).T
adata.obs_names = pd.read_csv(barcodes_file, header=None, compression='gzip')[0]
adata.var_names = pd.read_csv(FEATURES_FILE, sep='\t', header=None)[0]
adata.var['gene_id'] = pd.read_csv(FEATURES_FILE, sep='\t', header=None)[1]

print(f"✓ Loaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes\n")

genes_per_cell = (adata.X > 0).sum(axis=1).A1
umis_per_cell = adata.X.sum(axis=1).A1
cells_per_gene = (adata.X > 0).sum(axis=0).A1
gene_totals = adata.X.sum(axis=0).A1

#Summary
stats = pd.DataFrame({
    'Metric': ['Cells', 'Genes', 'Total UMIs', 'Mean genes/cell', 'Median genes/cell',
               'Mean UMIs/cell', 'Median UMIs/cell', 'Genes expressed', 'Sparsity (%)'],
    'Value': [f"{adata.n_obs:,}", f"{adata.n_vars:,}", f"{adata.X.sum():.0f}",
              f"{genes_per_cell.mean():.1f}", f"{np.median(genes_per_cell):.1f}",
              f"{umis_per_cell.mean():.1f}", f"{np.median(umis_per_cell):.1f}",
              f"{(cells_per_gene > 0).sum():,} / {adata.n_vars:,}",
              f"{100 * (1 - adata.X.nnz / (adata.n_obs * adata.n_vars)):.1f}"]
})

print(stats.to_string(index=False))

#Top genes
top_20_idx = np.argsort(gene_totals)[::-1][:20]
top_genes = pd.DataFrame({
    'Gene': adata.var_names[top_20_idx],
    'UMIs': gene_totals[top_20_idx].astype(int),
    'Cells': cells_per_gene[top_20_idx].astype(int),
    '% Cells': (100 * cells_per_gene[top_20_idx] / adata.n_obs).round(1)
})

print(f"\n{'TOP 20 EXPRESSED GENES':-^70}")
print(top_genes.to_string(index=False))

Path("../results").mkdir(exist_ok=True)
stats.to_csv("../results/summary.csv", index=False)
top_genes.to_csv("../results/top_genes.csv", index=False)
print(f"\n{'✓ Done':-^70}\n")


SAMPLE: GSM4909276_N-PM0372-Total

Files not found. Available samples:


FileNotFoundError: [Errno 2] No such file or directory: '/Users/alex/Desktop/breast_cancer_analysis/data/GSE161529_RAW/GSM4909276_N-PM0372-Total-matrix.mtx.gz'

: 

In [None]:
from scipy.io import mmread

DATA_FOLDER = Path("/Users/alex/Desktop/breast_cancer_analysis/data/GSE161529_RAW")
SAMPLE_NAME = "GSM4909276_N-PM0372-Total"

matrix_file = DATA_FOLDER / f"{SAMPLE_NAME}-matrix.mtx.gz"
barcodes_file = DATA_FOLDER / f"{SAMPLE_NAME}-barcodes.tsv.gz"
features_file = Path("/Users/alex/Desktop/breast_cancer_analysis/data/GSE161529_features.tsv")

print(f"\nReading {SAMPLE_NAME}...\n")

matrix = mmread(matrix_file).T.tocsr()  # Transpose to cells x genes
barcodes = pd.read_csv(barcodes_file, header=None, compression='gzip')[0].values
features = pd.read_csv(features_file, sep='\t', header=None)[0].values

df = pd.DataFrame(matrix.toarray(), index=barcodes, columns=features)

print(f"Matrix shape: {df.shape} (cells x genes)")
print(f"Memory size: ~{df.memory_usage(deep=True).sum() / 1e9:.2f} GB\n")

output_file = Path("/Users/alex/Desktop/breast_cancer_analysis/results/matrix_view.csv")
df.to_csv(output_file)

print(f"✓ Saved to: {output_file}")
print(f"✓ Open in Excel or text editor to view\n")


Reading GSM4909276_N-PM0372-Total...

Matrix shape: (4825, 33538) (cells x genes)
Memory size: ~1.29 GB

✓ Saved to: /Users/alex/Desktop/breast_cancer_analysis/results/matrix_view.csv
✓ Open in Excel or text editor to view



In [None]:
DATA_FOLDER = Path("../data/GSE161529_RAW")
SAMPLE_NAME = "GSM4909253_N-PM0092-Total"

#Load
matrix = mmread(DATA_FOLDER / f"{SAMPLE_NAME}-matrix.mtx.gz").T.tocsr()
features = pd.read_csv("../data/GSE161529_features.tsv", sep='\t', header=None)[0]

values = matrix.data  #All non-zero values
print(f"Min count: {values.min()}")
print(f"Max count: {values.max()}")
print(f"Mean count: {values.mean():.1f}")
print(f"Median count: {np.median(values):.1f}")

Min count: 1
Max count: 4561
Mean count: 5.0
Median count: 1.0


In [None]:
DATA = Path("../data/GSE161529_RAW")
SAMPLE = "GSM4909253_N-PM0092-Total"
features = pd.read_csv("../data/GSE161529_features.tsv", sep='\t', header=None)
matrix = mmread(DATA / f"{SAMPLE}-matrix.mtx.gz").T.tocsr()
barcodes = pd.read_csv(DATA / f"{SAMPLE}-barcodes.tsv.gz", header=None, compression='gzip')[0]

#Find max
coo = matrix.tocoo()
max_idx = np.argmax(coo.data)
r, c, val = coo.row[max_idx], coo.col[max_idx], coo.data[max_idx]

print(f"\n{'HIGHEST EXPRESSION':-^50}")
print(f"Gene: {features[0][c]}")
print(f"Count: {int(val)}")
print(f"Cell: {barcodes[r]}\n")

#Top 10
coo = matrix.tocoo()
top_10 = np.argsort(coo.data)[::-1][:10]

genes = np.array(features[0])[np.array(coo.col)[top_10]]
cells = np.array(barcodes)[np.array(coo.row)[top_10]]
counts = np.array(coo.data)[top_10]

import pandas as pd
df = pd.DataFrame({'Gene': genes, 'Count': counts, 'Cell': cells})

print(f"\n{'TOP 10 EXPRESSED':-^50}")
print(df.to_string(index=False))


----------------HIGHEST EXPRESSION----------------
Gene: ENSG00000167996
Count: 4561
Cell: TCACAAGCAATGTAAG-1


-----------------TOP 10 EXPRESSED-----------------
           Gene  Count               Cell
ENSG00000167996   4561 TCACAAGCAATGTAAG-1
ENSG00000167996   3457 GATGCTACACCTCGTT-1
ENSG00000167996   3366 CCACTACAGAATTCCC-1
ENSG00000167996   3230 GAGCAGAAGATCCCAT-1
ENSG00000167996   3197 CCACTACTCGGCGCTA-1
ENSG00000167996   3077 AGGTCATGTGTTCGAT-1
ENSG00000167996   3067 ATGCGATAGTTCGATC-1
ENSG00000167996   2941 AGATCTGAGCCCAGCT-1
ENSG00000167996   2889 TGCGGGTAGCGCTTAT-1
ENSG00000167996   2867 GATCAGTCAGTCGATT-1
