In [1]:
import subprocess
import sys
import os
import stat
import pandas as pd

sys.path.append('../../')
from utils import hardware_utils, file_utils

## Step 0: install necessary tools

Install [HISAT2 2.2.1](https://daehwankimlab.github.io/hisat2/download/) in addition to the conda environment. The cells below download and unpack it automatically on macOS and Linux.

In [2]:
# Download location of the HISAT2 2.2.1 zip archive for each supported platform.
hisat2_archive_urls = {
    'macOS': 'https://cloud.biohpc.swmed.edu/index.php/s/zMgEtnF6LjnjFrr/download',
    'Linux': 'https://cloud.biohpc.swmed.edu/index.php/s/oTtGWbWjaxsQ2Ho/download',
}

operating_system = hardware_utils.get_os()

# Fail early on any platform without a known archive.
if operating_system not in hisat2_archive_urls:
    raise ValueError(f'{operating_system} is not compatible with hisat2 2.2.1. Compatible operating systems are macOS and Linux.')

file_utils.download_file_chunks(hisat2_archive_urls[operating_system], './hisat2-2.2.1.zip')

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/TGNE/microarray_probe_alignment_and_filtering/hisat2-2.2.1.zip


In [3]:
# Unpack the downloaded archive; later cells expect the executables under
# ./hisat2-2.2.1/hisat2-2.2.1/.
file_utils.unzip_file('./hisat2-2.2.1.zip')

## Step 1: make probe fasta

Load the probe dataset

The .ndf file describes the design of the Roche Nimblegen microarray chips that were used in this study. This file is renamed from the GPL6759.ndf file corresponding to the raw data repository GEO accession GSE11300, which can be downloaded from https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE11300&format=file

In [5]:
# The chip design file is read as tab-separated text from a gzip-compressed file.
ndf_path = '../../new_raw_data/GSE11300/GPL6759.ndf.gz'
probe_df = pd.read_csv(ndf_path, sep='\t', compression='gzip')

Take a look

In [6]:
# Preview the first rows to see which columns the design file provides.
probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


Note: the SEQ_ID is what the chip designers thought the genes were (later converted to TTHERM_ format); the PROBE_ID is unique for each PROBE_SEQUENCE. Consequently, for all of the quality control and filtering, we want to work with the PROBE_IDs, which we can then map to our current understanding of the genome by their sequence.

The seq ids are not unique because multiple probes can target a single gene. Keep in mind that our understanding of what genes exist has dramatically changed since these microarrays were designed.

In [7]:
# Total number of rows in the design file (every probe class included).
len(probe_df)

392778

This length is indicative of a 1:2 design for the chip. See http://mtweb.cs.ucl.ac.uk/mus/mus/binnaz/CNV/NimbleGene/DATA/OID8421-2/Documentation/NimbleGen_data_formats.pdf for more info

In [15]:
# List the distinct PROBE_CLASS values present in the design file.
probe_df['PROBE_CLASS'].unique()

array(['experimental', nan, 'control:reseq_qc:synthesis', 'fiducial',
       'linker', 'synthesis', 'control:sample_tracking:A',
       'control:empty', 'encoded number', 'control:reseq_qc:label',
       'uniformity', 'control', 'control:sample_tracking:B'], dtype=object)

In [16]:
# Which probe classes do the rows with a repeated PROBE_ID belong to?
# duplicated() flags the second and later occurrence of each PROBE_ID.
has_repeated_probe_id = probe_df.duplicated(subset=['PROBE_ID'])
probe_df.loc[has_repeated_probe_id, 'PROBE_CLASS'].unique()

array(['fiducial', 'linker', 'synthesis', 'control:empty',
       'encoded number', 'uniformity', 'control',
       'control:sample_tracking:B'], dtype=object)

Every duplicated PROBE_ID belongs to a control-type probe class (fiducial, linker, synthesis, etc.); none are experimental probes. These rows can therefore be excluded.

In [17]:
# Keep only rows whose PROBE_CLASS is exactly 'experimental'; rows with a
# missing class or any other class are dropped.
is_experimental = probe_df['PROBE_CLASS'].eq('experimental')
experimental_probe_df = probe_df.loc[is_experimental]

In [18]:
# Preview the experimental-only subset.
experimental_probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


In [19]:
# Row count before filtering, for comparison with the experimental subset.
len(probe_df)

392778

In [20]:
# Row count after keeping only the 'experimental' probe class.
len(experimental_probe_df)

384999

In [21]:
# Check which MISMATCH values occur among the experimental probes.
experimental_probe_df['MISMATCH'].unique()

array([0])

Extract the probe ids and sequences to build a fasta file

In [22]:
# Pull the identifier and sequence columns out as two parallel arrays
# (same row order), ready to be zipped into FASTA records.
probe_ids, probe_seqs = (
    experimental_probe_df[column].values
    for column in ('PROBE_ID', 'PROBE_SEQUENCE')
)

Build the fasta file

In [23]:
# PROBE_IDs become the FASTA record names, so they must be unique for each
# alignment to be traced back to a single probe.
assert len(set(probe_ids)) == len(probe_ids), 'PROBE_IDs must be unique to serve as FASTA record names'

# Write one two-line record per probe (header line, then sequence line).
# No blank line is written between records: blank lines are not part of the
# FASTA format and some downstream tools reject them.
with open('./2007-02-28_microarray_experimental_probes.fna', 'w') as f:
    f.writelines(
        f">{probe_id}\n{probe_seq}\n"
        for probe_id, probe_seq in zip(probe_ids, probe_seqs)
    )

## Step 2: use hisat 2 to align probes to newest genome

Bash command to index the 2021 _T. thermophila_ genome CDS fasta

In [24]:
# Paths to the indexer executables inside the unpacked archive.
# NOTE(review): presumably hisat2-build invokes the -s / -l variants itself,
# which is why all three are made executable; confirm against the HISAT2 docs.
hisat2_build_path = './hisat2-2.2.1/hisat2-2.2.1/hisat2-build'
hisat2_build_s_path = './hisat2-2.2.1/hisat2-2.2.1/hisat2-build-s'
hisat2_build_l_path = './hisat2-2.2.1/hisat2-2.2.1/hisat2-build-l'

# Add the owner-execute bit to each file, leaving its other mode bits untouched.
for executable in (hisat2_build_path, hisat2_build_s_path, hisat2_build_l_path):
    os.chmod(executable, os.stat(executable).st_mode | stat.S_IEXEC)

In [25]:
# -f marks the reference input as FASTA; ttherm_2021 is the basename of the
# index files that hisat2-build writes (ttherm_2021.*.ht2).
index_genome_command = f"{hisat2_build_path} -f ../../active_files/cds.fasta ttherm_2021"

In [26]:
# Show the whitespace-split argument list that is handed to subprocess.run below.
index_genome_command.split()

['./hisat2-2.2.1/hisat2-2.2.1/hisat2-build',
 '-f',
 '../../active_files/cds.fasta',
 'ttherm_2021']

In [28]:
# Build the index. stdout/stderr are captured as bytes so the following cells
# can decode and display them.
r = subprocess.run(args=index_genome_command.split(), capture_output=True)

# subprocess.run does not raise on a non-zero exit status by default, so fail
# loudly here instead of letting the alignment step run against a missing or
# partial index.
if r.returncode != 0:
    raise RuntimeError(
        f"hisat2-build exited with status {r.returncode}:\n"
        f"{r.stderr.decode('utf-8', errors='replace')}"
    )

In [29]:
# The captured output is bytes; decode it to show the indexer's progress log.
print(r.stdout.decode('utf-8'))

Building DifferenceCoverSample
  Building sPrime
  Building sPrimeOrder
  V-Sorting samples
  V-Sorting samples time: 00:00:01
  Allocating rank array
  Ranking v-sort output
  Ranking v-sort output time: 00:00:00
  Invoking Larsson-Sadakane on ranks
  Invoking Larsson-Sadakane on ranks time: 00:00:00
  Sanity-checking and returning
Building samples
Reserving space for 12 sample suffixes
Generating random suffixes
QSorting 12 sample offsets, eliminating duplicates
QSorting sample offsets, eliminating duplicates time: 00:00:00
Multikey QSorting 12 samples
  (Using difference cover)
  Multikey QSorting samples time: 00:00:00
Calculating bucket sizes
Splitting and merging
  Splitting and merging time: 00:00:00
Avg bucket size: 6.73142e+06 (target: 10097132)
Getting block 1 of 8
  Reserving size (10097133) for bucket 1
  Calculating Z arrays for bucket 1
  Entering block accumulator loop for bucket 1:
  bucket 1: 10%
  bucket 1: 20%
  bucket 1: 30%
  bucket 1: 40%
  bucket 1: 50%
  bucket 

In [30]:
# The settings summary is written to stderr rather than stdout.
print(r.stderr.decode('utf-8'))

Settings:
  Output files: "ttherm_2021.*.ht2"
  Line rate: 6 (line is 64 bytes)
  Lines per side: 1 (side is 64 bytes)
  Offset rate: 4 (one in 16)
  FTable chars: 10
  Strings: unpacked
  Local offset rate: 3 (one in 8)
  Local fTable chars: 6
  Local sequence length: 57344
  Local sequence overlap between two consecutive indexes: 1024
  Endianness: little
  Actual local endianness: little
  Sanity checking: disabled
  Assertions: disabled
  Random seed: 0
  Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  ../../active_files/cds.fasta
Reading reference sizes
  Time reading reference sizes: 00:00:00
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:00
  Time to read SNPs and splice sites: 00:00:00
Using parameters --bmax 10097133 --dcv 1024
  Doing ahead-of-time memory usage test
  Passed!  Constructing with these parameters: --bmax 10097133 --dcv 1024
Constructing suffix-a

Bash command to align the probe sequences to the CDS regions

In [31]:
# Paths to the aligner executables inside the unpacked archive.
# NOTE(review): presumably hisat2 invokes the -s / -l variants itself,
# which is why all three are made executable; confirm against the HISAT2 docs.
hisat2_path = './hisat2-2.2.1/hisat2-2.2.1/hisat2'
hisat2_path_s = './hisat2-2.2.1/hisat2-2.2.1/hisat2-align-s'
hisat2_path_l = './hisat2-2.2.1/hisat2-2.2.1/hisat2-align-l'

# Add the owner-execute bit to each file, leaving its other mode bits untouched.
for executable in (hisat2_path, hisat2_path_s, hisat2_path_l):
    os.chmod(executable, os.stat(executable).st_mode | stat.S_IEXEC)

In [32]:
# -x names the ttherm_2021 index, -U the unpaired probe FASTA (-f), and -S the
# SAM file to write. --no-hd presumably suppresses the SAM header lines; see the
# HISAT2 manual to confirm.
# The command is laid out over several lines for readability only: it is passed
# through str.split(), which treats newlines and the trailing space as separators.
align_to_genome_command = f"""
{hisat2_path} -f -x ttherm_2021 --no-hd
-U ./2007-02-28_microarray_experimental_probes.fna 
-S microarray_probe_alignment.sam"""

In [33]:
# Show the whitespace-split argument list that is handed to subprocess.run below.
align_to_genome_command.split()

['./hisat2-2.2.1/hisat2-2.2.1/hisat2',
 '-f',
 '-x',
 'ttherm_2021',
 '--no-hd',
 '-U',
 './2007-02-28_microarray_experimental_probes.fna',
 '-S',
 'microarray_probe_alignment.sam']

In [34]:
# Align the probes. stdout/stderr are captured as bytes so the following cells
# can decode and display them.
r2 = subprocess.run(args=align_to_genome_command.split(), capture_output=True)

# subprocess.run does not raise on a non-zero exit status by default, so fail
# loudly here rather than carrying on with a missing or truncated SAM file.
if r2.returncode != 0:
    raise RuntimeError(
        f"hisat2 exited with status {r2.returncode}:\n"
        f"{r2.stderr.decode('utf-8', errors='replace')}"
    )

In [35]:
# stdout was empty in the recorded run; presumably the alignments go to the
# -S file rather than stdout.
print(r2.stdout.decode('utf-8'))




In [36]:
# The alignment summary (reads aligned 0 / exactly 1 / >1 times) is reported on stderr.
print(r2.stderr.decode('utf-8'))

384999 reads; of these:
  384999 (100.00%) were unpaired; of these:
    61733 (16.03%) aligned 0 times
    313654 (81.47%) aligned exactly 1 time
    9612 (2.50%) aligned >1 times
83.97% overall alignment rate

