# Genome-wide periodicity analysis


The goal of the periodicity phase is to assess the periodicity of the $WW$ motif ($WW = \{AA, AT, TA, TT\}$) in the categorical signal defined by an input DNA sequence.

Note that this analysis generates a huge amount of intermediate data. Thus, we delete most of it and keep only the essential files. If you would like to inspect all the intermediate files, please change the temporary folder to a persistent one.

## Data

To be able to run this analysis you need to download some external datasets.
The datasets consist of genome DNA sequences (fasta files) downloaded from the [UCSC genome browser](https://genome.ucsc.edu/) and the [ENSEMBL genome browser](http://ensemblgenomes.org/).

To analyse the data, you should create a separate folder for each genome you want to analyse. The name of the folder must be: ``<family>_<species>``. If the species name has more than one word, separate the words with ``_``; e.g.: ``insects_apis_mellifera``.

For Ensembl genomes, the files named ``.dna_rm.toplevel.fa.gz`` were used; for UCSC, we downloaded the ``.fa.masked.gz`` files.

If there is only one file inside the folder, it is assumed to contain all the chromosomes/groups of the whole genome. If there are many files, it is assumed that there is already one file per chromosome/group.

Please, extract *tar* files, but compress all fasta files in *gzip* format.

Periodicity analysis for all data (we only keep intermediate files generated for S. cerevisiae).

In [None]:
%%bash

# Periodicity pipeline.
#
# For every folder matching "*_*" in the current directory that contains
# compressed fasta files (*.fa.*gz), this script:
#   1. writes one uncompressed fasta file per chromosome/sequence,
#   2. splits each of them into chunks with faSplit,
#   3. runs scripts/shuffle.py on every chunk (100 repeats),
#   4. runs scripts/correlation.py on the chunks and on the shuffle outputs,
#   5. runs scripts/spectral.py and scripts/summary.py, whose output tables
#      are written inside the organism folder.
# Intermediate files go to a temporary folder that is removed afterwards,
# except for fungi_saccharomyces_cerevisiae, whose intermediate files are
# kept inside its own folder.
source activate env_nucperiod
cores=6

# Unmatched globs expand to nothing, so the loops below do not run at all
# instead of iterating once over the literal pattern.
shopt -s nullglob

# Scratch folder of the organism currently being processed (empty when there
# is none). It is removed on exit so that an aborted run does not leave its
# intermediate data behind.
scratch_folder=""
trap 'if [ -n "${scratch_folder}" ]; then rm -rf -- "${scratch_folder}"; fi' EXIT

for organism_folder in *_*
do

    # Only folders can hold a genome
    if [ ! -d "${organism_folder}" ]
    then
        continue
    fi

    # Compressed fasta inputs of this organism; folders without any are
    # skipped before anything is created for them.
    fasta_files=("${organism_folder}"/*.fa.*gz)
    nfiles=${#fasta_files[@]}
    if [ "${nfiles}" -eq 0 ]
    then
        printf 'Skipping %s: no *.fa.*gz files found\n' "${organism_folder}" >&2
        continue
    fi

    if [ "${organism_folder}" == "fungi_saccharomyces_cerevisiae" ]
    then
        # Keep the intermediate files next to the input
        tmp_folder=${organism_folder}
        scratch_folder=""
    else
        # Without a valid folder every path below would be rooted at "/"
        tmp_folder=$(mktemp -d) || {
            printf 'Cannot create a temporary folder for %s\n' "${organism_folder}" >&2
            exit 1
        }
        scratch_folder=${tmp_folder}
    fi

    chr_folder=${tmp_folder}/chroms
    chunks_folder=${tmp_folder}/fasta_chunks
    rand_chunks_folder=${tmp_folder}/random_fasta_chunks
    motif_folder=${tmp_folder}/motif_counts
    rand_motif_folder=${tmp_folder}/random_motif_counts
    mkdir -p "${chr_folder}" "${chunks_folder}" "${rand_chunks_folder}" \
        "${motif_folder}" "${rand_motif_folder}"

    # Generate the input
    if [ "${nfiles}" -eq 1 ]
    then
        # Split the file into one file per sequence, named after the first
        # word of its header. Each output is closed before the next one is
        # opened because some awk implementations limit the number of
        # simultaneously open files; a name seen before is reopened in append
        # mode so its earlier records are not truncated. Lines preceding the
        # first header are ignored.
        zcat "${fasta_files[0]}" | \
            awk -v outdir="${chr_folder}/" '
                /^>/ {
                    if (out != "") close(out)
                    split(substr($0, 2), fields, " ")
                    out = outdir fields[1] ".fa"
                    append = (out in seen)
                    seen[out] = 1
                }
                out != "" {
                    if (append) { print >> out } else { print > out }
                }'
    else
        # Assume it is already split and just uncompress the files
        for file in "${fasta_files[@]}"
        do
            name=${file##*/}
            # Drop the ".fa.<...>gz" extension (e.g. ".fa.gz", ".fa.masked.gz")
            name=${name%.fa.*}
            zcat "${file}" > "${chr_folder}/${name}.fa"
        done
    fi

    # Delete the sequence files that are not larger than 1M
    find "${chr_folder}" -name "*.fa" -not -size +1M -delete

    # Create the chunks for each fasta file per chromosome
    for file in "${chr_folder}"/*.fa
    do
        chr=${file##*/}
        chr=${chr%.fa}
        mkdir -p "${chunks_folder}/${chr}"
        faSplit size "${file}" 500000 "${chunks_folder}/${chr}/" \
            -lift="${chunks_folder}/${chr}/${chr}.lft" -extra=500000 -verbose=0
    done

    # Remove the chunk files smaller than 1000000 bytes
    for file in "${chunks_folder}"/*/*.fa
    do
        [ -f "${file}" ] || continue
        letters=$(wc -c < "${file}")
        if [ "${letters}" -lt 1000000 ]
        then
            rm -- "${file}"
        fi
    done

    # Generate the Markov-models from the chunks
    for file in "${chunks_folder}"/*/*.fa
    do
        [ -f "${file}" ] || continue
        # <chunks_folder>/<chr>/<name>.fa
        chr=${file%/*}
        chr=${chr##*/}
        name=${file##*/}
        name=${name%.fa}
        output_dir=${rand_chunks_folder}/${chr}/${name}
        mkdir -p "${output_dir}"
        python scripts/shuffle.py \
            --chunk "${file}" --output_folder "${output_dir}" \
            --repeats 100 --cores "${cores}"
    done

    # Compute motif counts (one call per chromosome folder)
    for dir in "${chunks_folder}"/*/
    do
        dir=${dir%/}
        chr=${dir##*/}
        output=${motif_folder}/${chr}
        mkdir -p "${output}"
        python scripts/correlation.py "${dir}" "${output}" --cores "${cores}" --wildcard "*.fa"
    done

    # Compute motif counts for random chunks (one call per chunk folder)
    for dir in "${rand_chunks_folder}"/*/
    do
        dir=${dir%/}
        chr=${dir##*/}
        for subdir in "${dir}"/*/
        do
            subdir=${subdir%/}
            name=${subdir##*/}
            output=${rand_motif_folder}/${chr}/${name}
            mkdir -p "${output}"
            python scripts/correlation.py "${subdir}" "${output}" --cores "${cores}" --wildcard "*.txt.gz"
        done
    done

    # Compute the power and SNR for all the chunks
    python scripts/spectral.py "${motif_folder}" "${organism_folder}/observed_spectra.npz" --cores "${cores}" \
        --table "${organism_folder}/observed_chunks.tsv"

    # Same for randomizations
    python scripts/spectral.py "${rand_motif_folder}" "${organism_folder}/random_spectra.npz" --cores "${cores}" \
        --table "${organism_folder}/random_chunks.tsv" --randomized

    # Build summary and metrics
    python scripts/summary.py "${organism_folder}/observed_chunks.tsv" "${organism_folder}/random_chunks.tsv" \
        "${organism_folder}/summary.tsv" "${organism_folder}/metrics.tsv"

    # Clean up (never the organism folder itself)
    if [ -n "${scratch_folder}" ]
    then
        rm -r -- "${scratch_folder}"
        scratch_folder=""
    fi

done