# Analyze 2018 Pilot Data to Counts

## Load Variables and Make Directories

In [None]:
# Read the pipeline settings into the current shell. The variables used by the
# cells below (STAR_OUT, GENOME_DIR, FINAL_COUNTS, ...) are presumably defined
# there -- verify against star_2018_pilot_config.sh.
source star_2018_pilot_config.sh
# Disabled cleanup of $CUROUT; left here for manual use.
# rm -rf $CUROUT
# Create the output directories. Paths are quoted so directories containing
# spaces survive, and ${VAR:?} aborts the command with a message when a
# variable is unset or empty instead of silently creating fewer directories.
mkdir -p \
    "${STAR_OUT:?STAR_OUT is not set}" \
    "${GENOME_DIR:?GENOME_DIR is not set}" \
    "${FINAL_COUNTS:?FINAL_COUNTS is not set}" # "$TRIMMED"

## Download Genome and Annotation

In [None]:
# Reference locations for the genome and annotation (not used directly; the
# download loop below reads its URLs from $FA_URL and $GTF_URL):
# ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/Cryptococcus_neoformans/all_assembly_versions/GCF_000149245.1_CNA3
# ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/fasta/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/dna/Cryptococcus_neoformans_var_grubii_h99.CNA3.dna.toplevel.fa.gz
# ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/gff3/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gff3.gz

# Fetch the genome FASTA and the annotation file into $GENOME_DIR.
# --timestamping makes re-running this cell safe: without it wget keeps an
# existing file and saves the new copy as "<name>.1", which the later gunzip
# step would never pick up. URLs and the target directory are quoted so that
# characters such as "?" or spaces are passed to wget unchanged.
for CUR in "$FA_URL" "$GTF_URL" ; do
    wget --timestamping --directory-prefix "${GENOME_DIR}" "${CUR}"
done

In [None]:
# Decompress the downloaded annotation and genome in place. --force lets gunzip
# replace a decompressed copy left over from an earlier run. Paths are quoted
# so they are not word-split or glob-expanded.
gunzip --force "${GENOME_DIR}/${GTF}.gz"
gunzip --force "${GENOME_DIR}/${FA}.gz"

## Index Genome

In [None]:
# Build the STAR genome index in $GENOME_DIR from the decompressed FASTA, with
# splice junctions taken from the annotation file. Features of type "exon" are
# used, grouped by their transcript_id / gene_id attributes. STAR's own log
# files for this step are written with the prefix ${STAR_OUT}/genome_.
# NOTE(review): the STAR manual suggests scaling --genomeSAindexNbases as
# min(14, log2(GenomeLength)/2 - 1); confirm that 6 is intended for this genome.
# All paths are quoted so that directories containing spaces work.
STAR \
    --runThreadN "$THREADS" \
    --runMode genomeGenerate \
    --genomeDir "$GENOME_DIR" \
    --genomeFastaFiles "${GENOME_DIR}/${FA}" \
    --sjdbGTFfile "${GENOME_DIR}/${GTF}" \
    --outFileNamePrefix "${STAR_OUT}/genome_" \
    --sjdbGTFfeatureExon exon \
    --sjdbGTFtagExonParentTranscript transcript_id \
    --sjdbGTFtagExonParentGene gene_id \
    --genomeSAindexNbases 6

## Trim and Map Reads

In [None]:
# Trim one raw FASTQ file with fastq-mcf and count reads per gene with STAR.
# The trimmed reads are streamed through a named pipe, so no trimmed FASTQ is
# written to disk.
#
# Arguments:
#   $1  path to a raw FASTQ file. A trailing "_R1_001.fastq.gz" is removed from
#       its basename to form the sample name.
# Environment:
#   ADAPTERS, THREADS, GENOME_DIR, STAR_OUT. These must be visible in the
#   shells GNU parallel starts -- presumably the config script exports them;
#   verify.
# Output:
#   Prints the sample name, the input path and its basename. STAR writes its
#   result files with the prefix ${STAR_OUT}/${SAMPLE}_.
# Returns:
#   STAR's exit status when STAR fails, otherwise fastq-mcf's exit status, so
#   that parallel can report failed samples.
trim_and_star_func() {
    # "local" keeps these from leaking into (or being clobbered by) the
    # calling shell.
    local FASTQ=$1
    local FASTQ_BASE=${FASTQ##*/}                   # strip directory from file path
    local SAMPLE=${FASTQ_BASE%_R1_001.fastq.gz}     # strip the _R1_001.fastq.gz suffix
    echo "$SAMPLE"
    echo "$FASTQ"
    echo "$FASTQ_BASE"

    # Make a pipe for the trimmed fastq inside a private temporary directory.
    # (A name produced by "mktemp --dry-run" is not reserved and could collide.)
    local PIPE_DIR CUR_PIPE
    PIPE_DIR=$(mktemp -d) || return 1
    CUR_PIPE=${PIPE_DIR}/${SAMPLE}_pipe.fq
    if ! mkfifo "$CUR_PIPE"; then
        rm -rf "$PIPE_DIR"
        return 1
    fi

    # Run fastq-mcf in the background, writing trimmed reads into the pipe.
    fastq-mcf \
        "$ADAPTERS" \
        "$FASTQ" \
        -o "$CUR_PIPE" \
        -q 20 -x 0.5 &
    local TRIM_PID=$!

    # Align the reads arriving on the pipe. No SAM/BAM output is requested;
    # only per-gene counts (--quantMode GeneCounts) are produced. The genome
    # is taken from shared memory (--genomeLoad LoadAndKeep).
    STAR \
        --runMode alignReads \
        --runThreadN "$THREADS" \
        --genomeDir "$GENOME_DIR" \
        --outSAMtype None \
        --quantMode GeneCounts \
        --genomeLoad LoadAndKeep \
        --twopassMode None \
        --limitBAMsortRAM 1280000000 \
        --outFileNamePrefix "${STAR_OUT}/${SAMPLE}_" \
        --outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 \
        --outFilterMatchNmin 0 --outFilterMismatchNmax 2 \
        --readFilesIn "$CUR_PIPE"
    local STAR_STATUS=$?

    # If STAR failed it may never have opened the pipe, which would leave
    # fastq-mcf blocked on it forever; stop the trimmer in that case.
    if [ "$STAR_STATUS" -ne 0 ]; then
        kill "$TRIM_PID" 2>/dev/null
    fi
    # Reap the background trimmer and remember how it ended.
    wait "$TRIM_PID" 2>/dev/null
    local TRIM_STATUS=$?

    rm -rf "$PIPE_DIR"

    if [ "$STAR_STATUS" -ne 0 ]; then
        return "$STAR_STATUS"
    fi
    return "$TRIM_STATUS"
}
export -f trim_and_star_func

# Load the genome index into shared memory once, so that every
# trim_and_star_func job (which runs STAR with --genomeLoad LoadAndKeep) can
# reuse it.
STAR --genomeDir "$GENOME_DIR" \
    --outFileNamePrefix "${STAR_OUT}/genomeload_" \
    --genomeLoad LoadAndExit

# Run trim_and_star_func on the raw FASTQ files, at most $MAX_JOBS at a time.
# The commented lines are narrower selections kept for re-running subsets.
# NOTE(review): the active glob matches every *.fastq.gz, while
# trim_and_star_func only strips the "_R1_001.fastq.gz" suffix when naming a
# sample -- confirm $RAW_FASTQS holds only such files.
# parallel --jobs $MAX_JOBS trim_and_star_func ::: $RAW_FASTQS/35_MA_*_L001_R1_001.fastq.gz
# parallel --jobs $MAX_JOBS trim_and_star_func ::: $RAW_FASTQS/*_L001_R1_001.fastq.gz
# parallel --jobs $MAX_JOBS trim_and_star_func ::: $RAW_FASTQS/[3-4]_RZ_*_L001_R1_001.fastq.gz
# parallel --jobs $MAX_JOBS trim_and_star_func ::: $RAW_FASTQS/4_RZ_P_S43_L003_R1_001.fastq.gz $RAW_FASTQS/3_TOT_J_S34_L003_R1_001.fastq.gz
# parallel --jobs $MAX_JOBS trim_and_star_func ::: $RAW_FASTQS/35_MA_P_S39_L002_R1_001.fastq.gz
parallel --jobs "$MAX_JOBS" trim_and_star_func ::: "$RAW_FASTQS"/*.fastq.gz

# Release the shared-memory copy of the genome now that all jobs are done.
STAR --genomeDir "$GENOME_DIR" \
    --outFileNamePrefix "${STAR_OUT}/genomeremove_" \
    --genomeLoad Remove

In [None]:
# Collect the per-sample gene count tables written by STAR into $FINAL_COUNTS.
# The trailing slash makes mv fail instead of renaming a file to that name if
# the directory is missing; "--" protects against names starting with "-".
mv -- "$STAR_OUT"/*_ReadsPerGene.out.tab "$FINAL_COUNTS"/
# Remove write permission from $DATA_BASE itself (not recursive).
# NOTE(review): presumably meant to protect the data directory from accidental
# changes -- confirm that a non-recursive chmod is what is wanted.
chmod a-w "$DATA_BASE"