## **Project**: Natural products from the Palaeolithic

## **Section**: Genome wide functional analysis: EGGNOG analysis


Anan Ibrahim, 01.05.2022

**Contents**
 - **Step1**: Create conda environment with required dependencies if not already installed
 - **Step2**: Download and create the eggnog-database
 - **Step3**: Prokka annotation of genomes
 - **Step4**: Map ancient MAGs and modern complete Chlorobiaceae genomes to the KEGG database

##########

**Step1**: Create conda environment with required dependencies if not already installed

##########

In [None]:
# Create one conda environment per YAML specification file.
# All conda envs can be found in EMN001_Paleofuran/02-scripts/ENVS_*.yml
for env_yaml in prokka.yml eggnog-mapper.yml; do
  conda env create -f "$env_yaml"
done

##########

**Step2**: Download and create the eggnog-database

##########

In [None]:
# Download the eggNOG reference data into a shared database directory.
# The location is defined once and exported so that child processes started
# from this shell see the same EGGNOG_DATA_DIR.
export EGGNOG_DATA_DIR=/Net/Groups/ccdata/databases/eggnog-mapper-data

# -p: create missing parent directories and do not fail when the directory
# already exists (e.g. when this cell is run a second time).
mkdir -p "$EGGNOG_DATA_DIR"

# NOTE(review): the download below is given --data_dir explicitly, so this
# change of directory may be unnecessary - confirm before removing it.
cd /Net/Groups/ccdata/apps/conda_envs/eggnog-mapper/lib/python3.8/site-packages/data \
  || echo "warning: could not enter the eggnog-mapper package data directory" >&2

download_eggnog_data.py --data_dir "$EGGNOG_DATA_DIR"

##########

**Step3**: Prokka annotation of genomes

##########

In [None]:
#!/bin/bash

############################
# Paths and input files
############################

# NOTE: Change directories in bash script accordingly
# NOTE: Add the ancient Bins/MAGs in $BINS

# Directories:
# PROJECT_DIR is the single place to edit when the project is moved.
PROJECT_DIR=/Net/Groups/ccdata/users/AIbrahim/ancientDNA/Deep-Evo/BGC/final-butyrolactone
OUT=$PROJECT_DIR/Output        # output root; PROKKA/ is created below it
BINS=$PROJECT_DIR/Input/BINS   # ancient bins/MAGs, one *.fna file per genome

KEGG_DB=/Net/Groups/ccdata/databases/kegg_db/
CHLOROBIALES=/Net/Groups/ccdata/databases/ncbi-ref-genomes/Chlorobiales

# Ancient samples:
# Per-sample shortcuts into $BINS. They are not referenced by the Prokka loop
# in this cell (which globs $BINS/*.fna); kept for interactive use.
EMN=$BINS/EMN001_021.fna
GOY6=$BINS/GOY006_RA.fna
GOY5=$BINS/GOY005_001.fna
PES=$BINS/PES001_018.fna
RIG=$BINS/RIG001_014.fna
PLV18=$BINS/PLV001_001.fna
PLV20=$BINS/PLV001_002.fna
TAF=$BINS/TAF017_RA.fna

# -p: do not fail when $OUT already exists (e.g. when the cell is re-run).
mkdir -p "$OUT"
############################
# Prokka annotation of MAGs
############################

#######################################
# Annotate one genome with Prokka.
# Output goes to $OUT/PROKKA/<name>/, where <name> is the FASTA file name
# without its .fna extension; <name> is also used as the Prokka output prefix
# and locus tag.
# Globals:   OUT (read)
# Arguments: $1 - path to a genome FASTA (*.fna)
#            $2 - value passed to prokka --cpus
# Returns:   0 if the input file does not exist (skipped with a warning),
#            otherwise the exit status of mkdir/prokka.
#######################################
run_prokka() {
  local fasta=$1
  local cpus=$2
  local name

  # Also catches a glob that matched nothing (the literal pattern is passed in).
  if [[ ! -f "$fasta" ]]; then
    echo "warning: skipping missing genome: $fasta" >&2
    return 0
  fi

  name=$(basename "$fasta" .fna)
  mkdir -p "$OUT/PROKKA/$name" || return
  prokka --quiet --metagenome --outdir "$OUT/PROKKA/$name" --prefix "$name" \
    --locustag "$name" --force --cpus "$cpus" "$fasta"
}

mkdir -p "$OUT/PROKKA"

eval "$(conda shell.bash hook)"
conda activate prokka

# Ancient genomes: every *.fna file in $BINS.
# --cpus 0 is kept from the original (Prokka documents 0 as "all CPUs").
for F in "$BINS"/*.fna; do
  run_prokka "$F" 0
done

# Modern genomes: selected reference assemblies from $CHLOROBIALES/fasta.
# All of them are written to $OUT/PROKKA/<accession>/; the original wrote
# GCF_001509575.1.1 to "$OUT/PROKKA<accession>" because of a missing "/".
# NOTE(review): GCF_001509575.1.1 carries a doubled version suffix, while
# Step 4 looks for GCF_001509575.1 - confirm the actual file name on disk.
MODERN_GENOMES=(
  GCA_013335335.1
  GCA_013335765.1
  GCF_000020465.1
  GCF_001509575.1.1
  GCF_000015125.1
  GCA_019163275.1
)
for ACC in "${MODERN_GENOMES[@]}"; do
  run_prokka "$CHLOROBIALES/fasta/$ACC.fna" 30
done

conda deactivate

##########

**Step4**: Map ancient MAGs and modern complete Chlorobiaceae genomes to the KEGG database

##########

In [None]:
#!/bin/bash

############################
# Paths and input files
############################

# NOTE: Change directories in bash script accordingly
# NOTE: Add the ancient Bins/MAGs in $BINS

# Directories:
# PROJECT_DIR is the single place to edit when the project is moved.
PROJECT_DIR=/Net/Groups/ccdata/users/AIbrahim/ancientDNA/Deep-Evo/BGC/final-butyrolactone
OUT=$PROJECT_DIR/Output        # output root; PROKKA/ is read from and EGGNOG/ written below it
BINS=$PROJECT_DIR/Input/BINS   # ancient bins/MAGs, one *.fna file per genome

KEGG_DB=/Net/Groups/ccdata/databases/kegg_db/   # target of the KEGG list downloads at the end of this cell
CHLOROBIALES=/Net/Groups/ccdata/databases/ncbi-ref-genomes/Chlorobiales

# Ancient samples:
# Per-sample shortcuts into $BINS. They are not referenced by the visible
# commands of this cell; kept for interactive use.
EMN=$BINS/EMN001_021.fna
GOY6=$BINS/GOY006_RA.fna
GOY5=$BINS/GOY005_001.fna
PES=$BINS/PES001_018.fna
RIG=$BINS/RIG001_014.fna
PLV18=$BINS/PLV001_001.fna
PLV20=$BINS/PLV001_002.fna
TAF=$BINS/TAF017_RA.fna

# -p: $OUT normally exists already (the Prokka step writes into it), and a
# plain mkdir would report an error on every run.
mkdir -p "$OUT"

############################
# Functional analysis : EGGNOG analysis of the MAGS
############################

#######################################
# Run eggnog-mapper on one protein FASTA and trim the result header.
# Results go to $OUT/EGGNOG/<name>/ with output prefix "test", where <name> is
# the file name without its .faa extension. After a successful run the first
# four lines of test.emapper.annotations are deleted (presumably emapper's
# "##" preamble - confirm for the installed emapper version).
# A genome is skipped when its input file is missing or when its annotation
# file already exists, so the four-line trim is applied exactly once per file.
# Globals:   OUT (read)
# Arguments: $1 - path to a protein FASTA (*.faa)
# Returns:   0 when skipped, otherwise the exit status of mkdir/emapper/sed.
#######################################
annotate_proteins() {
  local faa=$1
  local name out_dir annotations

  # Also catches a glob that matched nothing (the literal pattern is passed
  # in). Without this check <name> would become "*" and the sed below would
  # expand to every genome's annotation file.
  if [[ ! -f "$faa" ]]; then
    return 0
  fi

  name=$(basename "$faa" .faa)
  out_dir=$OUT/EGGNOG/$name
  annotations=$out_dir/test.emapper.annotations

  if [[ -e "$annotations" ]]; then
    echo "skipping $name: $annotations already exists" >&2
    return 0
  fi

  mkdir -p "$out_dir" "$OUT/EGGNOG/temp" || return
  # Trim only when emapper succeeded; a failed run leaves nothing valid to trim.
  emapper.py -i "$faa" -o test --cpu 28 --output_dir "$out_dir" \
    --temp_dir "$OUT/EGGNOG/temp" --excel || return
  #emapper.py -i "$faa" -o test --cpu 28 --output_dir "$out_dir" --temp_dir "$OUT/EGGNOG/temp" --excel --decorate_gff yes
  sed -i -e '1,4d' "$annotations"
}

eval "$(conda shell.bash hook)"
conda activate eggnog-mapper

mkdir -p "$OUT/EGGNOG/temp"
cd "$OUT/EGGNOG" || echo "warning: could not enter $OUT/EGGNOG" >&2

# All Prokka proteomes (Diamond search is emapper's default mode).
# The Prokka step writes ancient bins and modern genomes alike to
# $OUT/PROKKA/<name>/, so this loop covers both groups.
for F in "$OUT"/PROKKA/*/*.faa; do
  annotate_proteins "$F"
done

# Additional input locations referenced by the original version of this cell.
# They are not created by the Prokka step of this notebook; they are processed
# only if they exist and have not been annotated yet (the duplicate entry for
# GCF_001509575.1 was dropped).
for F in \
  "$OUT"/EGGNOG/prokka/GCA_013335335.1/*.faa \
  "$OUT"/EGGNOG/prokka/GCA_013335765.1/*.faa \
  "$OUT"/PROKKA-drep-refgenomes/GCF_000015125.1/*.faa \
  "$OUT"/PROKKA-drep-refgenomes/GCA_019163275.1/*.faa \
  "$OUT"/PROKKA-drep-refgenomes/GCF_001509575.1/*.faa \
  "$OUT"/PROKKA-drep-refgenomes/GCF_000020465.1/*.faa; do
  annotate_proteins "$F"
done

conda deactivate

# Convert the kegg ids to labels
#
# Fetch three KEGG REST listings into $KEGG_DB, then build a labelled pathway
# table. Entries are (target file name, URL) pairs.
kegg_lists=(
  "KO.txt"      "http://rest.kegg.jp/list/ko"
  "module.txt"  "http://rest.kegg.jp/list/module"
  "pathway.txt" "http://rest.kegg.jp/list/pathway"
)
for ((i = 0; i < ${#kegg_lists[@]}; i += 2)); do
  wget -O "$KEGG_DB/${kegg_lists[i]}" "${kegg_lists[i + 1]}"
done

# Remove every "path:" occurrence from the pathway listing, in place.
sed -i 's/path://g' "$KEGG_DB/pathway.txt"

# pathway2.txt = a tab-separated header line followed by the cleaned listing.
{
  printf 'KEGG_Pathway\tLabel\n'
  cat "$KEGG_DB/pathway.txt"
} > "$KEGG_DB/pathway2.txt"