In [9]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq  # Import the Seq class
from Bio.SeqRecord import SeqRecord  # Import the SeqRecord class for creating sequence records
from collections import defaultdict

In [3]:
proj_dir="/master/nplatt/sch_hae_its-nigeria"
results_dir=f"{proj_dir}/results"

In [122]:
#you can get this table from the SCAN dryad link
info_df=pd.read_csv(f"{proj_dir}/its-nigeria_samplesheet.csv", sep=",")
info_df

Unnamed: 0,wgs_id,its_id,sra,species,country,locale,patient,miracidum
0,Sb_NG_ak_1.1,Sb.ng.ak.1.1F,,sbovis,nigeria,ak,1,1
1,Sb_NG_ak_2.1,sb_ng_ak_2.1,,sbovis,nigeria,ak,2,1
2,Sb_NG_ak_2.2,Sb.ng.ak.2.2F,,sbovis,nigeria,ak,2,2
3,Sb_NG_ak_2.3,Sb.ng.ak.2.3F,,sbovis,nigeria,ak,2,3
4,Sb_NG_ak_3.1,Sb.ng.ak.3.1R,,sbovis,nigeria,ak,3,1
...,...,...,...,...,...,...,...,...
200,Sh_NG_os_3_1,sh_ng_os_3_1,,shaematobium,nigeria,osun,3,1
201,c_Sh_NG_os_3_11,Sh.ng.os.3.11F,,shaematobium,nigeria,osun,3,11
202,c_Sh_NG_os_3_5,Sh.ng.os.3.5F,,shaematobium,nigeria,osun,3,5
203,c_Sh_NG_os_3_6,Sh.ng.os.3.6F,,shaematobium,nigeria,osun,3,6


# Get results from other analyses

In [123]:
Path(f"{results_dir}/sanger").mkdir(parents=True, exist_ok=True)
os.chdir(f"{results_dir}/sanger")

In [None]:
%%bash

cp ~/sch_hae_its-nigeria/data/sanger_its_sequences.fas .

In [None]:
%%bash 

conda run -n muscle --align sanger_its_sequences.fas --output sanger_its_seqeunces.muscle.fas

In [125]:
#manually trimmed the ends so that each sample contains a complete sequence

In [126]:
its_fasta_file = 'sanger_its_seqeunces.muscle.trimmed.fas'

In [127]:
merged_df=info_df.copy()

## Add ITS sequences

In [128]:
# Dictionary to store sequences and their identifiers
# Key: sequence (as a string), Value: list of sequence identifiers
sequence_dict = defaultdict(list)

# Read the FASTA file
for record in SeqIO.parse(its_fasta_file, "fasta"):
    # Append the record.id to the list of ids for this sequence
    sequence_dict[record.id].append(str(record.seq))

its_sequences=[]

for index, row in merged_df.iterrows():
    try:
        seq=sequence_dict[row["its_id"]][0]
        its_sequences.append(seq)
    except:
        its_sequences.append("")
        
merged_df["its_seq"] = its_sequences

merged_df

Unnamed: 0,wgs_id,its_id,sra,species,country,locale,patient,miracidum,its_seq
0,Sb_NG_ak_1.1,Sb.ng.ak.1.1F,,sbovis,nigeria,ak,1,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...
1,Sb_NG_ak_2.1,sb_ng_ak_2.1,,sbovis,nigeria,ak,2,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...
2,Sb_NG_ak_2.2,Sb.ng.ak.2.2F,,sbovis,nigeria,ak,2,2,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...
3,Sb_NG_ak_2.3,Sb.ng.ak.2.3F,,sbovis,nigeria,ak,2,3,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...
4,Sb_NG_ak_3.1,Sb.ng.ak.3.1R,,sbovis,nigeria,ak,3,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...
...,...,...,...,...,...,...,...,...,...
200,Sh_NG_os_3_1,sh_ng_os_3_1,,shaematobium,nigeria,osun,3,1,CATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...
201,c_Sh_NG_os_3_11,Sh.ng.os.3.11F,,shaematobium,nigeria,osun,3,11,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...
202,c_Sh_NG_os_3_5,Sh.ng.os.3.5F,,shaematobium,nigeria,osun,3,5,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...
203,c_Sh_NG_os_3_6,Sh.ng.os.3.6F,,shaematobium,nigeria,osun,3,6,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...


## Assign ITS seqeunces haplotype IDS

In [129]:
genotype_groups = (
    merged_df.groupby("its_seq")
    .agg(genotype_count=("its_seq", "count"))
    .reset_index()
)

genotype_groups

# Assign haplotype labels
genotype_groups["genotype_label"] = genotype_groups.apply(
    lambda row: f"its-{row.name + 1}_n{row.genotype_count}", axis=1
)

# Merge the haplotype labels back to the original DataFrame
merged_df = merged_df.merge(
    genotype_groups[["its_seq", "genotype_label"]],
    on="its_seq",
    how="left"
)

# Display the resulting DataFrame
merged_df.to_csv("its_df.csv", sep=",", header=True, index=False)
merged_df

Unnamed: 0,wgs_id,its_id,sra,species,country,locale,patient,miracidum,its_seq,genotype_label
0,Sb_NG_ak_1.1,Sb.ng.ak.1.1F,,sbovis,nigeria,ak,1,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34
1,Sb_NG_ak_2.1,sb_ng_ak_2.1,,sbovis,nigeria,ak,2,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34
2,Sb_NG_ak_2.2,Sb.ng.ak.2.2F,,sbovis,nigeria,ak,2,2,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34
3,Sb_NG_ak_2.3,Sb.ng.ak.2.3F,,sbovis,nigeria,ak,2,3,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-33_n1
4,Sb_NG_ak_3.1,Sb.ng.ak.3.1R,,sbovis,nigeria,ak,3,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34
...,...,...,...,...,...,...,...,...,...,...
200,Sh_NG_os_3_1,sh_ng_os_3_1,,shaematobium,nigeria,osun,3,1,CATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-3_n7
201,c_Sh_NG_os_3_11,Sh.ng.os.3.11F,,shaematobium,nigeria,osun,3,11,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-9_n44
202,c_Sh_NG_os_3_5,Sh.ng.os.3.5F,,shaematobium,nigeria,osun,3,5,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-17_n14
203,c_Sh_NG_os_3_6,Sh.ng.os.3.6F,,shaematobium,nigeria,osun,3,6,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-9_n44


In [130]:
with open("its_genotypes.fas", 'w') as f:
    for index, row in merged_df.iterrows():
        its_id=row["its_id"]
        geno_id=row["genotype_label"]
        seq=row["its_seq"]
        f.write(f">{its_id}#{geno_id}\n{seq}\n")

In [131]:
diagnostic_positions = [33, 685, 740, 790, 860]

sample_diagnostic_calls = []
sample_diagnostic_genotype=[]

for index, row in merged_df.iterrows():
    seq=row["its_seq"]
    try:
        calls=[seq[pos - 1] for pos in diagnostic_positions]
        genotype="".join(calls)
        
        sample_diagnostic_calls.append(calls)
        sample_diagnostic_genotype.append(genotype)
    except:
        calls=["?","?","?","?","?"]
        genotype="".join(calls)
    
        sample_diagnostic_calls.append(calls)
        sample_diagnostic_genotype.append(genotype)

merged_df["diagnostic_genotype"]=sample_diagnostic_genotype

# Create a DataFrame from the diagnostic calls
diagnostic_df = pd.DataFrame(
    sample_diagnostic_calls, 
    columns=[f"diagnostic_site_{pos}" for pos in diagnostic_positions]
)

# Concatenate the diagnostic DataFrame with merged_df
merged_df = pd.concat([merged_df, diagnostic_df], axis=1)

# Display the updated DataFrame
merged_df.to_csv("its_df.csv", sep=",", header=True, index=False)
merged_df

Unnamed: 0,wgs_id,its_id,sra,species,country,locale,patient,miracidum,its_seq,genotype_label,diagnostic_genotype,diagnostic_site_33,diagnostic_site_685,diagnostic_site_740,diagnostic_site_790,diagnostic_site_860
0,Sb_NG_ak_1.1,Sb.ng.ak.1.1F,,sbovis,nigeria,ak,1,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34,GATAT,G,A,T,A,T
1,Sb_NG_ak_2.1,sb_ng_ak_2.1,,sbovis,nigeria,ak,2,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34,GATAT,G,A,T,A,T
2,Sb_NG_ak_2.2,Sb.ng.ak.2.2F,,sbovis,nigeria,ak,2,2,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34,GATAT,G,A,T,A,T
3,Sb_NG_ak_2.3,Sb.ng.ak.2.3F,,sbovis,nigeria,ak,2,3,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-33_n1,GATAT,G,A,T,A,T
4,Sb_NG_ak_3.1,Sb.ng.ak.3.1R,,sbovis,nigeria,ak,3,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34,GATAT,G,A,T,A,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,Sh_NG_os_3_1,sh_ng_os_3_1,,shaematobium,nigeria,osun,3,1,CATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-3_n7,AGCGC,A,G,C,G,C
201,c_Sh_NG_os_3_11,Sh.ng.os.3.11F,,shaematobium,nigeria,osun,3,11,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-9_n44,AGCGC,A,G,C,G,C
202,c_Sh_NG_os_3_5,Sh.ng.os.3.5F,,shaematobium,nigeria,osun,3,5,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-17_n14,AGCGC,A,G,C,G,C
203,c_Sh_NG_os_3_6,Sh.ng.os.3.6F,,shaematobium,nigeria,osun,3,6,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-9_n44,AGCGC,A,G,C,G,C


In [132]:
merged_df["diagnostic_genotype"].unique()

array(['GATAT', 'AGCGC', 'ARYRC', 'ARYRY', 'AATAT', 'ARYAT', 'ARCRC',
       '?????'], dtype=object)

In [133]:
diagnostic_classes = {'AGCGC': 'SHxSH', 
                      'ARYRC': 'SHxSC', 
                      'ARYRY': 'SHxSC', 
                      'AATAT': 'SCxSC', 
                      'GATAT': 'SBxSB', 
                      'ARCRC': 'SHxSC',
                      'ARYAT': 'SHxSC',
                      '?????': 'NA'}


classes=[]
for index, row in merged_df.iterrows():
    classes.append(diagnostic_classes[row["diagnostic_genotype"]])

merged_df["its_class"] = classes
merged_df

Unnamed: 0,wgs_id,its_id,sra,species,country,locale,patient,miracidum,its_seq,genotype_label,diagnostic_genotype,diagnostic_site_33,diagnostic_site_685,diagnostic_site_740,diagnostic_site_790,diagnostic_site_860,its_class
0,Sb_NG_ak_1.1,Sb.ng.ak.1.1F,,sbovis,nigeria,ak,1,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34,GATAT,G,A,T,A,T,SBxSB
1,Sb_NG_ak_2.1,sb_ng_ak_2.1,,sbovis,nigeria,ak,2,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34,GATAT,G,A,T,A,T,SBxSB
2,Sb_NG_ak_2.2,Sb.ng.ak.2.2F,,sbovis,nigeria,ak,2,2,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34,GATAT,G,A,T,A,T,SBxSB
3,Sb_NG_ak_2.3,Sb.ng.ak.2.3F,,sbovis,nigeria,ak,2,3,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-33_n1,GATAT,G,A,T,A,T,SBxSB
4,Sb_NG_ak_3.1,Sb.ng.ak.3.1R,,sbovis,nigeria,ak,3,1,TATAATGATGCATGCACCTGGCTTCTTGCTGGGCTGTATGTACCCT...,its-32_n34,GATAT,G,A,T,A,T,SBxSB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,Sh_NG_os_3_1,sh_ng_os_3_1,,shaematobium,nigeria,osun,3,1,CATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-3_n7,AGCGC,A,G,C,G,C,SHxSH
201,c_Sh_NG_os_3_11,Sh.ng.os.3.11F,,shaematobium,nigeria,osun,3,11,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-9_n44,AGCGC,A,G,C,G,C,SHxSH
202,c_Sh_NG_os_3_5,Sh.ng.os.3.5F,,shaematobium,nigeria,osun,3,5,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-17_n14,AGCGC,A,G,C,G,C,SHxSH
203,c_Sh_NG_os_3_6,Sh.ng.os.3.6F,,shaematobium,nigeria,osun,3,6,TATAATGATGCATGCACCTGGCTTCTTGCTGGACTGTATGTACCCT...,its-9_n44,AGCGC,A,G,C,G,C,SHxSH


In [137]:
merged_df[["wgs_id", "its_id", "its_seq", "genotype_label", "diagnostic_genotype", "diagnostic_site_33", "diagnostic_site_685", "diagnostic_site_740", 
           "diagnostic_site_790", "diagnostic_site_860", "its_class"]].to_csv("its_class.csv", sep=",", header=True, index=False)

In [135]:
target_species = ['shaematobium', 'sbovis', 'shxsb', 'sbxsc', 'scurassoni']
merged_df.loc[~merged_df["species"].isin(target_species)]

Unnamed: 0,wgs_id,its_id,sra,species,country,locale,patient,miracidum,its_seq,genotype_label,diagnostic_genotype,diagnostic_site_33,diagnostic_site_685,diagnostic_site_740,diagnostic_site_790,diagnostic_site_860,its_class
148,ERR119612,,ERR119612,guineensis,saotome,na,na,na,,its-1_n26,?????,?,?,?,?,?,
149,ERR119613,,ERR119613,intercalatum,drcongo,na,na,na,,its-1_n26,?????,?,?,?,?,?,
150,ERR310940,,ERR310940,margrebowiei,zambia,na,na,na,,its-1_n26,?????,?,?,?,?,?,
151,ERR103051,,ERR103051,matthei,zambia,na,na,na,,its-1_n26,?????,?,?,?,?,?,


In [136]:
merged_df["its_class"].value_counts()

its_class
SHxSH    72
SBxSB    41
SHxSC    37
SCxSC    29
NA       26
Name: count, dtype: int64