In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import re
from collections import Counter
import os

In [2]:
fastq_folder = "sch fastq files"
fastq_files = [os.path.join(fastq_folder, file) for file in os.listdir(fastq_folder) if file.endswith(".fastq")]

all_genera = []

for fastq_file in fastq_files:
    ids = []
    sequences = []

    with open(fastq_file, "r") as handle:
        for record in SeqIO.parse(handle, "fastq"):
            ids.append(record.id)
            sequences.append(str(record.seq))


    df1 = pd.DataFrame({'ID': ids, 'Sequence': sequences})

    genomic_sequence = df1['Sequence'][0][:1000]
    result_handle = NCBIWWW.qblast("blastn", "nt", genomic_sequence)
    blast_records = NCBIXML.parse(result_handle)

    genera = []

    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < 0.05:
                    order_match = re.search(r"Uncultured (\b[A-Z][a-z]+(?:ales|aceae|ota|etes)\b)", alignment.title)
                    match = re.search(r"Uncultured ([A-Z][a-z]+) sp", alignment.title)
                    general_match = re.search(r"\b([A-Z][a-z]+)\b(?!\s*(bacterium|organism|RNA|DNA))", alignment.title)
                    unknown_match = re.search(r"Uncultured (bacterium|organism)", alignment.title, re.IGNORECASE)
                    rumen_match = re.search(r"Uncultured rumen bacterium", alignment.title, re.IGNORECASE)

                    if order_match:
                        genera.append(order_match.group(1))
                    elif match:
                        genera.append(match.group(1))
                    elif rumen_match:
                        genera.append("Rumen Bacterium")
                    elif general_match:
                        genera.append(general_match.group(1))
                    elif unknown_match:
                        genera.append("Uncultured Organism")
                    
    genus_counts = Counter(genera)

    df = pd.DataFrame(genus_counts.items(), columns=['Genus', 'Count'])

    df['Relative Abundance'] = df['Count'] / df['Count'].sum()

    df = df.drop(columns=['Count'])

    df = df.sort_values(by='Relative Abundance', ascending=False)
    
    df = df.T

    df.insert(0, 'File Name', os.path.basename(fastq_file))
    
    df = df.reset_index(drop=True)
    
    df.columns = df.iloc[0]
    df = df.drop(0)
    
    df = df.rename(columns={os.path.basename(fastq_file): 'File Name'})
    
    all_genera.append(df)
    
final_df = pd.concat(all_genera, ignore_index=True)




In [3]:
final_df

Unnamed: 0,File Name,Uncultured Organism,Faecalibacterium,Oscillospiraceae,Rumen Bacterium,Clostridium,Monoglobus,Uncultured,Clostridiaceae,Coprococcus,...,Escherichia,Chain,Streptococcus,Prevotellaceae,Christensenella,Eubacterium,Ruminococcaceae,Holdemanella,Eisenbergiella,Bacterium
0,SRR18576085.fastq,0.88,0.12,,,,,,,,...,,,,,,,,,,
1,SRR18576120.fastq,0.74,,0.1,0.06,0.04,0.02,0.02,0.02,,...,,,,,,,,,,
2,SRR18576104.fastq,0.82,,,,,,0.08,,0.1,...,,,,,,,,,,
3,SRR18576106.fastq,0.836364,0.163636,,,,,,,,...,,,,,,,,,,
4,SRR18576087.fastq,1.0,,,,,,,,,...,,,,,,,,,,
5,SRR18576122.fastq,1.0,,,,,,,,,...,,,,,,,,,,
6,SRR18576126.fastq,1.0,,,,,,,,,...,,,,,,,,,,
7,SRR18576102.fastq,0.94,,,,,,,,,...,,,,,,,,,,
8,SRR18576100.fastq,0.94,,,,0.06,,,,,...,,,,,,,,,,
9,SRR18576098.fastq,,,,,,,,,,...,,,,,,,,,,


In [14]:
final_df.to_csv('extra_sch_data.csv')

In [4]:
# final_df = pd.concat(all_genera, ignore_index=True)
# #final_df = final_df.drop(['File Name'], axis=1)
# final_df

In [5]:
# dft = final_df.T
# dft = dft.reset_index()
# dft.columns = dft.iloc[0]
# df_final = dft.drop(0).reset_index(drop=True)
# #df_final = df_final.iloc[1:].reset_index(drop=True)
# #df_final = df_final.drop(df_final.columns[0], axis=1)
# df_final = df_final.rename(columns={'Genus': 'File Name'})
# df_final

In [6]:
# dft.reset_index()
# dft

In [7]:
# genomic_sequence = df1['Sequence'][0][:1000]
# result_handle = NCBIWWW.qblast("blastn", "nt", genomic_sequence)
# blast_records = NCBIXML.parse(result_handle)

In [8]:
# genera = []

# for blast_record in blast_records:
#     for alignment in blast_record.alignments:
#         for hsp in alignment.hsps:
#             if hsp.expect < 0.05:
#                 match = re.search(r"\b([A-Z][a-z]+)\b", alignment.title)  # Extract first capitalized word
#                 if match:
#                     genus = match.group(1)
#                     genera.append(genus)
                    
# genus_counts = Counter(genera)

# df = pd.DataFrame(genus_counts.items(), columns=['Genus', 'Count'])

# df['Relative Abundance'] = df['Count'] / df['Count'].sum()

# df = df.drop(columns=['Count'])

# df = df.sort_values(by='Relative Abundance', ascending=False)



In [9]:
# df = df.rename(columns={'Genus': 'Run ID', 'Relative Abundance': 'file name'})
# df

In [10]:
# dft = df.T
# dft = dft.reset_index()
# dft.columns = dft.iloc[0]
# df_final = dft.drop(0).reset_index(drop=True)
# df_final

In [11]:
# all_genera = []
# for fastq_file in os.listdir(fastq_folder):
#     fastq_path = os.path.join(fastq_folder, fastq_file)
#     for path in fastq_path:
#         genera = []

#         for blast_record in blast_records:
#             for alignment in blast_record.alignments:
#                 for hsp in alignment.hsps:
#                     if hsp.expect < 0.05:
#                         match = re.search(r"\b([A-Z][a-z]+)\b", alignment.title)
#                         if match:
#                             genus = match.group(1)
#                             if genus.lower() not in ["uncultured", "bacterium", "organism"]:
#                                 genera.append(genus)

# list_genera = all_genera

# # genus_counts = Counter(genera)

# # df = pd.DataFrame(genus_counts.items(), columns=['Genus', 'Count'])

# # df['Relative Abundance'] = df['Count'] / df['Count'].sum()

# # df = df.sort_values(by='Relative Abundance', ascending=False)

# # df.insert(0, 'Unique Sequence ID', [seq_id for seq_id, _ in all_genera])

# # df

In [12]:
# all_genera = []

# for index, row in df.iterrows():
#     unique_id = row["ID"]
#     genomic_sequence = row["Sequence"][:1000]


#     result_handle = NCBIWWW.qblast("blastn", "nt", genomic_sequence)
#     blast_records = NCBIXML.parse(result_handle)

#     genera = []
#     for blast_record in blast_records:
#         for alignment in blast_record.alignments:
#             for hsp in alignment.hsps:
#                 if hsp.expect < 0.05:  # Consider only significant matches
#                     match = re.search(r"\b([A-Z][a-z]+)\b", alignment.title)
#                     if match:
#                         genus = match.group(1)
#                         if genus.lower() not in ["uncultured", "bacterium", "organism"]:
#                             genera.append(genus)

# all_genera.append({"ID": unique_id, "Genera": genera})


In [13]:
# flat_genera = [genus for _, genera in all_genera for genus in genera]
# genus_counts = Counter(flat_genera)

# df_genera = pd.DataFrame(genus_counts.items(), columns=['Genus', 'Count'])

# df_genera['Relative Abundance'] = df_genera['Count'] / df_genera['Count'].sum()

# df_genera = df_genera.sort_values(by='Relative Abundance', ascending=False)

# print(df_genera)