In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import re
from collections import Counter
import os
from collections import Counter


In [2]:
ids = []
sequences = []

fastq_file = "dep fastq files/ERR10786170.fastq"

with open(fastq_file, "r") as handle:
    for i, record in enumerate(SeqIO.parse(handle, "fastq")):
        ids.append(record.id)
        sequences.append(record.seq)

df1 = pd.DataFrame({'ID': ids, 'Sequence': sequences})

df1.head()

Unnamed: 0,ID,Sequence
0,ERR10786170.1,"(G, G, T, T, C, A, A, A, A, T, G, C, C, C, G, ..."
1,ERR10786170.1,"(C, N, C, C, A, T, T, A, C, C, T, T, T, C, G, ..."
2,ERR10786170.2,"(A, G, T, T, T, C, T, T, C, C, A, G, T, T, T, ..."
3,ERR10786170.2,"(T, T, T, T, G, C, A, A, G, T, A, T, T, C, A, ..."
4,ERR10786170.3,"(A, G, C, C, T, G, A, A, C, A, T, C, T, C, C, ..."


In [3]:
genomic_sequence = df1['Sequence'][0]

result_handle = NCBIWWW.qblast("blastn", "nt", genomic_sequence)

blast_records = NCBIXML.parse(result_handle)

for blast_record in blast_records:
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < 0.05:
                print("Bacteria Name:", alignment.title)

Bacteria Name: gi|1887004768|gb|CP040530.1| Bacteroides thetaiotaomicron strain DSM 2079 chromosome, complete genome
Bacteria Name: gi|2323338499|gb|CP083687.1| Bacteroides thetaiotaomicron strain VPI-2808B chromosome
Bacteria Name: gi|29342101|gb|AE015928.1| Bacteroides thetaiotaomicron VPI-5482, complete genome
Bacteria Name: gi|2290644306|gb|CP103155.1| Bacteroides thetaiotaomicron strain BFG-576 chromosome, complete genome
Bacteria Name: gi|2290595203|gb|CP103214.1| Bacteroides thetaiotaomicron strain BFG-498 chromosome, complete genome
Bacteria Name: gi|2202814649|gb|CP092641.1| Bacteroides thetaiotaomicron strain VPI 5482 chromosome, complete genome
Bacteria Name: gi|2290770942|gb|CP103238.1| Bacteroides thetaiotaomicron strain BFG-130 chromosome, complete genome
Bacteria Name: gi|2323328724|gb|CP083685.1| Bacteroides thetaiotaomicron strain VPI-3443 chromosome
Bacteria Name: gi|2290599450|gb|CP103118.1| Bacteroides thetaiotaomicron strain BFG-510 chromosome, complete genome
Bact

In [2]:
fastq_file = "dep fastq files/ERR10786170.fastq"

ids = []
sequences = []

with open(fastq_file, "r") as handle:
    for record in SeqIO.parse(handle, "fastq"):
        ids.append(record.id)
        sequences.append(str(record.seq))

df1 = pd.DataFrame({'ID': ids, 'Sequence': sequences})

genomic_sequence = df1['Sequence'][0][:1000]
result_handle = NCBIWWW.qblast("blastn", "nt", genomic_sequence)
blast_records = NCBIXML.parse(result_handle)

genera = []

for blast_record in blast_records:
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < 0.05:
                order_match = re.search(r"Uncultured (\b[A-Z][a-z]+(?:ales|aceae|ota|etes)\b)", alignment.title)
                match = re.search(r"Uncultured ([A-Z][a-z]+) sp", alignment.title) 
                general_match = re.search(r"\b([A-Z][a-z]+)\b(?!\s*(bacterium|organism|RNA|DNA))", alignment.title)
                unknown_match = re.search(r"Uncultured (bacterium|organism)", alignment.title, re.IGNORECASE)
                rumen_match = re.search(r"Uncultured rumen bacterium", alignment.title, re.IGNORECASE)

                
                if order_match:
                    genera.append(order_match.group(1))
                elif match:
                    genera.append(match.group(1))
                elif rumen_match:
                    genera.append("Rumen Bacterium")  # Custom category for clarity
                elif general_match:
                    genera.append(general_match.group(1))
                elif unknown_match:
                    genera.append("Uncultured Organism")



genus_counts = Counter(genera)

df = pd.DataFrame(genus_counts.items(), columns=['Genus', 'Count'])
df['Relative Abundance'] = df['Count'] / df['Count'].sum()
df = df.drop(columns=['Count'])
df = df.sort_values(by='Relative Abundance', ascending=False)

df = df.T
df.insert(0, 'File Name', os.path.basename(fastq_file))
df = df.reset_index(drop=True)
df.columns = df.iloc[0] 
df = df.drop(0)
df = df.rename(columns={os.path.basename(fastq_file): 'File Name'})

df


Unnamed: 0,File Name,Bacteroides,Parabacteroides
1,ERR10786170.fastq,0.921569,0.078431


In [3]:
df.columns

Index(['File Name', 'Enterocloster', 'Ruminococcus', 'Eubacterium',
       'Caproicibacterium', 'Agathobacter', 'Dorea', 'Blautia',
       'Eisenbergiella', 'Roseburia', 'Clostridiales', 'Anaerocolumna',
       'Lachnoclostridium', 'Marvinbryantia', 'Caudoviricetes',
       'Rossellomorea', 'Bacteriophage', 'Candidatus', 'Bacillota',
       'Domibacillus', 'Caproicibacter', 'Caproiciproducens'],
      dtype='object', name=0)

In [3]:
# title = "Bacteria Name: gi|2025053910|dbj|AP023418.1| Vescimonas coprocola MM50 DNA, complete genome"

# match = re.search(r"\b([A-Z][a-z]+[a-zA-Z]*)\b(?!.*bacterium)(?!.*organism)(?!.*RNA)(?!.*DNA)", title)

# print("Match:", match.group(1) if match else "No match")


In [4]:
# title = "gi|2025053910|dbj|AP023418.1| Vescimonas coprocola MM50 DNA, complete genome"

# # Perform regex searches
# order_match = re.search(r"Uncultured (\b[A-Z][a-z]+(?:ales|aceae|ota|etes)\b)", title)
# match = re.search(r"Uncultured ([A-Z][a-z]+) sp", title)
# rumen_match = re.search(r"Uncultured rumen bacterium", title, re.IGNORECASE)
# unknown_match = re.search(r"Uncultured (bacterium|organism)", title, re.IGNORECASE)
# general_match = re.search(r"\b([A-Z][a-z]+)\b(?!\s*(bacterium|organism|RNA|DNA))", title)

# # Initialize the genera list
# genera = []

# # Check matches and append accordingly
# if order_match:
#     print("Order match found:", order_match.group(1))
#     genera.append(order_match.group(1))
# elif match:
#     print("Species match found:", match.group(1))
#     genera.append(match.group(1))
# elif rumen_match:
#     print("Rumen match found")
#     genera.append("Rumen Bacterium")
# elif unknown_match:
#     print("Unknown match found")
#     genera.append("Uncultured Organism")
# elif general_match:
#     print("General match found:", general_match.group(1))
#     genera.append(general_match.group(1))
# else:
#     print("No match found for:", title)

# # Print results
# print("Final genera list:", genera)


In [5]:
# title = "gi|2025053910|dbj|AP023418.1| Vescimonas coprocola MM50 DNA, complete genome"

# # Simplified regex to check for "Vescimonas"
# match = re.search(r"\b([A-Z][a-z]+)\b", title)

# if match:
#     print("Match found:", match.group(1))
# else:
#     print("No match found")
