In [1]:
import pandas as pd
import json

from Bio import Entrez, SeqIO
import lxml

API_KEY = "aabef60cf233ca4c95ce4dd917ddca3a5309" # Please extract api key from NCBI website (sign in and get access to API KEY) 
Entrez.email = 'b_thapamagar@mail.fhsu.edu' # Mention the email address same as which is used to sign in NCBI.
Entrez.api_key = API_KEY

In [2]:
sra_metadata_list = []
with open("Data/SRA_data/DATA_sra_info_in_pmc.json", 'r') as file:
    sra_metadata_list = json.load(file)
# sra_metadata_list

In [3]:
nucleotide_metadata_list = []
with open("Data/Nucleotide_metadata/DATA_nucleotide_metadata_list.json", 'r') as file:
    nucleotide_metadata_list = json.load(file)
# nucleotide_metadata_list

In [4]:
print(len(sra_metadata_list))
print(len(nucleotide_metadata_list))

1976
5144


In [5]:
# Nucleotide csv file was missing bioporject accession information. So, this script is used to extract the bioproject information from nucleotide csv and add it to the nucleotide metadata json file.

nucleotide_list = pd.read_csv("DATA_Nucleotide_Pubmed_data_combined.csv")
nucleotide_list_for_bioproj = []
for item in nucleotide_metadata_list:
    nucleotide_row = nucleotide_list[nucleotide_list['AccessionID'] == item['AccessionID']]
    item["BioProject"] = nucleotide_row["BioProject"].values[0]
    nucleotide_list_for_bioproj.append(item)

In [6]:
from collections import defaultdict

grouped_sra_data = defaultdict(list)
for item in sra_metadata_list:
    grouped_sra_data[item['BioProject']].append(item)

In [7]:
len(grouped_sra_data.keys())

28

In [8]:
grouped_nucleotide_data = defaultdict(list)
for item in nucleotide_metadata_list:
    grouped_nucleotide_data[item['BioProject']].append(item)

In [9]:
len(grouped_nucleotide_data.keys())

28

In [10]:
count = 0
is_found = False
for item in nucleotide_metadata_list:
    if("isolate" in item.get("Source", {})):
        isolate_id = item["Source"]["isolate"]
        for sra_item in sra_metadata_list:
            if "LIBRARY_NAME" in sra_item.get("Library", {}):
                if sra_item["Library"]["LIBRARY_NAME"] == isolate_id:
                    is_found = True
                    print(f"Found matching SRA item for {item["AccessionID"]}: {sra_item["SRA_Id"]}")
        if is_found:
            count += 1
        is_found = False
print(f"\nTotal number of matching SRA items found comparing isolate from Nucleotide and Library name from SRA: {count}")

Found matching SRA item for KY797263: 4685339
Found matching SRA item for KY797262: 4685351
Found matching SRA item for KY797261: 4685347
Found matching SRA item for KY797260: 4685346
Found matching SRA item for KY797259: 4685344
Found matching SRA item for KY797258: 4685343
Found matching SRA item for KY797257: 4685341
Found matching SRA item for KY797256: 4685340
Found matching SRA item for KY797255: 4685353
Found matching SRA item for KY797254: 4685352
Found matching SRA item for KY797253: 4685345
Found matching SRA item for KY797250: 4685348
Found matching SRA item for MK139624: 11322282
Found matching SRA item for MK139623: 11322296
Found matching SRA item for MK139622: 11322277
Found matching SRA item for MK139621: 11322297
Found matching SRA item for MK139620: 11322283
Found matching SRA item for MK139619: 11322293
Found matching SRA item for MK139618: 11322292
Found matching SRA item for MK139617: 11322291
Found matching SRA item for MK139616: 11322276
Found matching SRA item f

In [11]:
# iterate through key in grouped dict: Specific BioProject contains multiple SRA and Nucleotide data

for key in grouped_nucleotide_data.keys():
    nucleotide_list = grouped_nucleotide_data[key]
    sra_list = grouped_sra_data[key]
    print(f"{key}: nucleotide_len: {len(nucleotide_list)} and sra_len: {len(sra_list)}")

PRJNA889366: nucleotide_len: 1 and sra_len: 2
PRJNA850430: nucleotide_len: 1 and sra_len: 1
PRJNA644600: nucleotide_len: 1 and sra_len: 10
PRJNA636291: nucleotide_len: 105 and sra_len: 97
PRJNA421333: nucleotide_len: 55 and sra_len: 80
PRJNA416844: nucleotide_len: 98 and sra_len: 15
PRJNA335708: nucleotide_len: 1 and sra_len: 14
PRJNA295854: nucleotide_len: 1 and sra_len: 1
PRJNA266605: nucleotide_len: 1 and sra_len: 1
PRJNA213001: nucleotide_len: 44 and sra_len: 44
PRJEB58818: nucleotide_len: 1 and sra_len: 2
PRJEB58193: nucleotide_len: 1 and sra_len: 1
PRJEB47085: nucleotide_len: 1142 and sra_len: 4
PRJEB46830: nucleotide_len: 1856 and sra_len: 95
PRJEB4417: nucleotide_len: 1058 and sra_len: 623
PRJEB43051: nucleotide_len: 1 and sra_len: 8
PRJEB42372: nucleotide_len: 84 and sra_len: 50
PRJEB36639: nucleotide_len: 415 and sra_len: 649
PRJEB35466: nucleotide_len: 6 and sra_len: 40
PRJEB29663: nucleotide_len: 22 and sra_len: 10
PRJEB29569: nucleotide_len: 73 and sra_len: 48
PRJEB29189: 

In [12]:
# PRJNA213001: nucleotide_len: 44 and sra_len: 44

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJNA213001_mapped_record.json"

sra_list = []
sra_list = grouped_sra_data["PRJNA213001"]
nucleotide_list = grouped_nucleotide_data["PRJNA213001"]
mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    isolate_chunks = isolate_name.split("_")
    if(len(isolate_chunks)> 1):
        isolate_name = f"{isolate_chunks[0]} {isolate_chunks[1]}"
        # print(isolate_name)
        matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Sample_Alias"] == isolate_name]
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

# len(mapped_records)
with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)
        
print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 44
Unique one to one mapped count: 44


In [13]:
# PRJEB22699: nucleotide_len: 18 and sra_len: 23

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJEB22699_mapped_record.json"

sra_list = grouped_sra_data["PRJEB22699"]
nucleotide_list = grouped_nucleotide_data["PRJEB22699"]
mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Library"]["LIBRARY_NAME"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 13
Unique one to one mapped count: 4


In [14]:
# PRJEB29569: nucleotide_len: 73 and sra_len: 48

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJEB29569_mapped_record.json"

sra_list = grouped_sra_data["PRJEB29569"]
nucleotide_list = grouped_nucleotide_data["PRJEB29569"]
mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Library"]["LIBRARY_NAME"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 48
Unique one to one mapped count: 48


In [15]:
# PRJEB29189: nucleotide_len: 16 and sra_len: 21

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJEB29189_mapped_record.json"

PRJEB29189_sra_list = grouped_sra_data["PRJEB29189"]
nucleotide_list = grouped_nucleotide_data["PRJEB29189"]
mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    matched_sra_list = [sra_item for sra_item in PRJEB29189_sra_list if sra_item["Library"]["LIBRARY_NAME"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 16
Unique one to one mapped count: 12


In [16]:
# ERP000381: nucleotide_len: 109 and sra_len: 109

file_name = "Data/Nucleotide_SRA_Mapped_data/ERP000381_mapped_record.json"

sra_list = grouped_sra_data["ERP000381"]
nucleotide_list = grouped_nucleotide_data["ERP000381"]

mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Sample_Ref_Name"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 109
Unique one to one mapped count: 109


In [17]:
# Downloading Nucleotide records for Early farmers from across Europe directly descended from Neolithic Aegeans	bioRxivorg, 032763 (2016)	KU171094-KU171100	PRJEB11848

import logging

from CODE_mitochondrial_nucleotide_info_extraction import NucleotideInteract

log = logging.getLogger(__name__)

nucleotide_interact = NucleotideInteract(email="b_thapamagar@mail.fhsu.edu", logger=log)

nucleotide_data_list = []
for i in range(4, 10):
    accession_id = f"KU17109{i}"
    if(i == 10):
        accession_id = f"KU171100"

    nucleotide_item = {
                "AccessionID": accession_id,
                "BioProject": "PRJEB11848"
            }
    nucleotide_info = nucleotide_interact.fetch_nucleotide_info(accession_id)
    if(len(nucleotide_info) > 0):
            if("GBSeq_definition" in nucleotide_info[0]):
                nucleotide_item["Title"] = nucleotide_info[0]["GBSeq_definition"]
            if("GBSeq_comment" in nucleotide_info[0]):
                nucleotide_item["Assembly_Info"] = (nucleotide_info[0]["GBSeq_comment"])
    source_info = nucleotide_interact.extract_source_info_from_nucleotide_info(nucleotide_info)
    nucleotide_item["Source"] = source_info
    nucleotide_data_list.append(nucleotide_item)
    
    with open("Data/Nucleotide_metadata/PRJEB11848_nucleotide.json", "w") as file:
        json.dump(nucleotide_data_list, file, indent=4)


In [18]:
# PRJEB11848: nucleotide_len: 6 and sra_len: 5

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJEB11848_mapped_record.json"

sra_list = grouped_sra_data["PRJEB11848"]
    
nucleotide_list = []
with open("Data/Nucleotide_metadata/PRJEB11848_nucleotide.json", 'r') as file:
    nucleotide_list = json.load(file)
    
mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["submitter_id"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 4
Unique one to one mapped count: 4


In [19]:
# PRJEB36639: nucleotide_len: 415 and sra_len: 649

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJEB36639_mapped_record.json"

sra_list = grouped_sra_data["PRJEB36639"]
nucleotide_list = grouped_nucleotide_data["PRJEB36639"]
mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Sample_Title"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 415
Unique one to one mapped count: 182


In [20]:
# PRJEB4417: nucleotide_len: 1058 and sra_len: 623

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJEB4417_mapped_record.json"

sra_list = grouped_sra_data["PRJEB4417"]
nucleotide_list = grouped_nucleotide_data["PRJEB4417"]

mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Sample_Title"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 622
Unique one to one mapped count: 621


In [21]:
# PRJNA416844: nucleotide_len: 98 and sra_len: 15

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJNA416844_mapped_record.json"

sra_list = grouped_sra_data["PRJNA416844"]
nucleotide_list = grouped_nucleotide_data["PRJNA416844"]

mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Library"]["LIBRARY_NAME"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 12
Unique one to one mapped count: 12


In [22]:
# PRJNA421333: nucleotide_len: 55 and sra_len: 80

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJNA421333_mapped_record.json"

sra_list = grouped_sra_data["PRJNA421333"]
nucleotide_list = grouped_nucleotide_data["PRJNA421333"]
mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    isolate_chunks = isolate_name.split("_")
    if(len(isolate_chunks)> 0):
        isolate_name = isolate_chunks[0]
        matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Title"].split(": ")[1] == isolate_name]
        if(len(matched_sra_list) > 0):
            mapped_object = {
                "nucleotide_info": nucleotide_item,
                "sra_info": matched_sra_list,
                "sra_count": len(matched_sra_list)
            }
            mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 36
Unique one to one mapped count: 10


In [23]:
# PRJEB29663: nucleotide_len: 22 and sra_len: 10

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJEB29663_mapped_record.json"

sra_list = grouped_sra_data["PRJEB29663"]
nucleotide_list = grouped_nucleotide_data["PRJEB29663"]

mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    isolate_chunks = isolate_name.split(" ")
    if(len(isolate_chunks)> 0):
        isolate_name = isolate_chunks[-1]
        matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Sample_Title"] == isolate_name]
        if(len(matched_sra_list) > 0):
            mapped_object = {
                "nucleotide_info": nucleotide_item,
                "sra_info": matched_sra_list,
                "sra_count": len(matched_sra_list)
            }
            mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 9
Unique one to one mapped count: 9


In [24]:
# PRJNA636291: nucleotide_len: 105 and sra_len: 97

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJNA636291_mapped_record.json"

sra_list = grouped_sra_data["PRJNA636291"]
nucleotide_list = grouped_nucleotide_data["PRJNA636291"]

mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    if("AGENOME_" in isolate_name):
        isolate_name = isolate_name.replace("AGENOME_", "")
    elif("AGENOME-" in isolate_name):
        isolate_name = isolate_name.replace("AGENOME-", "")
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Library"]["LIBRARY_NAME"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

Total mapped count: 97
Unique one to one mapped count: 97


In [25]:
# PRJEB22217: nucleotide_len: 5 and sra_len: 5

file_name = "Data/Nucleotide_SRA_Mapped_data/PRJEB22217_mapped_record.json"

sra_list = grouped_sra_data["PRJEB22217"]
nucleotide_list = grouped_nucleotide_data["PRJEB22217"]

mapped_records = []
for nucleotide_item in nucleotide_list:
    isolate_name = nucleotide_item["Source"]["isolate"]
    if(len(isolate_name) == 4):
        isolate_name = f"{isolate_name[:2]}0{isolate_name[2:]}"
        print(isolate_name)
    matched_sra_list = [sra_item for sra_item in sra_list if sra_item["Sample_Title"] == isolate_name]
    if(len(matched_sra_list) > 0):
        mapped_object = {
            "nucleotide_info": nucleotide_item,
            "sra_info": matched_sra_list,
            "sra_count": len(matched_sra_list)
        }
        mapped_records.append(mapped_object)

with open(file_name, "w") as file:
        json.dump(mapped_records, file, indent=4)

print(f"Total mapped count: {len(mapped_records)}")
print(f"Unique one to one mapped count: {len([item for item in mapped_records if item["sra_count"] == 1])}")

RN039
Total mapped count: 5
Unique one to one mapped count: 5


These BioProjects SRA and nucleotide yet to match
- PRJNA850430: sra_len: 1 and nucleotide_len: 1

- PRJNA266605: sra_len: 1 and nucleotide_len: 1

- PRJEB58193: sra_len: 1 and nucleotide_len: 1
- PRJNA889366: sra_len: 2 and nucleotide_len: 1
- PRJNA295854: sra_len: 1 and nucleotide_len: 1


### Combining nucleotide-SRA mapped metadata

In [38]:
from pathlib import Path

nucleotide_sra_mapped_data_directory = Path("Data/Nucleotide_SRA_Mapped_data")

print(nucleotide_sra_mapped_data_directory.exists())

mapped_data_list = []
for json_file in nucleotide_sra_mapped_data_directory.glob(
    "*.json"
):  # use .rglob("*.json") to include sub‑folders
    # read / process each file (optional)
    # with json_file.open(encoding="utf-8") as f:
    #     payload = json.load(f)

    with open(json_file, "r") as file:
        nucleotide_sra_mapped_data = json.load(file)
        mapped_data_list.extend(nucleotide_sra_mapped_data)

        # do whatever you need here
        print(f"FileName: {json_file.name}")
        print(f"Length: {len(nucleotide_sra_mapped_data)}")
        one_to_one_mapped_records = [item for item in nucleotide_sra_mapped_data if item["sra_count"] == 1]
        print(f"Unique one to one nucleotide SRA mapping count: {len(one_to_one_mapped_records)}")
        print(f"mapped total length: {len(mapped_data_list)}")
        print("\n")

# sra_metadata_list = []
# with open("Data/SRA_data/DATA_sra_info_in_pmc.json", 'r') as file:
#     sra_metadata_list = json.load(file)

True
FileName: PRJEB22217_mapped_record.json
Length: 5
Unique one to one nucleotide SRA mapping count: 5
mapped total length: 5


FileName: PRJNA421333_mapped_record.json
Length: 36
Unique one to one nucleotide SRA mapping count: 10
mapped total length: 41


FileName: PRJNA416844_mapped_record.json
Length: 12
Unique one to one nucleotide SRA mapping count: 12
mapped total length: 53


FileName: PRJEB29663_mapped_record.json
Length: 9
Unique one to one nucleotide SRA mapping count: 9
mapped total length: 62


FileName: PRJEB29569_mapped_record.json
Length: 48
Unique one to one nucleotide SRA mapping count: 48
mapped total length: 110


FileName: PRJEB11848_mapped_record.json
Length: 4
Unique one to one nucleotide SRA mapping count: 4
mapped total length: 114


FileName: PRJEB4417_mapped_record.json
Length: 622
Unique one to one nucleotide SRA mapping count: 621
mapped total length: 736


FileName: PRJEB29189_mapped_record.json
Length: 16
Unique one to one nucleotide SRA mapping count: 1

In [39]:
for item in mapped_data_list:
    # print(item["nucleotide_info"]["AccessionID"])
    accession = item["nucleotide_info"]["AccessionID"]
    try:
        fetch_handle = Entrez.efetch(db="nucleotide", id=accession,  rettype="gb", retmode="text")
        nucleotide_data = list(SeqIO.parse(fetch_handle, "genbank"))
        # print(f"Seqence length: {len(nucleotide_data[0])}")
        item["nucleotide_info"]["SequenceLength"] = len(nucleotide_data[0])
    except Exception as e:
        print(f"Accession: {accession} had problem with sequence length")
        print(f"Error: {e}")
    
    

In [44]:
## Total mapped object length
print(f"mapped total length: {len(mapped_data_list)}")

records_with_unique_mapping = [item for item in mapped_data_list if item["sra_count"] == 1]
print(f"mapped records total length having one to one nucleotide and SRA mapping: {len(records_with_unique_mapping)}")


mapped total length: 1430
mapped records total length having one to one nucleotide and SRA mapping: 1157


In [45]:
### Store in JSON 
combined_json_directory = Path("Data/Data_for_DataVerse")

all_combined_json_file_name = "Complete_Nucleotide_SRA_mapped_data.json"
file_path = combined_json_directory / all_combined_json_file_name
with open(file_path, "w") as file:
        json.dump(mapped_data_list, file, indent=4)
        
        
one_to_one_mapped_json_file_name = "Nucleotide_SRA_one_to_one_mapped_data.json"
file_path = combined_json_directory / one_to_one_mapped_json_file_name
with open(file_path, "w") as file:
        json.dump(mapped_data_list, file, indent=4)