In [1]:
import pandas as pd
from Bio import SeqIO
from pathlib import Path
from io import StringIO
import requests, sys, json
from tqdm import tqdm

ModuleNotFoundError: No module named 'Bio'

In [7]:
# Read the TAIR/UniProt mapping table (adjust path as needed). The file is
# tab-separated and has no header row, so the column names are supplied here.
df = pd.read_table(
    "TAIR2UniprotMapping.txt",
    header=None,
    names=["uniprot_id", "locus_id", "tair_id"],
)

# Plain-text preview of the first five rows.
print(df.head())

     uniprot_id       locus_id    tair_id
0  A0A0A7EPL0-2  locus:2035999  AT1G08910
1    A0A178UFM8  locus:2153097  AT5G51560
2    A0A178VEK7  locus:2103386  AT3G60460
3    A0A178WF56  locus:2205245  AT1G56060
4    A0A1I9LMX5  locus:2098695  AT3G50610


In [8]:
def load_high_conf_tair_ids_from_excel(filepath):
    r"""Collect high-confidence protein rows from every sheet of an Excel workbook.

    A sheet is used only if, after trimming whitespace from its column names,
    it has both a "Protein FDR Confidence: Combined" and an "Accession" column.
    Rows whose confidence value equals "high" (ignoring case and surrounding
    whitespace) are kept, and the first substring of "Accession" matching
    ``AT[1-5CM]G\d{5}`` is stored as "clean_tair_id".

    Parameters
    ----------
    filepath : str or path-like
        Workbook to open; passed straight to ``pd.ExcelFile``.

    Returns
    -------
    pd.DataFrame
        Columns "clean_tair_id" and "Description", with rows missing either
        value removed and exact duplicate rows dropped. When no sheet has the
        required columns, an empty frame with these two columns is returned.

    Raises
    ------
    KeyError
        If qualifying sheets exist but none of them has a "Description" column.
    """
    confidence_col = "Protein FDR Confidence: Combined"
    high_conf_frames = []

    # The context manager closes the workbook handle once all sheets are parsed.
    with pd.ExcelFile(filepath) as xls:
        for sheet in xls.sheet_names:
            sheet_df = xls.parse(sheet)
            sheet_df.columns = [str(c).strip() for c in sheet_df.columns]

            if confidence_col not in sheet_df.columns or "Accession" not in sheet_df.columns:
                continue

            # astype(str) keeps the .str accessor usable when the column holds
            # no strings at all (e.g. entirely blank cells read as NaN).
            confidence = sheet_df[confidence_col].astype(str).str.strip().str.lower()
            high_conf_frames.append(sheet_df[confidence == "high"])

    # No usable sheet: return an empty, correctly shaped result instead of
    # failing with KeyError on the missing "Accession" column.
    if not high_conf_frames:
        return pd.DataFrame(columns=["clean_tair_id", "Description"])

    # Concatenate once rather than growing a frame inside the loop.
    combined = pd.concat(high_conf_frames, ignore_index=True)
    combined["clean_tair_id"] = (
        combined["Accession"].astype(str).str.extract(r"(AT[1-5CM]G\d{5})", expand=False)
    )
    return combined[["clean_tair_id", "Description"]].dropna().drop_duplicates()

# Run the high-confidence extraction on each of the two workbooks.
fl_df, ml_df = [
    load_high_conf_tair_ids_from_excel(workbook)
    for workbook in ("FreeLumenal.xlsx", "MembraneAssociatedLumenal.xlsx")
]

# Stack the two tables with a fresh index, then drop rows present in both.
all_high_conf_df = pd.concat([fl_df, ml_df], ignore_index=True)
all_high_conf_df = all_high_conf_df.drop_duplicates()
all_high_conf_df.head()


Unnamed: 0,clean_tair_id,Description
0,AT5G23120,"photosystem II stability/assembly factor, chlo..."
1,AT4G09010,ascorbate peroxidase 4
2,ATCG00490,ribulose-bisphosphate carboxylases
3,AT1G06680,photosystem II subunit P-1
4,AT1G20020,ferredoxin-NADP(+)-oxidoreductase 2


In [9]:
# Read the headerless, tab-separated mapping file (adjust filename if needed).
mapping_df = pd.read_csv(
    "TAIR2UniprotMapping.txt",
    sep="\t",
    header=None,
    names=["uniprot_id", "locus_id", "tair_id"],
)

# Inner-join the high-confidence TAIR IDs onto the mapping, then keep only the
# first row seen for each UniProt ID.
merged_df = (
    all_high_conf_df
    .merge(mapping_df, left_on="clean_tair_id", right_on="tair_id", how="inner")
    .drop_duplicates(subset=["uniprot_id"])
)
merged_df[["uniprot_id", "tair_id", "Description"]].head()


Unnamed: 0,uniprot_id,tair_id,Description
0,O82660,AT5G23120,"photosystem II stability/assembly factor, chlo..."
1,A0A654G449,AT5G23120,"photosystem II stability/assembly factor, chlo..."
2,A0A1P8BG37,AT5G23120,"photosystem II stability/assembly factor, chlo..."
3,P82281,AT4G09010,ascorbate peroxidase 4
4,A0A1P8B8W6,AT4G09010,ascorbate peroxidase 4


In [33]:
# Keep the first row listed for each TAIR ID and renumber the index from 0.
merged_df_cleaned = merged_df.drop_duplicates(subset="tair_id", ignore_index=True)
merged_df_cleaned.loc[:, ["uniprot_id", "tair_id", "Description"]].tail()



Unnamed: 0,uniprot_id,tair_id,Description
799,P56777,ATCG00680,photosystem II reaction center protein B
800,Q501G5,AT5G03100,F-box/RNI-like superfamily protein
801,A0A5S9YI34,AT5G65750,"2-oxoglutarate dehydrogenase, E1 component"
802,A0A178VT66,AT2G33845,"Nucleic acid-binding, OB-fold-like protein"
803,A0A384KLD6,AT1G79640,Protein kinase superfamily protein


In [36]:
# Fetch the sequence of every selected UniProt entry from the UniProt REST API.
# Results are keyed by the primary accession reported in each response.
protein_sequences = {}

# Request settings do not change between iterations, so build them once.
base_url = "https://rest.uniprot.org/uniprotkb"
params = {"fields": "sequence"}  # ask only for the sequence field
headers = {"accept": "application/json"}

# A single Session reuses the HTTP connection across all requests.
with requests.Session() as session:
    session.headers.update(headers)
    for up_id in tqdm(merged_df_cleaned.uniprot_id, desc="UniProt sequences"):
        # The accession has to be part of the URL; a fixed URL would fetch the
        # same entry on every iteration and leave a single key in the dict.
        response = session.get(f"{base_url}/{up_id}", params=params, timeout=30)
        # Raises requests.HTTPError on a 4xx/5xx reply, which stops the cell.
        response.raise_for_status()

        data = response.json()
        protein_sequences[data["primaryAccession"]] = data["sequence"]["value"]


KeyboardInterrupt: 

In [None]:
# Show a count and a truncated preview rather than printing every full
# sequence, which would flood the notebook output.
print(f"{len(protein_sequences)} sequences fetched")
print({
    acc: f"{seq[:30]}... ({len(seq)} aa)"
    for acc, seq in list(protein_sequences.items())[:5]
})