## Find the sequences I need

### Find all sequences flagged as non-cytoplasmic (`Cytoplasm == 'No'`) to check their transit peptides

In [24]:
import pandas as pd

# Collect the IDs of every sequence whose 'Cytoplasm' call is 'No'.
threshold_df = pd.read_csv('localisation-pH_accuracy_full_data.csv')
seq_lst = threshold_df.loc[threshold_df['Cytoplasm'] == 'No', 'seq_id'].to_list()

# Keep only those sequences from the working index, and only the columns
# that the later cells read.
wrk_index = (
    pd.read_csv('wrk_index2.csv')
    .loc[
        lambda index_df: index_df['Unid'].isin(seq_lst),
        [
            'PACid', 'Unid', 'Species', 'Datasource',
            'Header', 'Sequence', 'Onekp_index_id', 'Scaffold',
            'Subject Seq-id',
        ],
    ]
)

In [34]:
# Display the filtered working index to sanity-check the selection.
wrk_index

Unnamed: 0,PACid,Unid,Species,Datasource,Header,Sequence,Onekp_index_id,Scaffold,Subject Seq-id
4,,97558,Nymphaea_sp.,Onekp,>97558 gnl|onekp|PZRT_scaffold_2006360 Nelumbo...,KYDRELDVAVRVVQLACSLCQRVQDRLVQNKEQVKSKEDHSLVTVA...,PZRT,2006360,gnl|onekp|PZRT_scaffold_2006360
6,,49849,Botryococcus_terribilis,Onekp,>49849 gnl|onekp|QYXY_scaffold_2046297 Botryoc...,YATELESAIRAVRLAAVLCETVQLKLQAGEFAEKGDASPVTIADYG...,QYXY,2046297,gnl|onekp|QYXY_scaffold_2046297
7,,76363,Bryopsis_plumosa,Onekp,>76363 gnl|onekp|JTIG_scaffold_2030560 Bryopsi...,RHQTVWMERPYMRAMQTNSSVAEQSGAAMEALDELKSGCEAVRLAS...,JTIG,2030560,gnl|onekp|JTIG_scaffold_2030560
8,,19421,Chaetopeltis_orbicularis,Onekp,>19421 gnl|onekp|BAZF_scaffold_2072188 Chaetop...,ESAKHGRALEAARRAVRLASKLCRKVQLQLGAEERQDKQDASPVTV...,BAZF,2072188,gnl|onekp|BAZF_scaffold_2072188
9,,86642,Chlamydomonas_bilatus,Onekp,>86642 gnl|onekp|OVHR_scaffold_3010360 Chlamyd...,HASIKSVKDIGDDAPYAKQLEQGCKAVRLAAKLCQVVQKQLGDSEK...,OVHR,3010360,gnl|onekp|OVHR_scaffold_3010360
...,...,...,...,...,...,...,...,...,...
499,,34845,Onychonema_laeve,Onekp,>34845 gnl|onekp|GGWH_scaffold_2005072 Onychon...,LEVAVNIVELACWLTERVQAQLRLAEETANTKADKSFVTLADYGVQ...,GGWH,2005072,gnl|onekp|GGWH_scaffold_2005072
500,,55808,Penium_exiguum,Onekp,>55808 gnl|onekp|YSQT_scaffold_2036623 Penium_...,MQIAEKAVQLACWLTQRVQQQLRKQEESAQSKADKSFVTVADYGVQ...,YSQT,2036623,gnl|onekp|YSQT_scaffold_2036623
501,,61527,Staurastrum_sebaldi,Onekp,>61527 gnl|onekp|ISHC_scaffold_2046236 Stauras...,MKVAERVVQLSCWLTERVQAQLRAAEEAADTKADKSFVTVADYGVQ...,ISHC,2046236,gnl|onekp|ISHC_scaffold_2046236
503,,37245,Staurodesmus_omearii,Onekp,>37245 gnl|onekp|RPRU_scaffold_2005579 Staurod...,MRVAEKAVQLACWLTQRVQAQLRKEEEQAHSKADKSFVTAADYGVQ...,RPRU,2005579,gnl|onekp|RPRU_scaffold_2005579


In [31]:
# Split the working index by the database each sequence was drawn from.
onekp_df = wrk_index[wrk_index['Datasource'].eq('Onekp')]
phyto_df = wrk_index[wrk_index['Datasource'].eq('Phytozome')]

In [43]:
# Keep only the columns that are meaningful for each data source.
onekp_df = onekp_df[['Unid', 'Species', 'Header', 'Sequence', 'Onekp_index_id', 'Scaffold', 'Subject Seq-id']]
phyto_df = phyto_df[['PACid', 'Unid', 'Species', 'Header', 'Sequence']]

# The assembly downloader is keyed on sample codes (e.g. 'PZRT'), which are
# stored in 'Onekp_index_id'; 'Unid' is a per-sequence numeric ID and matches
# no assembly directory. Several sequences can come from the same sample, so
# drop missing values and duplicates (first-seen order is preserved).
sample_lst = onekp_df['Onekp_index_id'].dropna().drop_duplicates().to_list()

In [45]:
# Persist the sample list to disk, one entry per line.
with open('sample_lst.txt', 'w') as handle:
    handle.writelines(f'{sample}\n' for sample in sample_lst)


## Download the assemblies

In [44]:
import subprocess
from pathlib import Path

def _run_icommand(cmd):
    """Run an iRODS iCommand and return its CompletedProcess (output captured as text).

    Raises:
        FileNotFoundError: If the executable named by ``cmd[0]`` is not on PATH.
            The message names the missing prerequisite instead of the bare
            ``[Errno 2]`` that ``subprocess`` reports.
    """
    try:
        return subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"'{cmd[0]}' was not found on PATH; install the iRODS iCommands "
            "(and run 'iinit') before downloading assemblies"
        ) from exc


def download_soap_assemblies(sample_ids, base_dir="./downloaded"):
    """
    Download SOAPdenovo-Trans assembly files for specified sample IDs.

    The CyVerse collection is listed once with ``ils``. Each sample ID is then
    matched to the first sub-collection whose name starts with
    ``"<sample_id>-"``, and ``<sample_id>-SOAPdenovo-Trans-assembly.fa.bz2``
    is fetched from it with ``iget``. Listing and download failures are
    printed; a failed sample is skipped and the remaining ones are still tried.

    Args:
        sample_ids (list): List of sample IDs (e.g., ['AFLV']). Duplicates are
            downloaded only once, in first-seen order.
        base_dir (str): Local directory to store downloads

    Raises:
        FileNotFoundError: If ``ils`` or ``iget`` is not installed / not on PATH.
    """
    # Create download directory
    Path(base_dir).mkdir(parents=True, exist_ok=True)

    # Each assembly only needs fetching once, however many sequences share it.
    unique_ids = list(dict.fromkeys(sample_ids))
    if not unique_ids:
        return

    # Base path in CyVerse
    cyverse_base = "/iplant/home/shared/commons_repo/curated/oneKP_capstone_2019/transcript_assemblies"

    # The listing does not depend on the sample, so fetch it once up front.
    listing = _run_icommand(['ils', cyverse_base])
    if listing.returncode != 0:
        print(f"Error listing directory {cyverse_base}: {listing.stderr}")
        return

    # ils marks sub-collections with a "C- " prefix followed by their path;
    # keep only the final path component, which is the directory name.
    collection_names = [
        entry[len("C- "):].strip().rstrip('/').rsplit('/', 1)[-1]
        for entry in (line.strip() for line in listing.stdout.splitlines())
        if entry.startswith("C- ")
    ]

    for sample_id in unique_ids:
        # Find the matching directory
        prefix = f"{sample_id}-"
        matching_dir = next(
            (name for name in collection_names if name.startswith(prefix)),
            None,
        )

        if not matching_dir:
            print(f"No directory found for sample ID {sample_id}")
            continue

        # Construct paths
        source_path = f"{cyverse_base}/{matching_dir}/{sample_id}-SOAPdenovo-Trans-assembly.fa.bz2"
        dest_path = f"{base_dir}/{sample_id}-SOAPdenovo-Trans-assembly.fa.bz2"

        # iget flags: -P reports progress, -T renews the socket connection on
        # long transfers, -K verifies the checksum of the downloaded file.
        cmd = ['iget', '-PT', '-K', source_path, dest_path]
        print(f"Downloading {source_path}...")

        result = _run_icommand(cmd)
        if result.returncode != 0:
            print(f"Error downloading {sample_id}: {result.stderr}")
        else:
            print(f"Successfully downloaded {sample_id}")

# Run the download for the collected samples when this cell is executed directly.
if __name__ == "__main__":
    download_soap_assemblies(sample_lst, "./downloaded")

FileNotFoundError: [Errno 2] No such file or directory: 'ils'