In [1]:
import os
import tqdm
import glob
import gdown

from utils import file_utils

In [2]:
# Output directory tree used by the rest of this notebook, parent first.
for output_dir in (
    './new_raw_data/',
    './new_raw_data/expression',
    './new_raw_data/expression/All_pair_sample_keys/',
):
    file_utils.create_directories(output_dir)

Download the FASTA file of coding sequences (CDSs) for the 2021 genome

In [3]:
# Remote CDS FASTA and the local path it is saved to.
cds_fasta_url = 'https://tet.ciliate.org/common/downloads/tet/legacy/3-upd-cds-fasta-2021.fasta'
cds_fasta_path = './new_raw_data/Tthermophila_MAC_CDS_2021.fasta'

file_utils.download_file_chunks(cds_fasta_url, cds_fasta_path)

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/Tthermophila_MAC_CDS_2021.fasta


Download the FASTA file of protein sequences for the 2021 genome

In [4]:
# Remote protein FASTA and the local path it is saved to.
protein_fasta_url = 'https://tet.ciliate.org/common/downloads/tet/legacy/4-upd-Protein-fasta-2021.fasta'
protein_fasta_path = './new_raw_data/Tthermophila_MAC_protein_2021.fasta'

file_utils.download_file_chunks(protein_fasta_url, protein_fasta_path)

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/Tthermophila_MAC_protein_2021.fasta


Get the .ndf file (download the GSE11300 archive from GEO, then extract it)

In [5]:
# GEO series archive for GSE11300 and the local tar file it is saved as.
gse11300_archive_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE11300&format=file'
gse11300_archive_path = './new_raw_data/GSE11300.tar'

file_utils.download_file_chunks(gse11300_archive_url, gse11300_archive_path)

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300.tar


In [6]:
# Unpack the GSE11300 archive that the previous download saved.
gse11300_tar = './new_raw_data/GSE11300.tar'
file_utils.extract_tar(gse11300_tar)

Get the raw reads

In [7]:
def move_expression_files(data_files, target_dir='./new_raw_data/expression/'):
    """Move every file in ``data_files`` into ``target_dir``, keeping its base name.

    Parameters
    ----------
    data_files : iterable of str
        Paths of the files to move.
    target_dir : str, optional
        Destination directory. It may be given with or without a trailing
        path separator.

    Notes
    -----
    The move is done with ``os.rename``, so the destination directory must
    already exist.
    """
    for source_path in data_files:
        # os.path.join supplies the separator when target_dir has no trailing
        # slash; plain concatenation would fuse the directory name and the
        # file name into a single path component.
        destination = os.path.join(target_dir, os.path.basename(source_path))
        os.rename(source_path, destination)

In [8]:
# NOTE(review): this is the same URL that was already downloaded to
# ./new_raw_data/GSE11300.tar earlier in the notebook; the recorded output shows
# this call saving GSE11300_RAW.tar. Confirm that both copies are needed.
gse11300_geo_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE11300&format=file'
file_utils.download_geo_data_file(gse11300_geo_url, 'GSE11300')

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_RAW.tar


In [9]:
# GEO download endpoint shared by every entry below.
_geo_download = 'https://www.ncbi.nlm.nih.gov/geo/download/'

# Maps a dataset label to its GEO download URL. Insertion order is the download order.
#   GSE26650  - Pearlman submission, .pair formatted
#   GSE11300* - Miao submission, one "samples"-format file per label (C1 ... S3)
#   GSE26384  - Miao submission, .pair formatted
#   GSE26385  - Liu submission, .pair formatted
raw_data_url_dict = {
    'GSE26650': f'{_geo_download}?acc=GSE26650&format=file',
    **{
        f'GSE11300{label}': (
            f'{_geo_download}?acc=GSE11300&format=file'
            f'&file=GSE11300%5FRaw%5F{label}%5FSamples%2Etxt%2Egz'
        )
        for label in ('C1', 'C2', 'L1', 'L2', 'L3', 'S1', 'S2', 'S3')
    },
    'GSE26384': f'{_geo_download}?acc=GSE26384&format=file',
    'GSE26385': f'{_geo_download}?acc=GSE26385&format=file',
}

In [10]:
# Download each entry of raw_data_url_dict in order, with a progress bar over the entries.
geo_entries = tqdm.tqdm(raw_data_url_dict.items())
for name, url in geo_entries:
    file_utils.download_geo_data_file(url, name)

  9%|▉         | 1/11 [00:16<02:45, 16.55s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE26650_RAW.tar


 18%|█▊        | 2/11 [00:21<01:29,  9.99s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_Raw_C1_Samples.txt.gz


 27%|██▋       | 3/11 [00:27<01:04,  8.12s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_Raw_C2_Samples.txt.gz


 36%|███▋      | 4/11 [00:29<00:38,  5.52s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_Raw_L1_Samples.txt.gz


 45%|████▌     | 5/11 [00:30<00:23,  3.94s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_Raw_L2_Samples.txt.gz


 55%|█████▍    | 6/11 [00:32<00:16,  3.26s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_Raw_L3_Samples.txt.gz


 64%|██████▎   | 7/11 [00:35<00:12,  3.22s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_Raw_S1_Samples.txt.gz


 73%|███████▎  | 8/11 [00:37<00:08,  2.80s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_Raw_S2_Samples.txt.gz


 82%|████████▏ | 9/11 [00:39<00:05,  2.71s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE11300_Raw_S3_Samples.txt.gz


 91%|█████████ | 10/11 [00:51<00:05,  5.41s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE26384_RAW.tar


100%|██████████| 11/11 [01:01<00:00,  5.59s/it]

FILE SAVED: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/GSE26385_RAW.tar





In [11]:
# Google Drive source and local destination for the all_pair archive.
gdrive_url = 'https://drive.google.com/uc?id=1YkRZlYmfkq1m3oJ-zSJFTTZKT70HKCMA'
gdrive_output = './new_raw_data/all_pair.zip'

# Kept as the last expression so the call's return value is displayed as the cell result.
gdown.download(url=gdrive_url, output=gdrive_output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1YkRZlYmfkq1m3oJ-zSJFTTZKT70HKCMA
From (redirected): https://drive.google.com/uc?id=1YkRZlYmfkq1m3oJ-zSJFTTZKT70HKCMA&confirm=t&uuid=e459c552-9ca3-46f6-8768-4334d4071844
To: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/all_pair.zip
100%|██████████| 157M/157M [00:15<00:00, 9.93MB/s] 


'./new_raw_data/all_pair.zip'

In [12]:
# Path of the all_pair archive (the same path assigned to gdrive_output).
zip_file_path = './new_raw_data/all_pair.zip'

# Directory handed to unzip_file as the extraction destination.
extract_to_directory = './new_raw_data/all_pair'

file_utils.unzip_file(zip_file_path, extract_to_directory)

In [13]:
# Collect every expression file to relocate, in this order:
#   1. un-normalized all_pair text files from the unzipped archive
#   2. *.txt.gz files sitting directly in new_raw_data/
#   3. *pair.txt.gz files anywhere under new_raw_data/
#   4. *pair.gz files anywhere under new_raw_data/
data_files = [
    *glob.glob('./new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/*.txt'),
    *glob.glob('./new_raw_data/*.txt.gz'),
    *glob.glob('./new_raw_data/**/*pair.txt.gz', recursive=True),
    *glob.glob('./new_raw_data/**/*pair.gz', recursive=True),
]
data_files

['./new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/All_pair-S2,S3.txt',
 './new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/All_pair-L1,L2.txt',
 './new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/All_pair-L3,S1.txt',
 './new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/All_pair-C1,C2.txt',
 './new_raw_data/GSE11300_Raw_C1_Samples.txt.gz',
 './new_raw_data/GSE11300_Raw_L1_Samples.txt.gz',
 './new_raw_data/GSE11300_Raw_L3_Samples.txt.gz',
 './new_raw_data/GSE11300_Raw_C2_Samples.txt.gz',
 './new_raw_data/GSE11300_Raw_L2_Samples.txt.gz',
 './new_raw_data/GSE11300_Raw_S2_Samples.txt.gz',
 './new_raw_data/GSE11300_Raw_S3_Samples.txt.gz',
 './new_raw_data/GSE11300_Raw_S1_Samples.txt.gz',
 './new_raw_data/GSE26650_RAW/GSM656231_4257502_532_pair.txt.gz',
 './new_raw_data/GSE26650_RAW/GSM656232_4257702_532_pair.txt.gz',
 './new_raw_data/GSE26650_RAW/GSM656237_4258302_532_pair.txt.gz',
 './new_raw_data/GSE26650_RAW/GSM656239_4261302_532_pa

In [14]:
# Relocate every collected expression file into the expression directory.
expression_dir = './new_raw_data/expression/'
file_utils.move_files(data_files, expression_dir)

In [15]:
# SampleKey*.txt files from the unzipped all_pair archive.
sample_key_pattern = './new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey*.txt'
All_pair_sample_key_files = glob.glob(sample_key_pattern)
All_pair_sample_key_files

['./new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey-C1 and C2.txt',
 './new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey-S2 and S3.txt',
 './new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey-L3 and S1.txt',
 './new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey-L1 and L2.txt']

In [16]:
# Relocate the sample-key files into their own sub-directory of expression/.
sample_key_dir = './new_raw_data/expression/All_pair_sample_keys/'
file_utils.move_files(All_pair_sample_key_files, sample_key_dir)