In [7]:
import requests
import zipfile
import tarfile
import os
import re
import tqdm
import glob
import gdown
import zipfile

In [8]:
def unzip_file(zip_file, extract_to):
    """Unpack every member of a zip archive into a directory.

    Parameters
    ----------
    zip_file
        Path of the zip archive to read.
    extract_to
        Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_file, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the archive handle, even if extraction fails.
        archive.close()

In [9]:
# Create the output directory tree used by the rest of the notebook.
# os.makedirs builds the intermediate directories (./new_raw_data and
# ./new_raw_data/expression) as well, and exist_ok=True makes the cell safe
# to re-run without a separate existence check for each level.
os.makedirs('./new_raw_data/expression/All_pair_sample_keys/', exist_ok=True)

Get the fasta file for the CDSs of the 2021 genome

In [3]:
# Stream the CDS FASTA to disk. The `with` block releases the connection even
# if the download fails part-way.
with requests.get('https://tet.ciliate.org/common/downloads/tet/3-upd-cds-fasta-2021.fasta', stream=True) as r:
    # Stop here on an HTTP error instead of saving the error page as FASTA.
    r.raise_for_status()
    with open('./new_raw_data/Tthermophila_MAC_CDS_2021.fasta', 'wb') as f:
        # 1 MiB chunks: far fewer write calls than 128-byte chunks.
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

Get the fasta file for the protein sequences of the 2021 genome

In [3]:
# Stream the protein FASTA to disk. The `with` block releases the connection
# even if the download fails part-way.
with requests.get('https://tet.ciliate.org/common/downloads/tet/4-upd-Protein-fasta-2021.fasta', stream=True) as r:
    # Stop here on an HTTP error instead of saving the error page as FASTA.
    r.raise_for_status()
    with open('./new_raw_data/Tthermophila_MAC_protein_2021.fasta', 'wb') as f:
        # 1 MiB chunks: far fewer write calls than 128-byte chunks.
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

Get the .ndf file (downloaded from GEO as part of the GSE11300 archive)

In [4]:
# Request the full GSE11300 archive from GEO; the body is streamed and is
# written to disk by the following cell.
r = requests.get('https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE11300&format=file', stream=True)
# Fail now rather than saving an HTTP error page as a .tar file.
r.raise_for_status()

In [5]:
# Write the streamed response body held in `r` to disk.
with open('./new_raw_data/GSE11300.tar', 'wb') as f:
    # 1 MiB chunks: far fewer write calls than 128-byte chunks.
    for chunk in r.iter_content(chunk_size=1024 * 1024):
        f.write(chunk)

In [6]:
# Unpack the GSE11300 archive. The `with` block closes the tar file even if
# extraction raises.
with tarfile.open('./new_raw_data/GSE11300.tar') as tar_folder:
    if hasattr(tarfile, 'data_filter'):
        # Extraction filters are supported: the 'data' filter rejects members
        # that would be written outside the destination directory.
        tar_folder.extractall(path='./new_raw_data/GSE11300/', filter='data')
    else:
        # Older Python without extraction filters.
        tar_folder.extractall(path='./new_raw_data/GSE11300/')

Get the raw microarray expression data

In [16]:
def download_raw_data(url, name):
    """Download a file into ``./new_raw_data/`` and unpack it if it is a tar archive.

    The local file name is the ``GSE...`` name (ending in ``tar`` or ``gz``)
    found in the response's ``content-disposition`` header. Files named
    ``*.tar`` or ``*.tar.gz`` are also extracted into
    ``./new_raw_data/<file name without the archive suffix>``.

    Parameters
    ----------
    url : str
        Address to download from.
    name : str
        Label for the dataset. It does not influence where the file is saved
        (the server-supplied file name is used); it only appears in the error
        message when no file name can be determined.

    Raises
    ------
    ValueError
        If the response carries no usable ``GSE...`` file name.
    Exception
        Whatever ``raise_for_status()`` raises for an unsuccessful HTTP
        response.
    """
    # The `with` block releases the connection even if something below fails.
    with requests.get(url, stream=True) as r:
        # Never save an HTTP error page as if it were data.
        r.raise_for_status()

        disposition = r.headers.get('content-disposition', '')
        fname_match = re.search("filename=(.+)", disposition)
        gse_match = (
            re.search(r'GSE.*(tar|gz)', fname_match.group(1))
            if fname_match else None
        )
        if gse_match is None:
            raise ValueError(
                f'Could not determine a GSE file name for {name!r} from '
                f'content-disposition header {disposition!r} ({url})'
            )
        # The name comes from the server: keep only its final path component
        # so it cannot point outside ./new_raw_data/.
        file_name = os.path.basename(gse_match.group())

        os.makedirs('./new_raw_data', exist_ok=True)
        local_path = f'./new_raw_data/{file_name}'
        with open(local_path, 'wb') as f:
            # 1 MiB chunks: far fewer write calls than 128-byte chunks.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Strip the whole archive suffix so "X.tar.gz" unpacks into "X",
    # exactly as "X.tar" does.
    archive_suffix = next(
        (suffix for suffix in ('.tar.gz', '.tar') if file_name.endswith(suffix)),
        None,
    )
    if archive_suffix is not None:
        extract_dir = f'./new_raw_data/{file_name[:-len(archive_suffix)]}'
        with tarfile.open(local_path) as tar_folder:
            if hasattr(tarfile, 'data_filter'):
                # The 'data' filter rejects members that would be written
                # outside the destination directory.
                tar_folder.extractall(path=extract_dir, filter='data')
            else:
                # Older Python without extraction filters.
                tar_folder.extractall(path=extract_dir)

def move_expression_files(data_files, target_dir='./new_raw_data/expression/'):
    """Move files into ``target_dir``, keeping each file's base name.

    Parameters
    ----------
    data_files : iterable of str
        Paths of the files to move.
    target_dir : str
        Destination directory, with or without a trailing separator. It is
        created if it does not exist yet.
    """
    os.makedirs(target_dir, exist_ok=True)
    for f in data_files:
        # os.path.join inserts the separator only when needed, so a
        # target_dir without a trailing slash no longer yields paths like
        # "expressionfile.txt".
        os.rename(f, os.path.join(target_dir, os.path.basename(f)))

In [8]:
# Probe request for the C1 "samples"-format file; the following cells inspect
# this response's headers and status.
c1_samples_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE11300&format=file&file=GSE11300%5FRaw%5FC1%5FSamples%2Etxt%2Egz'
r = requests.get(c1_samples_url, stream=True)

In [9]:
# Read the server-suggested file name from the content-disposition header.
d = r.headers['content-disposition']
filename_pattern = re.compile("filename=(.+)")
fname = filename_pattern.findall(d)[0]

In [10]:
# Keep the GSE... part of the file name and drop its last three characters.
gse_match = re.search(r'GSE.*(tar|gz)', fname)
gse_match.group(0)[:-3]

'GSE11300_Raw_C1_Samples.txt'

In [13]:
# Show the HTTP status code of the response currently held in `r`.
r.status_code

200

In [14]:
# Download the complete GSE11300 archive through the shared helper.
download_raw_data(
    url='https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE11300&format=file',
    name='GSE11300',
)

In [15]:
# Dataset label -> GEO download URL for every raw dataset fetched below.
_GEO_DOWNLOAD_PREFIX = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc='
# Sample sets of the Miao GSE11300 submission, each shipped as one
# "samples"-format file.
_GSE11300_SAMPLE_SETS = ('C1', 'C2', 'L1', 'L2', 'L3', 'S1', 'S2', 'S3')

raw_data_url_dict = {
    # Pearlman submission, .pair formatted
    'GSE26650': _GEO_DOWNLOAD_PREFIX + 'GSE26650&format=file',
}
# Miao submission, one "samples"-format file per sample set
raw_data_url_dict.update(
    (
        'GSE11300' + sample_set,
        _GEO_DOWNLOAD_PREFIX
        + 'GSE11300&format=file&file=GSE11300%5FRaw%5F'
        + sample_set
        + '%5FSamples%2Etxt%2Egz',
    )
    for sample_set in _GSE11300_SAMPLE_SETS
)
raw_data_url_dict.update({
    # Miao submission, .pair formatted
    'GSE26384': _GEO_DOWNLOAD_PREFIX + 'GSE26384&format=file',
    # Liu submission, .pair formatted
    'GSE26385': _GEO_DOWNLOAD_PREFIX + 'GSE26385&format=file',
})

In [16]:
# Toggle IPython's automatic post-mortem debugger (a development aid).
# NOTE(review): while this is ON, an exception in a later cell opens an
# interactive pdb prompt; consider removing it before unattended runs.
%pdb

Automatic pdb calling has been turned ON


In [17]:
# Download every dataset listed in raw_data_url_dict, with a progress bar.
download_queue = tqdm.tqdm(raw_data_url_dict.items())
for name, url in download_queue:
    download_raw_data(url=url, name=name)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:56<00:00,  5.11s/it]


In [12]:
# Fetch the zipped archive from Google Drive into ./new_raw_data/.
gdrive_url = 'https://drive.google.com/uc?id=1YkRZlYmfkq1m3oJ-zSJFTTZKT70HKCMA'
gdrive_output = './new_raw_data/all_pair.zip'

gdown.download(url=gdrive_url, output=gdrive_output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1YkRZlYmfkq1m3oJ-zSJFTTZKT70HKCMA
From (redirected): https://drive.google.com/uc?id=1YkRZlYmfkq1m3oJ-zSJFTTZKT70HKCMA&confirm=t&uuid=e7b5d1a9-101b-42aa-8c91-70ee79595fd7
To: /Users/michaelbertagna/git/TGNE-2022/new_raw_data/all_pair.zip
100%|██████████| 157M/157M [00:11<00:00, 13.1MB/s] 


'./new_raw_data/all_pair.zip'

In [13]:
# Unpack the downloaded all_pair.zip into its own directory.
zip_file_path = './new_raw_data/all_pair.zip'
extract_to_directory = './new_raw_data/all_pair'

unzip_file(zip_file=zip_file_path, extract_to=extract_to_directory)

In [14]:
# (pattern, recursive) pairs, searched in this order, that together locate
# every expression data file to be moved into ./new_raw_data/expression/.
expression_file_patterns = [
    ('./new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/*.txt', False),
    ('./new_raw_data/*.txt.gz', False),
    ('./new_raw_data/**/*pair.txt.gz', True),
    ('./new_raw_data/**/*pair.gz', True),
]
data_files = [
    path
    for pattern, recursive in expression_file_patterns
    for path in glob.glob(pattern, recursive=recursive)
]
data_files

['./new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/All_pair-S2,S3.txt',
 './new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/All_pair-L1,L2.txt',
 './new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/All_pair-L3,S1.txt',
 './new_raw_data/all_pair/Microarray_Aaron/un-normalized-raw_data/All_pair-C1,C2.txt',
 './new_raw_data/expression/GSM656231_4257502_532_pair.txt.gz',
 './new_raw_data/expression/GSM656232_4257702_532_pair.txt.gz',
 './new_raw_data/expression/GSM656237_4258302_532_pair.txt.gz',
 './new_raw_data/expression/GSM656239_4261302_532_pair.txt.gz',
 './new_raw_data/expression/GSM656234_4257802_532_pair.txt.gz',
 './new_raw_data/expression/GSM656240_4261102_532_pair.txt.gz',
 './new_raw_data/expression/GSM647653_13401502_532_pair.txt.gz',
 './new_raw_data/expression/GSM647654_13401702_532_pair.txt.gz',
 './new_raw_data/expression/GSM656236_4257902_532_pair.txt.gz',
 './new_raw_data/expression/GSM656238_4259002_532_pair.txt.gz',
 './new_ra

In [17]:
# Move the collected expression files into the helper's default target directory.
move_expression_files(data_files=data_files)

In [18]:
# Locate the SampleKey text files shipped inside the extracted archive.
sample_key_pattern = './new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey*.txt'
All_pair_sample_key_files = glob.glob(sample_key_pattern)
All_pair_sample_key_files

['./new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey-C1 and C2.txt',
 './new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey-S2 and S3.txt',
 './new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey-L3 and S1.txt',
 './new_raw_data/all_pair/Microarray_Aaron/normalized_for_each_probe_data/SampleKey-L1 and L2.txt']

In [19]:
# Move the sample-key files into their own folder. This reuses
# move_expression_files (defined earlier in the notebook) instead of repeating
# its rename loop with a different hard-coded destination.
move_expression_files(
    All_pair_sample_key_files,
    target_dir='./new_raw_data/expression/All_pair_sample_keys/',
)