In [1]:
from dotenv import dotenv_values
import snowflake.connector
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, Latex


In [2]:
# there is probably a better way to do this. But this works for now
data_tables = ['AD','ELITE', 'GENIE', 'HTAN', 'NF', 'PSYCHENCODE']
table_names = ['SAGE.PORTAL_RAW.' + d for d in data_tables]
table_names

['SAGE.PORTAL_RAW.AD',
 'SAGE.PORTAL_RAW.ELITE',
 'SAGE.PORTAL_RAW.GENIE',
 'SAGE.PORTAL_RAW.HTAN',
 'SAGE.PORTAL_RAW.NF',
 'SAGE.PORTAL_RAW.PSYCHENCODE']

In [4]:
config = dotenv_values("../.env")
conn = snowflake.connector.connect(
    user=config['USER'],
    account=config['ACCOUNT_IDENTIFIER'],
    authenticator="externalbrowser", # FOR browser-based SSO for authentication since account uses Google account for login. It is organization-username
    warehouse=config['WAREHOUSE'],
    database=config['DATABASE'],
    role=config['ROLE'],
    login_timeout = 60,
    network_timeout=30,
    socket_timeout=10
)

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://accounts.google.com/o/saml2/idp?idpid=C030aw0es&SAMLRequest=lZLRbtowGIVfJfKuEzsJWzuLgBiMNRLdGIRJ250Tm%2BDi2MF2CPTp54RG6i5aaReRIvsc%2B%2Ft9znh6qYR3ZtpwJRMQBgh4TBaKclkmYJct%2FXvgGUskJUJJloArM2A6GRtSiRrPGnuQG3ZqmLGeO0ga3G8koNESK2K4wZJUzGBb4O3scYWjAOFaK6sKJcAry%2FsOYgzT1hEOFmq4wztYW2MI27YN2jhQuoQRQgiiz9CpOsmHQX9xM72hDyEadXqncPL1C9sXLm9P8B5WfhMZ%2FJBla3%2F9Y5sBbzagzpU0TcX0lukzL9hus7oBGEdQnZ73B0H9c%2B1441FgpGr3ghxZoaq6se7MwP3BPaNQqJK7sdNFAuojp982x%2FQp3K132fwU5ZV4uvzO7%2BPlz8bWeb5Kr5l62JBl%2Fvz1MCqA92vINepyTY1pWCq7NK1bQlHsh5EfjrLwDocfMYqD%2BO7TH%2BAtXJpcEts7B2RSFKqR1gSlUqVgPZ%2BCXXQR5LSeuo%2FTZI5iRFrkOnKrCO7v1JP%2FG3wMX3tfuvbdPX%2B6WCvBi6u3VLoi9u10wiDsVzj1970Us4pwMaNUM2NcSkKodq4Zsa7SVjcMwMnt1n9LPfkL&RelayState=50803&SigAlg=http%3A%2F%2Fwww

In [5]:
# create cursor
cur = conn.cursor()

# Try to join all the tables together

sf_tables = {}

for t in table_names: 
    query = f"""
        SELECT * FROM {t}
    """

    cur.execute(query)

    # Retrieve results
    df = pd.concat([d for d in cur.fetch_pandas_batches()])
    df = df.reset_index(drop=True)
    df['TABLE'] = t
    sf_tables[t] = df

cur.close()

True

In [6]:
comb_df = pd.concat(sf_tables.values()).reset_index(drop=True)

original_shape = comb_df.shape
original_cols = sorted(comb_df.columns)
comb_df = comb_df.dropna(how='all', axis =1)
# removing empty lists and changing all nonetypes to nans
comb_df = comb_df.replace('[]', np.nan).fillna(value=np.nan)
# cleanup lists and values for new lines, double spaces and quotes
comb_df = comb_df.apply(lambda x: x.str.replace('\n|\s+|"', '', regex = True), axis = 1)
# drop empty columns
comb_df = comb_df.dropna(how='all', axis =1)
comb_df = comb_df[sorted(comb_df.columns)]

display(Markdown(f"""
|ORIGINAL|COMBINED|
|---|---|
|{original_shape}|{comb_df.shape}|
"""))


|ORIGINAL|COMBINED|
|---|---|
|(524392, 111)|(524392, 88)|


In [7]:
comb_df.head()

Unnamed: 0,ACCESSTYPE,AGE,ALIGNMENTMETHOD,ANALYSISTYPE,ASSAY,ASSAYTARGET,BENEFACTORID,BRODMANNAREA,CELLTYPE,CHROMOSOME,...,STUDYID,STUDYNAME,TABLE,TERMINALDIFFERENTIATIONPOINT,TISSUE,TRANSPLANTATIONTYPE,TREATMENTTYPE,TUMORTYPE,TYPE,VERSION
0,,,,genotypeimputation,[snpArray],,syn5550382,,,,...,,,SAGE.PORTAL_RAW.AD,,,,,,,
1,,,,genotypeimputation,[snpArray],,syn5550382,,,,...,,,SAGE.PORTAL_RAW.AD,,,,,,,
2,,,,genotypeimputation,[snpArray],,syn5550382,,,,...,,,SAGE.PORTAL_RAW.AD,,,,,,,
3,,,,genotypeimputation,[snpArray],,syn5550382,,,,...,,,SAGE.PORTAL_RAW.AD,,,,,,,
4,,,,genotypeimputation,[snpArray],,syn5550382,,,,...,,,SAGE.PORTAL_RAW.AD,,,,,,,


In [11]:
from datetime import datetime

In [8]:
comb_df.to_csv(f"../data/portal-data-raw-{datetime.now().strftime('%Y%m%d')}.csv")

In [9]:
# pull out only columns found in all tables
cols = {}

for k,v in sf_tables.items(): 
    cols[k] = set(v.columns)

# u = set.intersection(*cols.values()) # HTAN does not have any additional columns at the moment. So this is not useful

In [10]:
cols

{'SAGE.PORTAL_RAW.AD': {'ANALYSISTYPE',
  'ASSAY',
  'ASSAYTARGET',
  'BENEFACTORID',
  'CELLTYPE',
  'CHROMOSOME',
  'CONSORTIUM',
  'CREATEDBY',
  'CREATEDON',
  'CURRENTVERSION',
  'DATAFILEHANDLEID',
  'DATAFILESIZEBYTES',
  'DATASUBTYPE',
  'DATATYPE',
  'FILEFORMAT',
  'GRANTS',
  'GROUPS',
  'ID',
  'INDIVIDUALID',
  'INDIVIDUALIDSOURCE',
  'ISCONSORTIUMANALYSIS',
  'ISMODELSYSTEM',
  'ISMULTISPECIMEN',
  'LIBRARYPREP',
  'METABOLITETYPE',
  'METADATATYPE',
  'MODELSYSTEMNAME',
  'MODELSYSTEMTYPE',
  'MODIFIEDBY',
  'MODIFIEDON',
  'NAME',
  'NUCLEICACIDSOURCE',
  'ORGAN',
  'PARENTID',
  'PROJECTID',
  'RESOURCETYPE',
  'SEX',
  'SPECIES',
  'SPECIMENID',
  'SPECIMENIDSOURCE',
  'STUDY',
  'TABLE',
  'TISSUE',
  'TREATMENTTYPE'},
 'SAGE.PORTAL_RAW.ELITE': {'ANALYSISTYPE',
  'ASSAY',
  'CONSENT',
  'CONSORTIUM',
  'CURRENTVERSION',
  'DATASUBTYPE',
  'DATATYPE',
  'ETAG',
  'FILEFORMAT',
  'GRANTS',
  'ID',
  'ISCONSORTIUMANALYSIS',
  'ISMODELSYSTEM',
  'ISMULTISPECIMEN',
  'LIB