# SQL to CSV 

### Selecting the key columns needed to extract the junction_aa sequence data along with the accompanying clinical factors.

This notebook is run once per subject: the papermill pipeline notebook loops over the subjects and executes this notebook for each one.

Each run of the papermill pipeline generates a dataframe for one subject, which is written to `output/df_subject.csv`.

In [1]:
# load needed libraries
import sqlite3
import os
import pandas as pd

In [2]:
# Parameter cell for papermill: the values below are the defaults for this notebook.
# NOTE(review): neither `alpha` nor `ratio` is referenced by any other cell shown
# here -- presumably they are consumed elsewhere in the pipeline; confirm before removing.
alpha = 0.6
ratio = 0.1

In [4]:
#Connects to local DB instance - created from running /extract_to_sql/create_sqlitedb.sh 
#This shell script indexed the input tables for fast querying of the data.
def connect(db_path='/media/teamcovidtest/PNY512/fdacovid3.db'):
    '''Open and return a new connection to the SQLite database.

    Args:
        db_path: Path of the SQLite database file. Defaults to the local
            database file used by this notebook; pass another path (or
            ':memory:') to work against a different database.

    Returns:
        sqlite3.Connection: a newly opened connection. The caller is
        responsible for closing it.
    '''
    # NOTE: sqlite3.connect creates an empty database file when the path does
    # not exist, so a wrong path only surfaces later as a "no such table" error.
    return sqlite3.connect(db_path)

In [5]:
# Open a module-level connection to the SQLite database.
# NOTE(review): no later cell shown here reads `conn` (getdata_subject calls
# connect() itself) and it is never closed -- confirm whether it is still needed.
conn = connect() #Creates connection to SQLite DB.

In [6]:
# Read the subject identifiers from the metadata file (path is relative to the
# working directory) and keep only the `subject_id` column as a pandas Series.
# getdata_subject() indexes this Series with `subject_order` to pick one subject.
# This is read to prepare the data for R notebook analysis by subject_id.
subject = pd.read_csv("metadata.csv")['subject_id']

In [7]:
# Assemble the query: join seqtable and metadata on the sample_processing_id column
# and keep only the rows of one subject.
# The single `?` placeholder is bound to a subject_id by getdata_subject() via `params`.
# The literal `\n` escapes inside the string only add extra blank lines to the SQL text.
# NOTE(review): `subject_id` in the WHERE clause is unqualified -- presumably only
# `metadata` has a column of that name; confirm against the schema.
query = '''SELECT sequence_id, junction_aa, \n
seqtable.sample_processing_id, metadata.subject_id
FROM seqtable \n
INNER JOIN metadata on metadata.sample_processing_id = seqtable.sample_processing_id \n
WHERE subject_id = (?)
'''

In [8]:
#The creation of a pandas dataframe from the database.
def getdata_subject(query, subject_order):
    '''Run `query` for a single subject and return the result as a DataFrame.

    Args:
        query: SQL text with one `?` placeholder; the selected subject
            identifier is bound to it.
        subject_order: Key used to look up the subject identifier in the
            module-level `subject` Series (`subject[subject_order]`).

    Returns:
        pandas.DataFrame holding the rows returned by `query`.
    '''
    subject_id = subject[subject_order]

    # Open a dedicated connection and always close it, even if the query
    # fails, so repeated calls do not leak open database handles.
    db_conn = connect()
    try:
        return pd.read_sql_query(query, db_conn, params=[subject_id])
    finally:
        db_conn.close()

In [9]:
# Restore the variable `subject_order` from IPython's %store database (`-r` reloads stored variables).
# NOTE(review): presumably the papermill pipeline notebook stored it with
# `%store subject_order` before executing this notebook -- confirm.
%store -r subject_order

In [10]:
# Fetch the rows of one subject as a DataFrame: `subject_order` selects the entry of
# `subject` whose subject_id is bound to the query's `?` placeholder.
df_subject = getdata_subject(query, subject_order ) 


In [11]:
# Make sure the target directory exists; to_csv raises OSError when it is missing.
os.makedirs("output", exist_ok=True)
# Write the subject's rows to a fixed path, so each run overwrites the previous file.
# The DataFrame index is written as the first (unnamed) column (pandas default);
# this is left unchanged because downstream readers may rely on that layout.
df_subject.to_csv("output/df_subject.csv")