Code to make an SQLite database from ABCD data download

https://www.sqlitetutorial.net/sqlite-python/creating-tables/

In [19]:
import sqlite3
import os 
from sqlite3 import Error
from pathlib import Path
import pandas as pd
import numpy as np
import logging

In [11]:
# Work from the ABCD project root: the loaders below open files through
# relative './data/...' paths. NOTE(review): machine-specific absolute path.
os.chdir(r"/Users/margotwagner/ucsd/research/ABCD")

In [2]:
# logging
# Change according to desired logging level (default is warning)
# logging.basicConfig(level=logging.INFO)

In [3]:
def create_connection(db_file):
    """ create a database connection to a SQLite database

    The database file is created as an empty file first if it does not
    exist yet.

    params:
        db_file:
            path of the SQLite database file to connect to

    return:
        connection object, or None if the file could not be created or the
        connection could not be opened (the error is logged)
    """
    conn = None
    try:
        # start database as empty file; a missing parent directory or a
        # permission problem surfaces here as OSError
        Path(db_file).touch()

        conn = sqlite3.connect(db_file)
        logging.info("connection established")
        # sqlite_version is the version of the SQLite library itself; the
        # sqlite3.version module attribute is deprecated and absent from
        # newer Python releases
        logging.info("sqlite version: {}".format(sqlite3.sqlite_version))
    except (Error, OSError) as e:
        logging.error("Unable to establish connection")
        logging.error(e, exc_info=True)

    return conn

In [66]:
def create_table(conn, create_table_sql):
    """ create a table from the create_table_sql statement

    params:
        conn:
            Connection object
        create_table_sql:
            a CREATE TABLE statement; it may be preceded by other
            ';'-separated statements (e.g. DROP TABLE IF EXISTS ...)

    returns:
        1 if the SQL ran without error, 0 otherwise (the error is logged)
    """
    if conn is None:
        logging.error("Unable to create table")
        logging.error("No database connection available")
        return 0

    try:
        # executescript accepts several statements in one string, which
        # cursor.execute rejects
        conn.executescript(create_table_sql)
        conn.commit()
    except Error as e:
        logging.error("Unable to create table")
        logging.error(e, exc_info=True)
        return 0

    return 1

In [8]:
def get_txt_data(file_name):
    """ load a tab-separated .txt data file into a pandas dataframe

    The first row under the header holds the column descriptions; it is
    returned separately and removed from the data.

    params:
        file_name:
            title of file of interest (no .txt ending)

    return:
        table:
            pandas dataframe containing the data rows
        col_desc:
            np array containing column descriptions as given in .txt file

    """
    path = './data/data-files/{}.txt'.format(file_name)
    raw = pd.read_csv(path, sep='\t')

    # row 0 is the description row, everything after it is data
    col_desc = raw.loc[0].values
    table = raw.drop(labels=0)

    return table, col_desc


In [57]:
def get_csv_info(file_name):
    """ creates pandas dataframe containing dataset information/descriptions from csv files

    The values of the DataType column are translated to SQLite data types;
    all other columns are returned exactly as read.

    params:
        file_name:
            title of file of interest (no .csv ending)

    return:
        pandas dataframe containing data information

    """

    # Create table from csv file
    table = pd.read_csv('./data/abcd-4.0-data-dictionaries/{}.csv'.format(file_name))

    # convert to sql data types; restricted to the DataType column so that
    # a name or description that happens to read e.g. "Date" is not rewritten
    sql_types = {'GUID': 'TEXT', 'String': 'TEXT', 'Date': 'TEXT', 'Integer': 'INTEGER', 'Float': 'REAL'}
    if 'DataType' in table.columns:
        table['DataType'] = table['DataType'].replace(sql_types)

    return table

In [58]:
def dtypes_dict_maker(file_name):
    """ creates datatypes dict for all element names using info from csv files

    params:
        file_name:
            title of file of interest (no .csv ending)

    return:
        dict mapping element (column) name -> SQLite data type

    """
    # get info table
    table = get_csv_info(file_name)

    # key by element name so the types line up with the data columns
    # (DataFrame.to_dict() would key them by row number instead); rows
    # without a name or type are skipped
    typed = table.dropna(subset=['ElementName', 'DataType'])
    data_types = {str(name): str(dtype) for name, dtype in zip(typed['ElementName'], typed['DataType'])}

    # add common element names; the row id column is named after the file
    common_elems = ['collection_id', 'abcd_bp01_id', '{}'.format(file_name), '{}_id'.format(file_name), 'dataset_id', 'collection_title', 'study_cohort_name']
    for e in common_elems:
        data_types[e] = 'TEXT'

    return data_types
    

In [70]:
def elem_desc_dict_maker(file_name):
    """ builds an element name -> description dict from a .txt data file

    Descriptions that merely repeat a technical name are swapped for a
    readable sentence.

    params:
        file_name:
            title of file of interest (no .txt ending)

    return:
        dict containing element description information

    """

    table, col_desc = get_txt_data(file_name)

    # uninformative descriptions and the text that should replace each of them
    poor_desc = ['collection_id', '{}_id'.format(file_name), 'dataset_id', 'collection_title', 'study_cohort_name']
    good_desc = ['The collection of origin for the record', 'The globally unique row ID of the record in the NDA database', 'The record’s dataset of origin', 'The name of the collection', 'The name of the study cohort the data is from (data release)']

    for poor, good in zip(poor_desc, good_desc):
        col_desc = np.where(col_desc == poor, good, col_desc)

    # pair every column name with the description in the same position
    return dict(zip(table.columns.values, col_desc))

In [41]:
def new_col_entry(column_name, data_type, comment='', constraint='', last_elem=False):
    """ creates entry for sqlite table
    Note: sqlite python API req triple quotes for command to be read

    params:
        column_name:
            name of the column
        data_type:
            SQLite data type of the column
        comment:
            description appended as a trailing SQL '--' comment; a missing
            value (None or NaN) gives an empty comment
        constraint:
            optional column constraint, e.g. 'PRIMARY KEY'
        last_elem:
            True for the final column of the table (no trailing comma)

    return:
        str with entry to put in CREATE TABLE command

    """
    def _one_line(value):
        # None / NaN mark an empty cell in the source table
        if value is None or (isinstance(value, float) and value != value):
            return ''
        # a line break would end the '--' comment early and spill text
        # into the SQL statement
        return str(value).replace("\n", " ")

    # no comma for last element
    separator = "" if last_elem else ","

    return "\t\t\t{} {} {}{} \t -- {}\n".format(
        _one_line(column_name), _one_line(data_type), constraint, separator, _one_line(comment)
    )

In [64]:
def sql_create_table_cmd_builder(file_name):
    """ constructs CREATE TABLE entry for sqlite table using abcd file
    Note: the command first drops any existing table of the same name, so
    it contains two ';'-separated statements
    Note: could update to just use pandas to_sql function (maintain data types part of function)

    params:
        file_name:
            title of file of interest (no file ending); also used as the
            table name

    return:
        str with sqlite DROP TABLE + CREATE TABLE command
    """

    # open CREATE TABLE command
    sql_cmd = " DROP TABLE IF EXISTS {0}; CREATE TABLE {0} (\n\t\t\t".format(file_name)

    # Get column info
    info_table = get_csv_info(file_name)
    n_rows = len(info_table.index)

    # Manually add in first 3 ids from txt file (common to all .txts).
    # dataset_id only ends the column list when the dictionary is empty.
    sql_cmd += new_col_entry("collection_id", "TEXT", "Record's collection of origin")
    sql_cmd += new_col_entry("{}_id".format(file_name), "TEXT", "The globally unique row ID of the record in the NDA database")
    sql_cmd += new_col_entry("dataset_id", "TEXT", "The record's dataset of origin", last_elem=(n_rows == 0))

    # Add remaining data column entries; position (not the index label)
    # decides which entry is last, independently of the constraint
    for position, (_, row) in enumerate(info_table.iterrows()):
        constraint = 'PRIMARY KEY' if row['ElementName'] == 'subjectkey' else ''
        sql_cmd += new_col_entry(
            row['ElementName'],
            row['DataType'],
            row['ElementDescription'],
            constraint=constraint,
            last_elem=(position == n_rows - 1),
        )

    # close CREATE TABLE command
    sql_cmd += """);"""

    return sql_cmd

In [93]:
def elem_desc_df_maker(file_name):
    """ builds a two-column dataframe of element names and descriptions

    params:
        file_name:
            title of file of interest (no .txt ending)

    return:
        df with columns 'elem_name' and 'elem_desc', one row per element

    """

    descriptions = elem_desc_dict_maker(file_name)

    # dict keys become the index, the descriptions the only column
    frame = pd.DataFrame.from_dict(descriptions, orient='index', columns=['elem_desc'])

    # turn the index into a regular 'elem_name' column
    return frame.reset_index().rename(columns={'index': 'elem_name'})


# Notebook preview: show the element-description frame for one data file
elem_desc_df_maker('abcd_bp01')

Unnamed: 0,elem_name,elem_desc
0,collection_id,The collection of origin for the record
1,abcd_bp01_id,The globally unique row ID of the record in th...
2,dataset_id,The record’s dataset of origin
3,subjectkey,The NDAR Global Unique Identifier (GUID) for r...
4,src_subject_id,Subject ID how it's defined in lab/project
5,interview_date,Date on which the interview/genetic test/sampl...
6,interview_age,Age in months at the time of the interview/tes...
7,sex,Sex of subject at birth
8,eventname,The event name for which the data was collected
9,blood_pressure_start_time,Start time


In [95]:
def _list_file_stems(directory, extension):
    """ return the set of file names (extension removed) in directory that end with extension """
    stems = set()
    for entry in os.listdir(directory):
        stem, ext = os.path.splitext(entry)
        # skips hidden files such as .DS_Store and anything the loaders cannot read
        if entry.startswith('.') or ext.lower() != extension:
            continue
        stems.add(stem)
    return stems


def get_file_list(main_dir):
    """ Get list of data files that have the data and info

    param:
        main_dir:
            project directory containing data/data-files (.txt data) and
            data/abcd-4.0-data-dictionaries (.csv documentation)

    return
        file_names:
            sorted list of file names (no ending) with both data and
            documentation


    """
    # build directories
    data_dir = os.path.join(main_dir, 'data/data-files')
    docs_dir = os.path.join(main_dir, 'data/abcd-4.0-data-dictionaries')

    # Get all data / documentation file names
    data_file_names = _list_file_stems(data_dir, '.txt')
    docs_file_names = _list_file_stems(docs_dir, '.csv')

    # Check that data files all have info files
    only_data_files = sorted(data_file_names - docs_file_names)   # file names with txt data but no csv info
    only_info_files = sorted(docs_file_names - data_file_names)   # file names with csv info but no txt data

    # Only want data files associated info file (for now)
    file_names = sorted(data_file_names & docs_file_names)

    if only_data_files:
        logging.warning("Ignoring data files without info files. Data file names ignore: \t")

        for f in only_data_files:
            logging.warning("{}.txt".format(f))

        # TODO: ALSO GET FILES WITH JUST DATA BUT GET NAMES AND DESC FROM DATA FILE

    logging.info("Files with info and no data: %s", only_info_files)

    return file_names

In [106]:
def main():
    """ Create ABCD sqlite3 database and populate with all given data tables.

    Note: current implementation uses pandas' df.to_sql function which does not allow for specification of primary key or commenting

    TODO: Add SQL command to add primary key and comments

    return:
        list with the names of the tables that were created
    """

    # Change to ABCD directory (the loaders use relative ./data paths)
    main_dir = r"/Users/margotwagner/ucsd/research/ABCD"
    os.chdir(main_dir)

    # Name database
    database = r"./data/abcd.db"
    logging.info("Creating database %s", database)

    # Get files list
    file_names = get_file_list(main_dir)
    n_files = len(file_names)
    tables_created = []

    # create table for all data files
    print("beginning now...")
    logging.info("Attempting to create %d tables", n_files)
    success_count = 0
    fail_count = 0

    # one connection for the whole run, closed again when the run ends
    conn = create_connection(database)
    if conn is None:
        logging.error("No database connection available, no tables created")
        return tables_created

    try:
        for f in file_names:

            # data to populate the sql table with
            data_table, _ = get_txt_data(f)

            # get datatypes
            dtypes_dict = dtypes_dict_maker(f)

            # create sql table in database
            try:
                data_table.to_sql(f, con=conn, dtype=dtypes_dict, index=False)
            except ValueError as ve:
                logging.error("Unable to create table for {} (it may already exist)".format(f))
                print("Data table for {} was not created".format(f))
                logging.error(ve, exc_info=True)
                fail_count += 1
            else:
                logging.info("Table {} created and populated successfully".format(f))
                success_count += 1
                tables_created.append(f)

                # append elements in file and their descriptions to description table;
                # a problem here must not count the data table as failed
                try:
                    desc_df = elem_desc_df_maker(f)
                    desc_df.to_sql('all_element_descriptions', con=conn, if_exists='append', index=False)
                except ValueError as ve:
                    logging.error("Unable to store element descriptions for {}".format(f))
                    logging.error(ve, exc_info=True)

            # Output message
            if ((success_count + fail_count)%10 == 0):
                print("{:.1%} completion with {} tables successfully completed and {} failures".format(((success_count+fail_count)/n_files), success_count, fail_count))
    finally:
        conn.close()

    # percentages are undefined when there was nothing to process
    if n_files:
        logging.info("Percent of tables successfully created: {:.0%}".format(success_count/n_files))
        logging.info("Percent of files that were not created: {:.0%}".format(fail_count/n_files))

    logging.info("Completed!")

    print("Finished!")

    return tables_created
    

# Build the database when run directly and keep the names of the created tables
if __name__ == '__main__':
    tables_created = main()     

beginning now...


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


2.7% completion with 10 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


5.4% completion with 20 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


8.1% completion with 30 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


10.8% completion with 40 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


13.5% completion with 50 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


16.2% completion with 60 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


18.9% completion with 70 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


21.6% completion with 80 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


24.3% completion with 90 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


27.0% completion with 100 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


29.6% completion with 110 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


32.3% completion with 120 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


35.0% completion with 130 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


37.7% completion with 140 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


40.4% completion with 150 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


43.1% completion with 160 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


45.8% completion with 170 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


48.5% completion with 180 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


51.2% completion with 190 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


53.9% completion with 200 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


56.6% completion with 210 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)


59.3% completion with 220 tables successfully completed and 0 failures


  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


62.0% completion with 230 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


64.7% completion with 240 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


67.4% completion with 250 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)


70.1% completion with 260 tables successfully completed and 0 failures


  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


72.8% completion with 270 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


75.5% completion with 280 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


78.2% completion with 290 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


80.9% completion with 300 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


83.6% completion with 310 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


86.3% completion with 320 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


88.9% completion with 330 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


91.6% completion with 340 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


94.3% completion with 350 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


97.0% completion with 360 tables successfully completed and 0 failures


  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)
  exec(code_obj, self.user_global_ns, self.user_ns)
  desc_df = elem_desc_df_maker(f)


99.7% completion with 370 tables successfully completed and 0 failures
Finished!


In [None]:
def main():
    """ Create the table for a single ABCD file from a hand-built CREATE TABLE command.

    Alternative to the pandas to_sql based build: the command comes from
    sql_create_table_cmd_builder, so column comments and the primary key
    are part of the table definition.

    TODO: insert the loaded data rows into the created table
    """
    # Change to ABCD directory (the loaders use relative ./data paths)
    main_dir = r"/Users/margotwagner/ucsd/research/ABCD"
    os.chdir(main_dir)

    # Name database
    database = r"./data/abcd.db"
    logging.info("Creating database %s", database)

    # Data files that also have documentation; files lacking one of the two
    # are reported through logging by get_file_list
    file_names = get_file_list(main_dir)

    # create table for all data files
    # for f in file_names:
    for f in ['abcd_bp01']:

        # get file name from list
        logging.info("Creating table for file: %s", f)

        # sqlite statements
        sql_cmd = sql_create_table_cmd_builder(f)
        logging.info("Using the following sqlite command: \n%s", sql_cmd)

        # create a database connection
        conn = create_connection(database)
        if conn is None:
            continue

        try:
            # create table
            table_created = create_table(conn, sql_cmd)

            if table_created:
                # load the data that belongs in the table
                data_table, col_desc = get_txt_data(f)

                # nothing is inserted yet, so do not claim the table is populated
                logging.info("Table created successfully; data loaded but not inserted yet")
        finally:
            conn.close()

# Run the single-table build when executed directly
if __name__ == '__main__':
    main()
