# Notebook to prepare data for Just-DNA-Seq modules

The purpose of this notebook is to parse the data provided by the curator (it should presumably work with all further data as long as the format stays the same).

In [1]:
import shutil
import sqlite3
from pathlib import *

import polars as pl

### Setting up paths

In [11]:
# Directory layout, all relative to the folder the notebook is run from.
base = Path(".")
data = base.joinpath("data")
# NOTE: this name shadows the built-in input(); it is kept because later cells refer to it.
input = data.joinpath("input")
output = data.joinpath("output")

# SQLite table names; later cells match them by position with the loaded files.
table_name = ["rsids", "studies", "weight"]

Variables to change for different modules:

In [12]:
# Per-module settings: edit these when preparing a different module.
module_name = "lipidmetabolism"  # sub-folder of the input directory that is read
module_input = input.joinpath(module_name)
sqlite_db_name = "lipid_metabolism.sqlite"  # file name of the database that gets written

Getting names of the files in the input directory:

In [5]:
# Collect the regular files of the module's input directory.
# iterdir() yields entries in arbitrary order, while later cells pair the files
# with the table names by position; sorting by path makes that pairing
# deterministic (alphabetically: ..._rsids, ..._studies, ..._weights).
files = sorted(entry for entry in module_input.iterdir() if entry.is_file())

print(files)

[WindowsPath('data/input/lipidmetabolism/lipid_metabolism_rsids.tsv'), WindowsPath('data/input/lipidmetabolism/lipid_metabolism_studies.tsv'), WindowsPath('data/input/lipidmetabolism/lipid_metabolism_weights.tsv')]


Preparing one table for the SQLite database from each TSV file

In [6]:
# One pandas DataFrame per input file, in the same order as `files`.
db = []

# Iterate over the files that were actually found instead of a hard-coded count.
for tsv_path in files:
    # NOTE(review): newer polars releases spell this keyword `separator` -
    # confirm against the installed polars version.
    table = pl.read_csv(str(tsv_path), sep='\t')

    if 'Weight' in table.columns:
        # Swap the first comma of each value for a dot (presumably decimal commas).
        # No cast is applied here, so the column remains a string column.
        table = table.with_columns(pl.col('Weight').str.replace(',', '.'))

    # Normalise headers: lower-case, spaces replaced by underscores.
    table = table.rename(
        {header: header.lower().replace(' ', '_') for header in table.columns}
    )

    # pandas is used for the SQLite export in a later cell.
    db.append(table.to_pandas())


Clearing output

In [7]:
# Start from an empty output directory.
# rmtree deletes everything under it, including files written by earlier runs.
if output.exists():
    shutil.rmtree(output)
# parents=True also creates missing parent folders instead of raising.
output.mkdir(parents=True)

Creating sqlite database

In [8]:
sqlite_db_path = output / sqlite_db_name

# Tables and table names are paired by position, so a count mismatch would
# store data under the wrong name; stop before any database file is created.
if len(db) != len(table_name):
    raise ValueError(
        f"expected {len(table_name)} tables ({', '.join(table_name)}), got {len(db)}"
    )

sqlite_con = sqlite3.connect(sqlite_db_path)

try:
    # to_sql keeps its default index handling, so the DataFrame index is
    # written as a column as well.
    for name, frame in zip(table_name, db):
        frame.to_sql(name, sqlite_con, if_exists='replace')
except Exception:
    # The regular close() lives in a later cell that is never reached when this
    # one fails; close here so the database file is not left held open.
    sqlite_con.close()
    raise


In [10]:
# Close the SQLite connection opened in the previous cell.
sqlite_con.close()