In [1]:
import os
import mysql.connector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [2]:
user_name = os.environ.get('DB_USER')
password = os.environ.get('DB_PASSWORD')

In [56]:
cnx = mysql.connector.connect(user=user_name, password=password,host='localhost',database='FilePaths')

[Cursor documentation](https://www.psycopg.org/docs/cursor.html)

In [48]:
# create cursor to read or write
cursor = cnx.cursor()

## Existing tables

In [4]:
pd.read_sql_query('show tables from FilePaths',cnx)

Unnamed: 0,Tables_in_filepaths
0,FilePath


In [5]:
pd.read_sql_query('describe {}'.format('FilePath'), cnx)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,filePath,varchar(200),NO,PRI,,


## Insert into tables

### Read file and transform into pandas --> separate by abundance, pH and metabolites --> create files --> save in the DB

### Form the lists that contain the abundance, pH or metabolites respective columns names

In order to do this we need:
* A file contaning all the unique possible headers that have been found parsing all the provided files.
* The regex expression, which is set manually. For bacterial abundances, fields that contain **time, active, liquid, OD**, but if I find others, like **qPCR, 16S rRNA seq, ...** they must be added in the regex. For pH measurements, only **ph and time**. For metabolites, it considers that everything that was not count on abundances and tme, belongs to metabolites + time.

In [7]:
headersFile = '/Users/julia/bacterialGrowth_thesis/IntermediateFiles/lab_headers.txt'

with open(headersFile) as file:
    headers_list_split = file.read().splitlines()
    headers_list = file.read()
    print(headers_list)




In [8]:
# Using regex so it is case insensitive and look for patterns inside words (not only complete words)

def getMatchingList (regex, lst):
    list = []
    for word in lst:
        if regex.findall(word):
            list.append(word)
    return list

In [9]:
growth_regex = re.compile(r'.*time.* | .*liquid.* | .*active.* | .*OD.*', flags=re.I | re.X)
growth_regex = re.compile(r'time | liquid | active | OD', flags=re.I | re.X)
ph_regex = re.compile(r'.*time.* | .*ph.*', flags=re.I | re.X)

growth_fields = getMatchingList(growth_regex, headers_list_split)
ph_fields = getMatchingList(ph_regex, headers_list_split)

In [10]:
not_metabolites_list = list(set(growth_fields) | set(ph_fields))
not_metabolites_list

['liquidRIactivecount',
 'liquidactivecount',
 'liquidtotalcount',
 'attachedBTactivecount',
 'time',
 'liquidBTactivecount',
 'pH',
 'attachedactivecount',
 'attachedRIactivecount',
 'OD']

In [11]:
metabolites_fields = set(headers_list_split) - set(not_metabolites_list)
metabolites_fields = list(metabolites_fields)
metabolites_fields.append('time')
metabolites_fields

['Lactate',
 'Galactose',
 'Pyruvate',
 'Glucose',
 'Butyrate',
 'Neu5Ac',
 'Formate',
 'Succinate',
 'Trehalose',
 'Acetate',
 'Mannose',
 'Fucose',
 'time']

### Several paths

In [30]:
files = ['/Users/julia/bacterialGrowth_thesis/Data/BT_batch_WC1/1/BT_batch_WC1_1.txt',
         '/Users/julia/bacterialGrowth_thesis/Data/BTRI_WC_RNAseq/1/BTRI_WC_RNAseq_1.txt']

In [31]:
def findOccurrences(s, ch):
    return [i for i, letter in enumerate(s) if letter == ch]

In [50]:
for f in files:
    #Get directory of the provided paths
    path_end = max(findOccurrences(f, "/"))
    path = f[:path_end+1]
    
    #Read file with all the data
    df = pd.read_table(f, sep=" ")

    #Bacterial abundances
    growth_data = df[df.columns.intersection(growth_fields)]
    growth_data = growth_data.round({'OD': 3})
    #pH over time
    ph_data = df[df.columns.intersection(ph_fields)]
    ph_data = ph_data.round({'pH': 3})
    #Metabolites over time
    metabolites_data = df[df.columns.intersection(metabolites_fields)]
    
    # If len(df.columns) <= 1 (only time column), we do not save it
    if len(growth_data.columns) > 1:
        growth_data.to_csv(path+'growth_file.txt', sep=" ", index=False)
    
    if len(ph_data.columns) > 1:        
        ph_data.to_csv(path+'pH_file.txt', sep=" ", index=False)
    
    if len(metabolites_data.columns) > 1:
        metabolites_data.to_csv(path+'metabolites_file.txt', sep=" ", index=False)

    q = f""" INSERT IGNORE INTO FilePath (filePath)
        VALUES ('{path+'growth_file.txt'}')
        """

    cursor.execute(q)

cursor.commit()

In [51]:
metabolites_data

Unnamed: 0,time
0,0
1,12
2,24
3,36


In [52]:
for i in pd.read_sql('SELECT filePath FROM FilePath', cnx)['filePath']:
    print(i[50:])

WC1/1/growth_file.txt
NAseq/1/growth_file.txt


In [57]:
pd.read_sql('SELECT filePath FROM FilePath', cnx)

Unnamed: 0,filePath
0,/Users/julia/bacterialGrowth_thesis/Data/BT_ba...
1,/Users/julia/bacterialGrowth_thesis/Data/BTRI_...


In [58]:
cursor.close()

cnx.commit()
cnx.close()