In [2]:
import os
import mysql.connector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [34]:
user_name = os.environ.get('DB_USER')
password = os.environ.get('DB_PASSWORD')

In [35]:
cnx = mysql.connector.connect(user=user_name, password=password,host='localhost',database='FilePaths')

[Cursor documentation](https://www.psycopg.org/docs/cursor.html)

[I think, better documentation for cursor](https://dev.mysql.com/doc/connector-python/en/connector-python-api-mysqlcursor.html)

In [36]:
# create cursor to read or write
cursor = cnx.cursor()

## Existing tables

In [16]:
pd.read_sql_query('show tables from FilePaths',cnx)

Unnamed: 0,Tables_in_filepaths
0,FilePath


In [17]:
pd.read_sql_query('describe {}'.format('FilePath'), cnx)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,filePath,varchar(200),NO,PRI,,


## Insert into tables

### Read file and transform into pandas --> separate by abundance, pH and metabolites --> create files --> save in the DB

### Form the lists that contain the abundance, pH or metabolites respective columns names

In order to do this we need:
* A file contaning all the unique possible headers that have been found parsing all the provided files.
* The regex expression, which is set manually. For bacterial abundances, fields that contain **time, active, liquid, OD**, but if I find others, like **qPCR, 16S rRNA seq, ...** they must be added in the regex. For pH measurements, only **ph and time**. For metabolites, it considers that everything that was not count on abundances and tme, belongs to metabolites + time.

In [17]:
headersFile = '/Users/julia/bacterialGrowth_thesis/IntermediateFiles/lab_headers.txt'

with open(headersFile) as file:
    headers_list_split = file.read().splitlines()
    headers_list = file.read()
    print(headers_list)




In [18]:
headers_list_split

['Acetate',
 'Butyrate',
 'Formate',
 'Fucose',
 'Galactose',
 'Glucose',
 'Lactate',
 'Mannose',
 'Neu5Ac',
 'OD',
 'Pyruvate',
 'Succinate',
 'Trehalose',
 'attachedBTactivecount',
 'attachedRIactivecount',
 'attachedactivecount',
 'liquidBTactivecount',
 'liquidRIactivecount',
 'liquidactivecount',
 'liquidtotalcount',
 'pH',
 'time']

In [19]:
type(headers_list_split)

list

In [20]:
# Using regex so it is case insensitive and look for patterns inside words (not only complete words)

def getMatchingList (regex, lst):
    list = []
    for word in lst:
        if regex.findall(word):
            list.append(word)
    return list

In [21]:
growth_regex = re.compile(r'.*time.* | .*liquid.* | .*active.* | .*OD.*', flags=re.I | re.X)
growth_regex = re.compile(r'time | liquid | active | OD', flags=re.I | re.X)
ph_regex = re.compile(r'.*time.* | .*ph.*', flags=re.I | re.X)

growth_fields = getMatchingList(growth_regex, headers_list_split)
ph_fields = getMatchingList(ph_regex, headers_list_split)

In [22]:
not_metabolites_list = list(set(growth_fields) | set(ph_fields))
not_metabolites_list

['liquidactivecount',
 'liquidtotalcount',
 'attachedRIactivecount',
 'OD',
 'liquidRIactivecount',
 'attachedactivecount',
 'pH',
 'liquidBTactivecount',
 'attachedBTactivecount',
 'time']

In [23]:
metabolites_fields = set(headers_list_split) - set(not_metabolites_list)
metabolites_fields = list(metabolites_fields)
metabolites_fields.append('time')
metabolites_fields

['Pyruvate',
 'Galactose',
 'Acetate',
 'Mannose',
 'Fucose',
 'Trehalose',
 'Formate',
 'Glucose',
 'Butyrate',
 'Neu5Ac',
 'Lactate',
 'Succinate',
 'time']

### Several paths

THINK:
- One options is to call the .py file containing this code with a file containing all the new paths. 
- Make the file with all the paths in the IntermediateFiles directory

Or

- The user will use a first (and hopw, unique) python script in which all the experiment information will be provided. Then it should give the directory with all the files. These files need to be parsed and is needed, for this python code, to get the headers inside all of the files and the paths for saving the separate files for the abundances, metabolites and ph.

In [24]:
files = open("/Users/julia/bacterialGrowth_thesis/IntermediateFiles/listOfFiles.list", "r").readlines()
#print(f.read())
#print(files)

In [25]:
files2 = list(map(lambda s: s.strip(), files))

In [30]:
print(files2[0:2])

['/Users/julia/bacterialGrowth_thesis/Data/BTRI_mucin_RNAseq/1/BTRI_mucin_RNAseq_1.txt', '/Users/julia/bacterialGrowth_thesis/Data/BTRI_mucin_RNAseq/3/BTRI_mucin_RNAseq_3.txt']


In [27]:
def findOccurrences(s, ch):
    return [i for i, letter in enumerate(s) if letter == ch]

In [31]:
for f in files2:
    #Get directory of the provided paths
    path_end = max(findOccurrences(f, "/"))
    path = f[:path_end+1]
    
    #Read file with all the data
    df = pd.read_table(f, sep=" ")

    #Bacterial abundances
    growth_data = df[df.columns.intersection(growth_fields)]
    growth_data = growth_data.round({'OD': 3})
    #pH over time
    ph_data = df[df.columns.intersection(ph_fields)]
    ph_data = ph_data.round({'pH': 3})
    #Metabolites over time
    metabolites_data = df[df.columns.intersection(metabolites_fields)]
    
    # If len(df.columns) <= 1 (only time column), we do not save it
    if len(growth_data.columns) > 1:
        growth_data.to_csv(path+'abundance_file.txt', sep=" ", index=False)
    
    if len(ph_data.columns) > 1:        
        ph_data.to_csv(path+'pH_file.txt', sep=" ", index=False)
    
    if len(metabolites_data.columns) > 1:
        metabolites_data.to_csv(path+'metabolites_file.txt', sep=" ", index=False)

    q = f""" INSERT IGNORE INTO FilePath (filePath)
        VALUES ('{path+'abundance_file.txt'}')
        """

    cursor.execute(q)
    
    q = f""" INSERT IGNORE INTO FilePath (filePath)
        VALUES ('{path+'metabolites_file.txt'}')
        """

    cursor.execute(q)
    
    q = f""" INSERT IGNORE INTO FilePath (filePath)
        VALUES ('{path+'pH_file.txt'}')
        """

    cursor.execute(q)

cnx.commit()
print('DONE!')

DONE!


In [38]:
for i in pd.read_sql('SELECT filePath FROM FilePath', cnx)['filePath']:
    print(i[50:])

mucin1/1/abundance_file.txt
mucin1/1/growth_file.txt
mucin1/1/metabolites_file.txt
mucin1/1/pH_file.txt
mucin1/2/abundance_file.txt
mucin1/2/growth_file.txt
mucin1/2/metabolites_file.txt
mucin1/2/pH_file.txt
mucin1/3/abundance_file.txt
mucin1/3/growth_file.txt
mucin1/3/metabolites_file.txt
mucin1/3/pH_file.txt
mucin1/4/abundance_file.txt
mucin1/4/growth_file.txt
mucin1/4/metabolites_file.txt
mucin1/4/pH_file.txt
mucin1/5/abundance_file.txt
mucin1/5/growth_file.txt
mucin1/5/metabolites_file.txt
mucin1/5/pH_file.txt
mucin1/6/abundance_file.txt
mucin1/6/growth_file.txt
mucin1/6/metabolites_file.txt
mucin1/6/pH_file.txt
mucin1/7/abundance_file.txt
mucin1/7/growth_file.txt
mucin1/7/metabolites_file.txt
mucin1/7/pH_file.txt
mucin1/8/abundance_file.txt
mucin1/8/growth_file.txt
mucin1/8/metabolites_file.txt
mucin1/8/pH_file.txt
mucin1/9/abundance_file.txt
mucin1/9/growth_file.txt
mucin1/9/metabolites_file.txt
mucin1/9/pH_file.txt
mucin2/1/abundance_file.txt
mucin2/1/growth_file.txt
mucin2/1/me

In [37]:
pd.read_sql('SELECT filePath FROM FilePath', cnx)

Unnamed: 0,filePath
0,/Users/julia/bacterialGrowth_thesis/Data/BT_ba...
1,/Users/julia/bacterialGrowth_thesis/Data/BT_ba...
2,/Users/julia/bacterialGrowth_thesis/Data/BT_ba...
3,/Users/julia/bacterialGrowth_thesis/Data/BT_ba...
4,/Users/julia/bacterialGrowth_thesis/Data/BT_ba...
...,...
423,/Users/julia/bacterialGrowth_thesis/Data/RI_WC...
424,/Users/julia/bacterialGrowth_thesis/Data/RI_WC...
425,/Users/julia/bacterialGrowth_thesis/Data/RI_WC...
426,/Users/julia/bacterialGrowth_thesis/Data/RI_WC...


In [33]:
cursor.close()

cnx.commit()
cnx.close()