In [276]:
########################################################################################
import os
import glob
import numpy
import pandas as pd

########################################################################################
def open_LVfile(LVfile):
    """
    This function opens a single LIPIDVIEW text file, renames columns and removes redundant ones.
    This function returns a cleaner table.
    """
    col_names = list(pd.read_table(LVfile, skiprows=1, usecols = lambda column : 'Unnamed' not in column))
    if col_names[0] == 'Sample Name':
        col_names[0] = 'Lipid Name'
    lipid_data = pd.read_table(LVfile, skiprows=2, usecols = lambda column : column not in ['Sample ID', 'PIS m/z', 'Polarity', 'View Type', '(ScanName)'])
    lipid_data.columns = col_names
    for i in range(0, len(lipid_data)):
        lipid_data.loc[i,'Lipid Name'] = lipid_data.loc[i,'Lipid Name'].replace('+NH4','')
    return lipid_data
########################################################################################
def check_col(dataset, column):
    """
    This function returns all rows without NaN's of a given column based on a given dataset.
    This function should be used for locating a suspicious column. The output reveals the lipidclass, where the suspicious column is coming from.
    """
    lipidclass = dataset[dataset[column].notnull()]
    
    return lipidclass
########################################################################################
def test_column_amount(data_frame, amount_samples, amount_kons=3):
    """
    This function is a test.
    """
    sample_cols = data_frame.loc[:, data_frame.columns.str.contains('ample')].columns
    kon_cols = data_frame.loc[:, data_frame.columns.str.contains('on')].columns
    if not  len(sample_cols) == amount_samples:
        raise ValueError(F'There are more sample columns than samples: Check suspicious columns with check_col(dataset, column)-function to find out the lipidclass-based textfile {sample_cols.values}')
    if not  len(kon_cols) == amount_kons:
        raise ValueError(F'There are more control columns than controls: Check suspicious columns with check_col(dataset, column)-function to find out the lipidclass-based textfile {kon_cols.values}')
########################################################################################
####-------------------------------__Main Code__------------------------------------####

# build up your filepath of the raw data and save all paths in a list via glob
file_location = os.path.join('data', '*.txt')
filenames = glob.glob(file_location)

# loop all files as dataframes in a list
data = []
for i in range(0,len(filenames)):
    dat = open_LVfile(filenames[i])
    data.append(dat)
    
# merge all dataframes to one
comb_data = []
for i in range(0,1):
    comb_data = data[i]
    for i2 in range(1, 15):
        comb_data = pd.concat([comb_data, data[i2]], axis=0)

# reset the Index, because through merging of all files, we got indexes of the same number
comb_data = comb_data.reset_index(drop = True)

########################################################################################

# Extract the lipidclass from 'LipidName'-column (e.g. 'IS Chol 24:1')
# and add Lipidclass (e.g 'Chol') and Fatty acid sum (e.g '24:1') as new columns to the dataframe
lipidclass_col = []
for i in range(0, len(comb_data)):
    #Get rid off 'IS'-string
    species = comb_data.loc[i,'Lipid Name'].replace('IS ','')
    #Get rid off 'fatty acid sum'-string
    lipidclass_col.append(species.split())
    
comb_data.insert(1, 'Lipid Class', [ x[0] for x in lipidclass_col])
comb_data.insert(2, 'FA_sum', [ x[-1] for x in lipidclass_col])
########################################################################################
# Runs a test to check, if sample amount and controls are fitting to the column amount
#test_column_amount(comb_data, amount_samples, amount_kons=3)
########################################################################################

tidy_data = pd.melt(comb_data,
                         ['Lipid Name', 'Lipid Class', 'FA_sum'],
                         var_name='Sample Name',
                         value_name='Intensities')

#tidy_data.to_csv('formatted_data.csv')

comb_data

Unnamed: 0,Lipid Name,Lipid Class,FA_sum,Sample 1,Sample 2,Sample 3,Sample 4,Kon A,Kon B,Kon C,Kon A.1
0,Chol,Chol,Chol,30724625,16071750,35192875,34937125,2477500,18795250.0,34101000.0,34937125.0
1,IS D6-Chol,D6-Chol,D6-Chol,15831750,8866625,16906125,14732250,25053625,32932125.0,32824625.0,14732250.0
2,IS Chol 9:0,Chol,9:0,16869875,15516250,151.707.497.398,14463125,12.362.928.766,,,
3,IS Chol 19:0,Chol,19:0,118.567.020.699,101.314.613.682,100.407.917.856,107.190.411.608,11073125,,,
4,IS Chol 24:1,Chol,24:1,17638375,14496375,122.359.463.741,15265625,21154125,,,
...,...,...,...,...,...,...,...,...,...,...,...
824,TAG 60:5,TAG,60:5,0,0,6500,0,0,,,
825,TAG 60:5,TAG,60:5,0,0,3500,3000,0,,,
826,TAG 60:5,TAG,60:5,11250,6375,16750,15250,6750,,,
827,TAG 60:7,TAG,60:7,0,0,0,0,0,,,


In [180]:
test = check_lipidclass(comb_data,'Kon A')
pd.Series(test.loc[:,'Lipid Class']).unique(order :{'C'})
#df = df.fillna(0)      #if the blanks are nan will need this line first
#df['Kon A'] = df['kon A'] + df['Kon A']

#df.drop('kon A', axis=1)

SyntaxError: invalid syntax (<ipython-input-180-15014a15a5f5>, line 2)

In [263]:
def test_column_amount(data_frame, amount_samples, amount_kons=3):
    sample_cols = data_frame.loc[:, data_frame.columns.str.contains('ample')].columns
    kon_cols = data_frame.loc[:, data_frame.columns.str.contains('on')].columns
    if not  len(sample_cols) == amount_samples:
        raise ValueError(F'There are more sample columns than samples: Check suspicious columns with check_col(dataset, column)-function to find out the lipidclass-based textfile {sample_cols.values}')
    if not  len(kon_cols) == amount_kons:
        raise ValueError(F'There are more control columns than controls: Check suspicious columns with check_col(dataset, column)-function to find out the lipidclass-based textfile {kon_cols.values}')
        
pd.Series(test.loc[:,'Lipid Class']).unique(order :{'C'})

In [275]:
sample_cols = comb_data.loc[:, comb_data.columns.str.contains('ample')].columns
kon_cols = comb_data.loc[:, comb_data.columns.str.contains('on')].columns

#test_column_amount(comb_data, 4, 4)
comb_data.loc[:, comb_data.columns.str.contains('Sample')].columns

Index(['Sample 1', 'Sample 2', 'Sample 3', 'Sample 4'], dtype='object')