Fixes to the PLS-DA implementation to work with filtered class groups >2

mfitzp · Oct 24, 2014 · b673ea1 · b673ea1
1 parent f26be50
commit b673ea1
Show file tree

Hide file tree

Showing 6 changed files with 102 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Pathomx
 
-*Latest stable release v3.0.1 (22nd October 2014).*
+*Latest stable release v3.0.2 (24th October 2014).*
 
 Pathomx is an interactive tool for the analysis and visualisation of scientific data.
 Built on IPython it allows rapid, workflow-based exploration of complex datasets through

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.0.1
+3.0.2
diff --git a/pathomx/figures.py b/pathomx/figures.py
@@ -459,7 +459,7 @@ def scatterplot(data, figure=None, ax=None, styles=None, lines=[], label_index=N
             df = data
 
         s = ls.markersize ** 2 if ls.markersize != None else 20  #default
-        plots[c] = ax.scatter(df.iloc[:, 0], df.iloc[:, 1], color=ls.markerfacecolor, marker=ls.marker, s=s)
+        plots[c] = ax.scatter(df.iloc[:, 0].values, df.iloc[:, 1].values, color=ls.markerfacecolor, marker=ls.marker, s=s)
 
 
         # Calculate 95% confidence interval for data but only if points >1

diff --git a/pathomx/plugins/multivariate/pls_da.py b/pathomx/plugins/multivariate/pls_da.py
@@ -12,19 +12,29 @@
 class_idx = input_data.index.names.index('Class')
 classes = list( input_data.index.levels[ class_idx ] )
 
-Y = input_data.index.labels[ class_idx ]
+Y = []
+sample_filter_idx = []
+for n, cv in enumerate(input_data.index.values):
+    c = cv[class_idx]
+    if c == _experiment_control:
+        Y.append(0)
+        sample_filter_idx.append(n)
+    elif c == _experiment_test or _experiment_test == '*':
+        Y.append(1)
+        sample_filter_idx.append(n)
 
-plsr.fit(input_data.values, Y)
+
+plsr.fit(input_data.values[sample_filter_idx,:], Y)
 
 # Build scores into a dso no_of_samples x no_of_principal_components
 scores = pd.DataFrame(plsr.x_scores_)  
-scores.index = input_data.index
+scores.index = pd.MultiIndex.from_tuples([v for n, v in enumerate(input_data.index.values) if n in sample_filter_idx], names=list(input_data.index.names))
 
 scoresl =[]
 for n,s in enumerate(plsr.x_scores_.T):
-    scoresl.append( 'Latent Variable %d' % (n+1) ) #, plsr.y_weights_[0][n]) 
+    scoresl.append( 'Latent Variable %d' % (n+1) ) #, plsr.y_weights_[0][n])
 scores.columns = scoresl
-    
+
 
 weights = pd.DataFrame( plsr.x_weights_.T )
 weights.columns = input_data.columns
@@ -39,9 +49,9 @@
 for n in range(0, plsr.x_weights_.shape[1] ):
     lvd =  pd.DataFrame( plsr.x_weights_[:,n:n+1].T )
     lvd.columns = input_data.columns
-    
+
     vars()['LV%d' % (n+1)]  = spectra(lvd, styles=styles)
-    
+
     #weightsdl.append("Weights on LV %s" % (n+1))
     weightsdc.append("LV %s" % (n+1))
 
@@ -56,7 +66,7 @@
     label_index = None
 
 for sc in score_combinations:
-     vars()['Scores %dv%d' % (sc[0]+1, sc[1]+1)] = scatterplot(scores.iloc[:,sc], styles=styles, label_index=label_index)
+    vars()['Scores %dv%d' % (sc[0]+1, sc[1]+1)] = scatterplot(scores.iloc[:,sc], styles=styles, label_index=label_index)
 
-weightsd = None; # Clean up
-lvd = None; # Clean up
+weightsd = None;  # Clean up
+lvd = None;  # Clean up
diff --git a/pathomx/plugins/nmr/bruker_import.py b/pathomx/plugins/nmr/bruker_import.py
@@ -130,12 +130,18 @@ def autophase_PeakMinima(x, s):
 else:
     sample_id_regexp = None
 
+if config['class_regexp']:
+    class_regexp = re.compile(config['class_regexp'])
+else:
+    class_regexp = None
 
 # We should have a folder name; so find all files named fid underneath it (together with path)
 # Extract the path, and the parent folder name (for sample label)
 nmr_data = []
 nmr_dic = []
 sample_labels = []
+sample_classes = []
+
 _ppm_real_scan_folder = False
 fids = []
 for r, d, files in os.walk(config['filename']):  # filename contains a folder for Bruker data
@@ -173,31 +179,75 @@ def autophase_PeakMinima(x, s):
         if config['sample_id_from'] == 'Scan number':
             label = os.path.basename(fid)
 
-        elif config['sample_id_from'] == 'Experiment name':
-            label = dic['acqus']['EXP']
-
-        elif config['sample_id_from'] == 'Experiment (regexp)' and sample_id_regexp is not None:
-            m = sample_id_regexp.search(dic['acqus']['EXP'])
-            if m:
-               label = m.group(0) if m.lastindex is None else m.group(m.lastindex)
+        elif config['sample_id_from'] == 'Sequential':
+            label = str(n+1)
 
-            else:  # Fallback
+        elif config['sample_id_from'] == 'Experiment (regexp)':
+            if sample_id_regexp is None:
                 label = dic['acqus']['EXP']
 
-        elif config['sample_id_from'] == 'Path (regexp)' and sample_id_regexp is not None:
-            m = sample_id_regexp.search(fid)
-            if m:
-               label = m.group(0) if m.lastindex is None else m.group(m.lastindex)
+            else:
+                m = sample_id_regexp.search(dic['acqus']['EXP'])
+                if m:
+                    label = m.group(0) if m.lastindex is None else m.group(m.lastindex)
 
-            else: # Fallback
+                else:  # Fallback
+                    label = dic['acqus']['EXP']
+
+        elif config['sample_id_from'] == 'Path (regexp)':
+            if sample_id_regexp is None:
                 label = os.path.basename(fid)
 
+            else:
+                m = sample_id_regexp.search(fid)
+                if m:
+                    label = m.group(0) if m.lastindex is None else m.group(m.lastindex)
+
+                else:  # Fallback
+                    label = fid
+
         else:
             label = os.path.basename(fid)
 
+
+        # Generate class label for this spectrum
+        # ['None', 'Experiment (regexp)', 'Path (regexp)']
+        if config['class_from'] == 'None':
+            classn = ''
+
+        elif config['class_from'] == 'Experiment (regexp)':
+            if class_regexp is None:
+                classn = dic['acqus']['EXP']
+
+            else:
+                m = class_regexp.search(dic['acqus']['EXP'])
+                if m:
+                    classn = m.group(0) if m.lastindex is None else m.group(m.lastindex)
+
+                else:  # Fallback
+                    classn = dic['acqus']['EXP']
+
+        elif config['class_from'] == 'Path (regexp)':
+            if class_regexp is None:
+                classn = os.path.basename(fid)
+
+            else:
+                m = class_regexp.search(fid)
+                if m:
+                    classn = m.group(0) if m.lastindex is None else m.group(m.lastindex)
+
+                else:  # Fallback
+                    classn = fid
+
+        else:
+            classn = ''
+
         #if 'AUTOPOS' in dic['acqus']:
         #    label = label + " %s" % dic['acqus']['AUTOPOS']
+
         sample_labels.append(label)
+        sample_classes.append(classn)
+
         nmr_data.append(data)
         nmr_dic.append(dic)
         _ppm_real_scan_folder = fid
@@ -233,7 +283,7 @@ def autophase_PeakMinima(x, s):
 
     print("Processing spectra to Pandas DataFrame...")
     output_data = pd.DataFrame(nmr_data)
-    output_data.index = pd.MultiIndex.from_tuples([(l, '') for l in sample_labels], names=['Sample', 'Class'])
+    output_data.index = pd.MultiIndex.from_tuples([(l, c) for l, c in zip(sample_labels, sample_classes)], names=['Sample', 'Class'])
     output_data.columns = pd.MultiIndex.from_tuples([(s, ) for s in nmr_ppms], names=['Scale'])
 
     # Export the dictionary parameters for all sets

diff --git a/pathomx/plugins/nmr/loader.py b/pathomx/plugins/nmr/loader.py
@@ -66,7 +66,7 @@ def __init__(self, parent, *args, **kwargs):
         self.config.add_handler('path_filter_regexp', pathfreg_le)
 
         cb_sampleidfrom = QComboBox()
-        cb_sampleidfrom.addItems(['Scan number', 'Experiment name', 'Experiment (regexp)', 'Path (regexp)'])
+        cb_sampleidfrom.addItems(['Scan number', 'Experiment (regexp)', 'Path (regexp)'])
         grid.addWidget(QLabel('Sample ID from'), 2, 0)
         grid.addWidget(cb_sampleidfrom, 2, 1)
         self.config.add_handler('sample_id_from', cb_sampleidfrom)
@@ -76,6 +76,17 @@ def __init__(self, parent, *args, **kwargs):
         grid.addWidget(sample_regexp_le, 3, 1)
         self.config.add_handler('sample_id_regexp', sample_regexp_le)
 
+        cb_classfrom = QComboBox()
+        cb_classfrom.addItems(['None', 'Experiment (regexp)', 'Path (regexp)'])
+        grid.addWidget(QLabel('Class from'), 4, 0)
+        grid.addWidget(cb_classfrom, 4, 1)
+        self.config.add_handler('class_from', cb_classfrom)
+
+        class_regexp_le = QLineEdit()
+        grid.addWidget(QLabel('Class regexp'), 5, 0)
+        grid.addWidget(class_regexp_le, 5, 1)
+        self.config.add_handler('class_regexp', class_regexp_le)
+
         gb.setLayout(grid)
         self.layout.addWidget(gb)
 
@@ -138,6 +149,9 @@ def __init__(self, *args, **kwargs):
             'path_filter_regexp': '',
             'sample_id_from': 'Scan number',  # Experiment name, Path regexp,
             'sample_id_regexp': '',
+
+            'class_from': 'None',  # or 'Experiment (regexp)', 'Path (regexp)'
+            'class_regexp': '',
         })
 
         self.addConfigPanel(BrukerImportConfigPanel, 'Settings')