# Translation of data import from R

The data we are importing and formatting is for hyperspectral data and spectra (which we also fit). I translate several R functions here. This is preparation for running regression in Python (on a GPU, for faster runtimes than we've been able to get in R).

In [2]:
import tensorflow as tf

In [3]:
import spectral.io.envi as envi

In [4]:
import glob

In [5]:
import pandas as pd

In [6]:
import numpy as np

In [7]:
from rpy2.robjects import numpy2ri

In [49]:
files_list = glob.glob("/scratch2/NSF_GWAS/macroPhor_Array/T16_DEV_genes/EA/wk7/*.hdr")

In [18]:
spectral_library_path = "/scratch2/NSF_GWAS/GMOdetectoR/spectra_library/"

In [50]:
i = 2

In [51]:
file_path = files_list[i]

In [52]:
file_path

'/scratch2/NSF_GWAS/macroPhor_Array/T16_DEV_genes/EA/wk7/EA2_15.0_F1.9_L100_101751_7_1_6.hdr'

In [21]:
img = envi.open(file_path)

Header parameter names converted to lower case.




In [22]:
img.shape

(1571, 1419, 318)

In [23]:
type(img)

spectral.io.bilfile.BilFile

##### Get wavelengths

In [24]:
h = envi.read_envi_header(file_path)
wavelengths = h['wavelength']

Header parameter names converted to lower case.


##### Read in spectra

In [25]:
spectral_library = ['DsRed', 'ZsYellow', 'Chl', 'Diffraction']

In [26]:
j = 0

In [27]:
spectra_path = (spectral_library_path + spectral_library[j] + '.csv')

In [28]:
spectra_path

'/scratch2/NSF_GWAS/GMOdetectoR/spectra_library/DsRed.csv'

In [29]:
pub_emission_spectrum = pd.read_csv(spectra_path)

In [30]:
pub_emission_spectrum = pub_emission_spectrum.rename(columns={'emission wavelength (nm)': 'wavelength'})
pub_emission_spectrum = pub_emission_spectrum.rename(columns={'Normalized emission': 'intensity'})

In [35]:
#import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
#ts=robjects.r('ts')
stats=importr('stats')

In [36]:
import rpy2.robjects as ro
from rpy2.robjects.conversion import localconverter
from rpy2.robjects import pandas2ri

In [37]:
scales=importr('scales')

In [38]:
import rpy2
print(rpy2.__version__)

3.3.5


In [41]:
# Convert pandas object to something R-friendly https://rpy2.github.io/doc/latest/html/pandas.html
with localconverter(ro.default_converter + ro.pandas2ri.converter):
  r_from_pd_df = ro.conversion.py2rpy(pub_emission_spectrum) # If not in rpy2=2.9.4 may have error here due to no attribute 'py2rpy'

# Use rpy2 so we can use loess from R, as in old version, get same exact results
fit = stats.loess(ro.Formula('intensity~wavelength'), data=r_from_pd_df, span=0.1)

In [42]:
predict_from = {'wavelength':wavelengths}
predict_from = pd.DataFrame(predict_from)

with localconverter(ro.default_converter + ro.pandas2ri.converter):
  r_predict_from = ro.conversion.py2rpy(pub_emission_spectrum) # If not in rpy2=2.9.4 may have error here due to no attribute 'py2rpy'



In [99]:
predictions = stats.predict(fit, r_predict_from)

In [100]:
max(predictions)

0.9480948344687715

In [101]:
predictions = scales.rescale(predictions, center='FALSE') # by default, scales between 0 and 1.

In [102]:
scaled_emission_spectra = {'wavelength': wavelengths, 'intensity': predictions}
scaled_emission_spectra = pd.DataFrame(scaled_emission_spectra)

In [103]:
predictions

0,1,2,3,4,5,6
0.000201,0.000201,0.000201,...,0.000201,0.000201,0.000201


Why are these 0.000201 instead of 0 after scaling?

In [104]:
# print(predictions)

Because the 0 is in the middle. Will get rid of noise outside spectra with a threshold.

In [105]:
spectra_noise_threshold = float(0.001)

In [106]:
type(spectra_noise_threshold)

float

In [107]:
type(scaled_emission_spectra)

pandas.core.frame.DataFrame

In [109]:
scaled_emission_spectra.loc[scaled_emission_spectra['intensity'] < spectra_noise_threshold, 'intensity'] = 0

Did it work?

In [110]:
min(scaled_emission_spectra['intensity'])

0.0

Yes. Min is now 0 instead of 0.000201

In [111]:
scaled_emission_spectra

Unnamed: 0,wavelength,intensity
0,399.8645,0.0
1,401.0927,0.0
2,402.3212,0.0
3,403.5499,0.0
4,404.7787,0.0
...,...,...
313,794.6728,0.0
314,795.9675,0.0
315,797.2625,0.0
316,798.5575,0.0


##### Pack up into read_plot_spectra function equivalent to R version (but without integration of plot support yet)

In [39]:
def read_fit_spectra(spectra_path, wavelengths, plot=False, spectra_noise_threshold = 0.001):
    
    numpy2ri.activate()

    pub_emission_spectrum = pd.read_csv(spectra_path)
    pub_emission_spectrum = pub_emission_spectrum.rename(columns={'emission wavelength (nm)': 'wavelength'})
    pub_emission_spectrum = pub_emission_spectrum.rename(columns={'Normalized emission': 'intensity'})

    # Convert pandas object to something R-friendly https://rpy2.github.io/doc/latest/html/pandas.html
    with localconverter(ro.default_converter + ro.pandas2ri.converter):
        r_from_pd_df = ro.conversion.py2rpy(pub_emission_spectrum) # If not in rpy2=2.9.4 may have error here due to no attribute 'py2rpy'

    # Use rpy2 so we can use loess from R, as in old version, get same exact results
    # Fit
    fit = stats.loess(ro.Formula('intensity~wavelength'), data=r_from_pd_df, span=0.1)

    # Prepare to predict
    predict_from = {'wavelength':wavelengths}
    predict_from = pd.DataFrame(predict_from)

    with localconverter(ro.default_converter + ro.pandas2ri.converter):
        r_predict_from = ro.conversion.py2rpy(pub_emission_spectrum) # If not in rpy2>3.0.5 may have error here due to no attribute 'py2rpy'

    # Predict
    predictions = stats.predict(fit, r_predict_from)
    predictions = scales.rescale(predictions, center='FALSE') # by default, scales between 0 and 1.

    # Scale
    scaled_emission_spectra = {'wavelength': wavelengths, 'intensity': predictions}
    scaled_emission_spectra = pd.DataFrame(scaled_emission_spectra)

    # Denoise
    scaled_emission_spectra.loc[scaled_emission_spectra['intensity'] < spectra_noise_threshold, 'intensity'] = 0

    if plot == True:
        print('Plotting not supported here yet.')

    numpy2ri.deactivate()
        
    return(scaled_emission_spectra)

Test the function

In [40]:
scaled_emission_spectra_test = read_fit_spectra(spectra_path = spectra_path,
                                                wavelengths = wavelengths,
                                                plot = False,
                                                spectra_noise_threshold = 0.001)

In [120]:
scaled_emission_spectra_test

Unnamed: 0,wavelength,intensity
0,399.8645,0.0
1,401.0927,0.0
2,402.3212,0.0
3,403.5499,0.0
4,404.7787,0.0
...,...,...
313,794.6728,0.0
314,795.9675,0.0
315,797.2625,0.0
316,798.5575,0.0


##### Now for the whole X, built from intensity columns for each fluorophore constructed as above.

In [210]:
i = 0

In [211]:
import os

In [212]:
os.listdir()

['Untitled.ipynb', '.ipynb_checkpoints']

In [43]:
path = (spectral_library_path + str(spectral_library[i]) + ".csv")

spectra = read_fit_spectra(spectra_path = path,
                           wavelengths = wavelengths,
                           plot = False,
                           spectra_noise_threshold = 0.001)

In [47]:
type(wavelengths)

list

In [48]:
type(0.01)

float

In [46]:
wavelengths

['399.8645',
 '401.0927',
 '402.3212',
 '403.5499',
 '404.7787',
 '406.0079',
 '407.2372',
 '408.4667',
 '409.6964',
 '410.9263',
 '412.1565',
 '413.3869',
 '414.6175',
 '415.8483',
 '417.0793',
 '418.3105',
 '419.5419',
 '420.7736',
 '422.0054',
 '423.2375',
 '424.4698',
 '425.7023',
 '426.935',
 '428.1679',
 '429.401',
 '430.6343',
 '431.8679',
 '433.1017',
 '434.3357',
 '435.5699',
 '436.8043',
 '438.0389',
 '439.2737',
 '440.5087',
 '441.744',
 '442.9795',
 '444.2151',
 '445.451',
 '446.6871',
 '447.9235',
 '449.16',
 '450.3967',
 '451.6337',
 '452.8708',
 '454.1082',
 '455.3458',
 '456.5836',
 '457.8216',
 '459.0598',
 '460.2983',
 '461.5369',
 '462.7758',
 '464.0149',
 '465.2542',
 '466.4937',
 '467.7334',
 '468.9733',
 '470.2134',
 '471.4538',
 '472.6943',
 '473.9351',
 '475.1761',
 '476.4173',
 '477.6587',
 '478.9003',
 '480.1422',
 '481.3842',
 '482.6265',
 '483.869',
 '485.1116',
 '486.3546',
 '487.5977',
 '488.841',
 '490.0845',
 '491.3282',
 '492.5722',
 '493.8164',
 '495.0

In [44]:
type(spectra)

pandas.core.frame.DataFrame

In [45]:
spectra

Unnamed: 0,wavelength,intensity
0,399.8645,0.001076
1,401.0927,0.001076
2,402.3212,0.001076
3,403.5499,0.001076
4,404.7787,0.001076
...,...,...
313,794.6728,0.001076
314,795.9675,0.001076
315,797.2625,0.001076
316,798.5575,0.001076


In [214]:
intercept = 1

In [215]:
i

0

In [216]:
intercept

1

In [217]:
spectra['intensity']

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
313    0.0
314    0.0
315    0.0
316    0.0
317    0.0
Name: intensity, Length: 318, dtype: float64

In [222]:
if(i==0 and intercept==1):
    intercept_vector = [1] * len(wavelengths)
    mm = np.c_[(intercept_vector, np.asarray(spectra['intensity']))] # https://stackoverflow.com/questions/43961585/cbind-r-function-equivalent-in-numpy

In [205]:
intercept = 0

In [206]:
if(i==0 and intercept==0):
    mm = np.array(spectra['intensity'])

In [224]:
mm.shape

(318, 2)

In [225]:
i = 1

In [226]:
spectra['intensity']

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
313    0.0
314    0.0
315    0.0
316    0.0
317    0.0
Name: intensity, Length: 318, dtype: float64

In [228]:
if(i>0):
    mm = np.c_[mm, np.array(spectra['intensity'])] # https://stackoverflow.com/questions/43961585/cbind-r-function-equivalent-in-numpy

In [229]:
colnames = spectral_library.copy()

In [230]:
colnames

['DsRed', 'ZsYellow', 'Chl', 'Diffraction']

In [231]:
spectral_library

['DsRed', 'ZsYellow', 'Chl', 'Diffraction']

Did some reading and found out numpy array columns cannot be named unless you do something fancy by creating new class etc

Moving on, let's subset X to only desired wavelengths.

In [232]:
min_desired_wavelength = 545
max_desired_wavelength = 722

In [233]:
wavelengths = np.asarray(wavelengths)

In [234]:
# https://stackoverflow.com/questions/13869173/numpy-find-index-of-the-elements-within-range
wavelength_indices_desired = np.where(np.logical_and(wavelengths.astype(float)>=min_desired_wavelength,
                                                     wavelengths.astype(float)<=max_desired_wavelength))

In [235]:
mm.shape

(318, 3)

In [238]:
mm[wavelength_indices_desired,].shape

(1, 140, 3)

In [None]:
mm2 = mm[(mm["wavelength"] >= m 2) | (mm["Pclass"] == 3)]

##### Pack it into a function that loops over everything in the fluorophore library

In [254]:
def build_X(fluorophore_ID_vector, intercept, wavelengths,
            min_desired_wavelength, max_desired_wavelength):  
    
    for i in range(0, (len(fluorophore_ID_vector))):
        print(i)
        path = (spectral_library_path + str(fluorophore_ID_vector[i]) + ".csv")
        if(os.path.isfile(path) == False):
            raise NameError("Error: Spectrum for " + fluorophore_ID_vector[i] + " not found in spectra_library folder")

        spectra = read_fit_spectra(spectra_path = path,
                                   wavelengths = wavelengths,
                                   plot = False,
                                   spectra_noise_threshold = 0.001)

        if(i==0 and intercept==1):
            intercept_vector = [1] * len(wavelengths)
            mm = np.c_[(intercept_vector, np.asarray(spectra['intensity']))] # https://stackoverflow.com/questions/43961585/cbind-r-function-equivalent-in-numpy

        if(i==0 and intercept==0):
            mm = np.array(spectra['intensity'])

        if(i>0):
            mm = np.c_[mm, np.array(spectra['intensity'])] # https://stackoverflow.com/questions/43961585/cbind-r-function-equivalent-in-numpy

    wavelengths = np.asarray(wavelengths)
    
    # https://stackoverflow.com/questions/13869173/numpy-find-index-of-the-elements-within-range
    wavelength_indices_desired = np.where(np.logical_and(wavelengths.astype(float)>=min_desired_wavelength,
                                                         wavelengths.astype(float)<=max_desired_wavelength))

    return(mm[wavelength_indices_desired,])

In [255]:
build_X(fluorophore_ID_vector = spectral_library,
        intercept = 1,
        wavelengths = wavelengths,
        min_desired_wavelength = 545,
        max_desired_wavelength = 722)

0
1


ValueError: arrays must all be same length

##### Try again outside of the function...

In [249]:
fluorophore_ID_vector = spectral_library

In [251]:
for i in range(0, (len(fluorophore_ID_vector))):
    print(i)
    

0
1
2
3


In [256]:
i = 0

In [258]:
path = (spectral_library_path + str(fluorophore_ID_vector[i]) + ".csv")
if(os.path.isfile(path) == False):
    raise NameError("Error: Spectrum for " + fluorophore_ID_vector[i] + " not found in spectra_library folder")

spectra = read_fit_spectra(spectra_path = path,
                           wavelengths = wavelengths,
                           plot = False,
                           spectra_noise_threshold = 0.001)

In [259]:
if(i==0 and intercept==1):
    intercept_vector = [1] * len(wavelengths)
    mm = np.c_[(intercept_vector, np.asarray(spectra['intensity']))] # https://stackoverflow.com/questions/43961585/cbind-r-function-equivalent-in-numpy

if(i==0 and intercept==0):
    mm = np.array(spectra['intensity'])

if(i>0):
    mm = np.c_[mm, np.array(spectra['intensity'])] # https://stackoverflow.com/questions/43961585/cbind-r-function-equivalent-in-numpy

In [260]:
i = 1

In [261]:
path = (spectral_library_path + str(fluorophore_ID_vector[i]) + ".csv")
if(os.path.isfile(path) == False):
    raise NameError("Error: Spectrum for " + fluorophore_ID_vector[i] + " not found in spectra_library folder")

spectra = read_fit_spectra(spectra_path = path,
                           wavelengths = wavelengths,
                           plot = False,
                           spectra_noise_threshold = 0.001)

ValueError: arrays must all be same length

In [None]:
path = (spectral_library_path + str(fluorophore_ID_vector[i]) + ".csv")

In [262]:
path

'/scratch2/NSF_GWAS/GMOdetectoR/spectra_library/ZsYellow.csv'

In [263]:
if(os.path.isfile(path) == False):
    raise NameError("Error: Spectrum for " + fluorophore_ID_vector[i] + " not found in spectra_library folder")

In [265]:
spectra = read_fit_spectra(spectra_path = path,
                           wavelengths = wavelengths,
                           plot = False,
                           spectra_noise_threshold = 0.001)

ValueError: arrays must all be same length

##### Why doesn't read_fit_spectra work for ZsYellow?

Work through step-by-step.

Is the problem that DsRed library file has values for all wavelengths but ZsYellow file doesn't? This is easy to fix if so...

In [282]:
DsRed_spec = pd.read_csv('/scratch2/NSF_GWAS/GMOdetectoR/spectra_library/DsRed.csv')
ZsYellow_spec = pd.read_csv('/scratch2/NSF_GWAS/GMOdetectoR/spectra_library/ZsYellow.csv')

In [283]:
DsRed_spec.shape

(318, 2)

In [284]:
ZsYellow_spec.shape

(207, 2)

Yes, that does seem to be the problem. Why is it happening with rpy2 but no tin R version? Work in both RStudio and Jupyter alongside step-by-step to see where it becoems 207 or 318 in each.

That method of debugging didn't work since the R function is fed a subset of wavelengths, where here we use all wavelengths. Should do some type of merge instead.

In [299]:
pub_emission_spectrum = pd.read_csv('/scratch2/NSF_GWAS/GMOdetectoR/spectra_library/ZsYellow.csv')
pub_emission_spectrum = pub_emission_spectrum.rename(columns={'emission wavelength (nm)': 'wavelength'})
pub_emission_spectrum = pub_emission_spectrum.rename(columns={'Normalized emission': 'intensity'})

# Convert pandas object to something R-friendly https://rpy2.github.io/doc/latest/html/pandas.html
with localconverter(ro.default_converter + ro.pandas2ri.converter):
    r_from_pd_df = ro.conversion.py2rpy(pub_emission_spectrum) # If not in rpy2=2.9.4 may have error here due to no attribute 'py2rpy'

# Use rpy2 so we can use loess from R, as in old version, get same exact results
# Fit
fit = stats.loess(ro.Formula('intensity~wavelength'), data=r_from_pd_df, span=0.1)

In [300]:
# Prepare to predict
predict_from = {'wavelength':wavelengths}
predict_from = pd.DataFrame(predict_from)

In [301]:
predict_from

Unnamed: 0,wavelength
0,399.8645
1,401.0927
2,402.3212
3,403.5499
4,404.7787
...,...
313,794.6728
314,795.9675
315,797.2625
316,798.5575


In [323]:
numpy2ri.activate()

In [320]:
with localconverter(ro.default_converter + ro.pandas2ri.converter):
    r_predict_from = ro.conversion.py2rpy(wavelengths) # If not in rpy2>3.0.5 may have error here due to no attribute 'py2rpy'

NotImplementedError: Conversion 'py2rpy' not defined for objects of type '<class 'numpy.ndarray'>'

In [324]:
# Predict
predictions = stats.predict(fit, wavelengths)

In [325]:
len(predictions)

318

##### Solved. The problem was that I was predicting from original wavelengths instead of desired ones (translation error)

In [338]:
def build_X(fluorophore_ID_vector, intercept, wavelengths,
            min_desired_wavelength, max_desired_wavelength):  
    
    for i in range(0, (len(fluorophore_ID_vector))):
        print(i)
        path = (spectral_library_path + str(fluorophore_ID_vector[i]) + ".csv")
        if(os.path.isfile(path) == False):
            raise NameError("Error: Spectrum for " + fluorophore_ID_vector[i] + " not found in spectra_library folder")

        spectra = read_fit_spectra(spectra_path = path,
                                   wavelengths = wavelengths,
                                   plot = False,
                                   spectra_noise_threshold = 0.01)

        if(i==0 and intercept==1):
            intercept_vector = [1] * len(wavelengths)
            mm = np.c_[(intercept_vector, np.asarray(spectra['intensity']))] # https://stackoverflow.com/questions/43961585/cbind-r-function-equivalent-in-numpy

        if(i==0 and intercept==0):
            mm = np.array(spectra['intensity'])

        if(i>0):
            mm = np.c_[mm, np.array(spectra['intensity'])] # https://stackoverflow.com/questions/43961585/cbind-r-function-equivalent-in-numpy

    wavelengths = np.asarray(wavelengths)
    
    # https://stackoverflow.com/questions/13869173/numpy-find-index-of-the-elements-within-range
    wavelength_indices_desired = np.where(np.logical_and(wavelengths.astype(float)>=min_desired_wavelength,
                                                         wavelengths.astype(float)<=max_desired_wavelength))

    return(mm[wavelength_indices_desired,])

In [339]:
def read_fit_spectra(spectra_path, wavelengths, plot=False, spectra_noise_threshold = 0.01):
    
    numpy2ri.activate()

    pub_emission_spectrum = pd.read_csv(spectra_path)
    pub_emission_spectrum = pub_emission_spectrum.rename(columns={'emission wavelength (nm)': 'wavelength'})
    pub_emission_spectrum = pub_emission_spectrum.rename(columns={'Normalized emission': 'intensity'})

    # Convert pandas object to something R-friendly https://rpy2.github.io/doc/latest/html/pandas.html
    with localconverter(ro.default_converter + ro.pandas2ri.converter):
        r_from_pd_df = ro.conversion.py2rpy(pub_emission_spectrum) # If not in rpy2=2.9.4 may have error here due to no attribute 'py2rpy'

    # Use rpy2 so we can use loess from R, as in old version, get same exact results
    # Fit
    fit = stats.loess(ro.Formula('intensity~wavelength'), data=r_from_pd_df, span=0.1)

    # Predict
    predictions = stats.predict(fit, wavelengths)
    predictions = scales.rescale(predictions, center='FALSE') # by default, scales between 0 and 1.

    # Scale
    scaled_emission_spectra = {'wavelength': wavelengths, 'intensity': predictions}
    scaled_emission_spectra = pd.DataFrame(scaled_emission_spectra)

    # Denoise
    print(scaled_emission_spectra.head())
    print(spectra_noise_threshold)
    scaled_emission_spectra.loc[scaled_emission_spectra['intensity'] < spectra_noise_threshold, 'intensity'] = 0
    print(scaled_emission_spectra.head())
    if plot == True:
        print('Plotting not supported here yet.')

    numpy2ri.deactivate()
        
    return(scaled_emission_spectra)

In [343]:
build_X(fluorophore_ID_vector = spectral_library,
        intercept = 1,
        wavelengths = wavelengths,
        min_desired_wavelength = 445,
        max_desired_wavelength = 725)

0
  wavelength  intensity
0   399.8645   0.000201
1   401.0927   0.000201
2   402.3212   0.000201
3   403.5499   0.000201
4   404.7787   0.000201
0.01
  wavelength  intensity
0   399.8645        0.0
1   401.0927        0.0
2   402.3212        0.0
3   403.5499        0.0
4   404.7787        0.0
1
  wavelength  intensity
0   399.8645        NaN
1   401.0927        NaN
2   402.3212        NaN
3   403.5499        NaN
4   404.7787   0.000706
0.01
  wavelength  intensity
0   399.8645        NaN
1   401.0927        NaN
2   402.3212        NaN
3   403.5499        NaN
4   404.7787        0.0
2
  wavelength  intensity
0   399.8645   0.001076
1   401.0927   0.001076
2   402.3212   0.001076
3   403.5499   0.001076
4   404.7787   0.001076
0.01
  wavelength  intensity
0   399.8645        0.0
1   401.0927        0.0
2   402.3212        0.0
3   403.5499        0.0
4   404.7787        0.0
3
  wavelength  intensity
0   399.8645   0.008441
1   401.0927   0.008441
2   402.3212   0.008441
3   403.5499   0.

array([[[1.        , 0.        , 0.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        , 0.        , 0.        ],
        ...,
        [1.        , 0.        ,        nan, 0.0132177 , 0.        ],
        [1.        , 0.        ,        nan, 0.01092884, 0.        ],
        [1.        , 0.        ,        nan, 0.        , 0.        ]]])