In [81]:
import pandas as pd
import ramanspy as rp
import numpy as np
import matplotlib.pyplot as plt

# Load ILSdata.csv
file_path = "../data/dataset/ILSdata.csv"
ils_data = pd.read_csv(file_path)
data = pd.read_csv(file_path)

# Inspect the dataset
print("ILS Data Columns:", ils_data.columns)
print("ILS Data Shape:", ils_data.shape)

ils_target = data['substrate'].values

ILS Data Columns: Index(['labcode', 'substrate', 'laser', 'method', 'sample', 'type', 'conc',
       'batch', 'replica', '400',
       ...
       '1972', '1975', '1978', '1981', '1984', '1987', '1990', '1993', '1996',
       '1999'],
      dtype='object', length=543)
ILS Data Shape: (3516, 543)


In [72]:
import pandas as pd

# Load the ILS dataset
file_path = "../data/dataset/ILSdata.csv"
data = pd.read_csv(file_path)

# Separate metadata and spectra
metadata_columns = ['labcode', 'substrate', 'laser', 'method', 'sample', 'type', 'conc', 'batch', 'replica']
spectra_data = data.drop(columns=metadata_columns)

# Print the shape of spectra_data
print("Shape of spectra_data:", spectra_data.shape)

Shape of spectra_data: (3516, 534)


In [73]:
# Check the first few rows and columns
print("First few rows and columns of spectra_data:")
print(spectra_data.iloc[:5, :10])  # First 5 rows and first 10 columns

# Check for missing values
print("Number of missing values in spectra_data:", spectra_data.isna().sum().sum())

# Check the data types
print("Data types of spectra_data columns:")
print(spectra_data.dtypes)

First few rows and columns of spectra_data:
       400      403      406      409      412      415      418      421  \
0  66533.0  66322.0  66170.0  66073.0  66129.0  66154.0  65938.0  65752.0   
1  95228.0  95066.0  94977.0  94967.0  94976.0  94761.0  94734.0  94846.0   
2  80044.0  80204.0  80182.0  79995.0  79751.0  79644.0  79693.0  79643.0   
3  77645.0  77972.0  77634.0  76632.0  75875.0  75897.0  75800.0  75655.0   
4  85099.0  84810.0  85071.0  86000.0  86656.0  85530.0  84496.0  84636.0   

       424      427  
0  65636.0  65475.0  
1  94574.0  94260.0  
2  79324.0  79090.0  
3  75470.0  75443.0  
4  84639.0  84617.0  
Number of missing values in spectra_data: 27180
Data types of spectra_data columns:
400     float64
403     float64
406     float64
409     float64
412     float64
         ...   
1987    float64
1990    float64
1993    float64
1996    float64
1999    float64
Length: 534, dtype: object


In [74]:
import re

# Extract numeric part from column names
original_wavelengths_ils = spectra_data.columns.to_series().apply(lambda x: float(re.findall(r'\d+', x)[0]))

# Print the extracted wavelengths
print("Extracted wavelengths for ILS dataset:", original_wavelengths_ils)
print("Minimum wavelength:", original_wavelengths_ils.min())
print("Maximum wavelength:", original_wavelengths_ils.max())

# Convert spectra to a NumPy array
spectra_ils = spectra_data.to_numpy()

# Print the shape of the NumPy array
print("Shape of spectra_ils:", spectra_ils.shape)

Extracted wavelengths for ILS dataset: 400      400.0
403      403.0
406      406.0
409      409.0
412      412.0
         ...  
1987    1987.0
1990    1990.0
1993    1993.0
1996    1996.0
1999    1999.0
Length: 534, dtype: float64
Minimum wavelength: 400.0
Maximum wavelength: 1999.0
Shape of spectra_ils: (3516, 534)


In [75]:
# Load training and testing datasets
X_train, y_train = rp.datasets.bacteria("train", folder="../data/bacteria/")
X_test, y_test = rp.datasets.bacteria("test", folder="../data/bacteria/")
X_val, y_val = rp.datasets.bacteria("val", folder="../data/bacteria/")

# Convert to numpy arrays for randomization and PyTorch compatibility
X_train_bacteria = np.array(X_train.spectral_data)
y_train_bacteria = np.array(y_train)
X_test_bacteria = np.array(X_test.spectral_data)
y_test_bacteria = np.array(y_test)
X_val_bacteria = np.array(X_val.spectral_data)
y_val_bacteria = np.array(y_val)

# Combine bacteria datasets
X_bacteria = np.vstack([X_train_bacteria, X_test_bacteria, X_val_bacteria])

# Check the shape
print("Bacteria Data Shape:", X_bacteria.shape)

Bacteria Data Shape: (66000, 1000)


In [76]:
print("Type of spectra_wheat:", type(spectra_wheat))
print("Available attributes and methods:", dir(spectra_wheat))

spectra_wheat_data = spectra_wheat.spectral_data

print("Shape of spectra_wheat_data:", spectra_wheat_data.shape)

Type of spectra_wheat: <class 'ramanspy.core.SpectralContainer'>
Available attributes and methods: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'band', 'flat', 'from_stack', 'load', 'mean', 'save', 'shape', 'spectral_axis', 'spectral_data', 'spectral_length', 'tolist']
Shape of spectra_wheat_data: (53134, 1748)


In [85]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical

# Load the dataset
df_wheats = rp.datasets.wheat_lines(file="../wheats/Data.mat")
spectra, labels, label_names = df_wheats


# Encode the labels into integers
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Convert the encoded labels to one-hot encoding
num_classes = len(label_encoder.classes_)
labels_one_hot = to_categorical(labels_encoded, num_classes=num_classes)

In [87]:
labels_one_hot [0]


array([1., 0., 0., 0.])

In [77]:
print("ILS Data Shape:", spectra_ils.shape)
print("Bacteria Data Shape:", X_bacteria.shape)
print("Wheat Data Shape:", spectra_wheat.shape)

ILS Data Shape: (3516, 534)
Bacteria Data Shape: (66000, 1000)
Wheat Data Shape: (53134,)


In [78]:
print("Bacteria Data Shape:", X_bacteria.shape)

Bacteria Data Shape: (66000, 1000)


In [79]:
import numpy as np

# Define metadata_columns (as you provided)
metadata_columns = ['labcode', 'substrate', 'laser', 'method', 'sample', 'type', 'conc', 'batch', 'replica']

# Extract spectral data from ILS dataset
spectra_ils = data.drop(columns=metadata_columns).to_numpy()

# Extract spectral data from wheat dataset
spectra_wheat = spectra_wheat.spectral_data

# Combine all spectral data
X_combined = np.vstack([spectra_ils, X_bacteria, spectra_wheat])

# Print the combined shape
print("Combined Spectral Data Shape:", X_combined.shape)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 534 and the array at index 1 has size 1000

In [35]:
# Assuming the wavelength values are stored in a separate array or file
original_wavelengths_bacteria = np.linspace(250, 2000, X_bacteria.shape[1])  # Example: 250 to 2000 cm⁻¹

In [40]:
# Load wavenumbers for the bacteria dataset
original_wavelengths_bacteria = np.load("../data/bacteria/wavenumbers.npy")

# Check the shape and values
print("Bacteria Wavenumbers Shape:", original_wavelengths_bacteria.shape)
print("Bacteria Wavenumbers:", original_wavelengths_bacteria)

Bacteria Wavenumbers Shape: (1000,)
Bacteria Wavenumbers: [1792.4  1791.2  1789.9  1788.6  1787.3  1786.   1784.8  1783.5  1782.2
 1780.9  1779.6  1778.3  1777.1  1775.8  1774.5  1773.2  1771.9  1770.6
 1769.4  1768.1  1766.8  1765.5  1764.2  1762.9  1761.7  1760.4  1759.1
 1757.8  1756.5  1755.2  1753.9  1752.6  1751.3  1750.1  1748.8  1747.5
 1746.2  1744.9  1743.6  1742.3  1741.   1739.8  1738.5  1737.2  1735.9
 1734.6  1733.3  1732.   1730.7  1729.4  1728.1  1726.8  1725.5  1724.3
 1723.   1721.7  1720.4  1719.1  1717.8  1716.5  1715.2  1713.9  1712.6
 1711.3  1710.   1708.7  1707.4  1706.1  1704.8  1703.5  1702.2  1701.
 1699.7  1698.3  1697.   1695.8  1694.5  1693.2  1691.9  1690.6  1689.3
 1688.   1686.7  1685.4  1684.1  1682.8  1681.5  1680.2  1678.9  1677.5
 1676.2  1675.   1673.7  1672.3  1671.   1669.7  1668.4  1667.1  1665.8
 1664.5  1663.2  1661.9  1660.6  1659.3  1658.   1656.7  1655.4  1654.1
 1652.8  1651.5  1650.2  1648.9  1647.5  1646.2  1644.9  1643.6  1642.3
 1641. 

In [None]:
import re

# Extract numeric part from column names
original_wavelengths_ils = spectra_data.columns.to_series().apply(lambda x: float(re.findall(r'\d+', x)[0]))

In [None]:
original_wavelengths_ils = spectra_data.columns.astype(float)

In [54]:
original_wavelengths_ils = spectra_data.columns.to_series().apply(lambda x: float(re.findall(r'\d+', x)[0]))

In [65]:
# Regenerate original_wavelengths_wheat based on the number of columns in spectra_wheat_data
num_wavelengths = spectra_wheat_data.shape[1]
original_wavelengths_wheat = np.linspace(250, 2000, num_wavelengths)  # Example range for Raman spectra

In [66]:
print("Shape of original_wavelengths_wheat:", original_wavelengths_wheat.shape)
print("Shape of spectra_wheat_data:", spectra_wheat_data.shape)

Shape of original_wavelengths_wheat: (1748,)
Shape of spectra_wheat_data: (53134, 1748)


In [67]:
# Resize the wheat dataset
spectra_wheat_resized = resize_spectra(spectra_wheat_data, original_wavelengths_wheat, target_size)

# Print the resized shape
print("Resized Wheat Data Shape:", spectra_wheat_resized.shape)

Resized Wheat Data Shape: (53134, 1748)


In [69]:
import numpy as np
from scipy.interpolate import interp1d
import re

# Define target size (e.g., size of the largest dataset)
target_size = 1748

# Function to resize spectra
def resize_spectra(spectra, original_wavelengths, target_size):
    resized_spectra = []
    for spectrum in spectra:
        interp_func = interp1d(original_wavelengths, spectrum, kind='linear', fill_value="extrapolate")
        new_wavelengths = np.linspace(original_wavelengths.min(), original_wavelengths.max(), target_size)
        resized_spectrum = interp_func(new_wavelengths)
        resized_spectra.append(resized_spectrum)
    return np.array(resized_spectra)

# Resize the ILS dataset
original_wavelengths_ils = spectra_data.columns.to_series().apply(lambda x: float(re.findall(r'\d+', x)[0]))
spectra_ils_resized = resize_spectra(spectra_ils, original_wavelengths_ils, target_size)
print("Resized ILS Data Shape:", spectra_ils_resized.shape)

# Resize the bacteria dataset
original_wavelengths_bacteria = np.load("../data/bacteria/wavenumbers.npy")
X_bacteria_resized = resize_spectra(X_bacteria, original_wavelengths_bacteria, target_size)
print("Resized Bacteria Data Shape:", X_bacteria_resized.shape)

# Resize the wheat dataset (already done)
print("Resized Wheat Data Shape:", spectra_wheat_resized.shape)

# Combine resized spectral data
X_combined_resized = np.vstack([spectra_ils_resized, X_bacteria_resized, spectra_wheat_resized])

# Print the combined resized shape
print("Combined Resized Spectral Data Shape:", X_combined_resized.shape)

Resized ILS Data Shape: (3516, 1748)
Resized Bacteria Data Shape: (66000, 1748)
Resized Wheat Data Shape: (53134, 1748)
Combined Resized Spectral Data Shape: (122650, 1748)


In [88]:
# Combine labels
y_combined = np.hstack([ils_target, y_bacteria, labels_one_hot])

# Print the combined labels shape
print("Combined Labels Shape:", y_combined.shape)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 2 has 2 dimension(s)