In [7]:
import pandas as pd
import ramanspy as rp
import numpy as np
import matplotlib.pyplot as plt

import numpy as np
import glob

# Glob pattern matching every spectrum text file in the dataset folder
folder_path = "../data/dataset/*.txt"


def load_intensity_columns(pattern):
    """Load the second column of every text file matching ``pattern``.

    Each file is parsed with ``np.loadtxt`` and its column at index 1
    (the y-values / intensity) is kept.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the spectrum files.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array per matched file, ordered by sorted file path.
        The list is empty when no file matches.
    """
    columns = []
    # glob.glob returns paths in a filesystem-dependent order; sorting makes
    # row i of the resulting array refer to the same file on every run.
    for path in sorted(glob.glob(pattern)):
        # ndmin=2 keeps a single-row file two-dimensional so [:, 1] still works.
        table = np.loadtxt(path, ndmin=2)
        columns.append(table[:, 1])
    return columns


# Read all text files
spectra_data = load_intensity_columns(folder_path)

if not spectra_data:
    # Report the real problem instead of failing later on shape[1].
    print(f"Warning: no files matched {folder_path!r}")
    spectra_array = np.empty((0, 0))
else:
    distinct_lengths = sorted({len(s) for s in spectra_data})
    if len(distinct_lengths) > 1:
        # A ragged list cannot form a rectangular 2D array; say so explicitly.
        raise ValueError(
            f"Spectra lengths vary ({distinct_lengths[0]} to {distinct_lengths[-1]}); "
            "trim or pad them before stacking."
        )
    spectra_array = np.array(spectra_data)  # Convert list to 2D NumPy array

print(f"Loaded {len(spectra_array)} spectra, each with {spectra_array.shape[1]} data points.")


In [8]:
import numpy as np
import glob

# Glob pattern matching every spectrum text file in the dataset folder
folder_path = "../data/dataset/*.txt"


def trim_to_shortest(spectra):
    """Cut every 1-D spectrum down to the length of the shortest one.

    Parameters
    ----------
    spectra : list of array-like
        The individual spectra; may be empty.

    Returns
    -------
    list
        The spectra, each sliced to the common minimum length. Spectra that
        already share one length come back unchanged in content; an empty
        input yields an empty list.
    """
    if not spectra:
        return []
    shortest = min(len(s) for s in spectra)
    return [s[:shortest] for s in spectra]


spectra_data = []
lengths = []

# glob.glob returns paths in a filesystem-dependent order; sorting keeps the
# row order of spectra_array identical from run to run.
for file in sorted(glob.glob(folder_path)):
    # ndmin=2 keeps a single-row file two-dimensional so [:, 1] still works.
    data = np.loadtxt(file, ndmin=2)  # Load text file as a NumPy array
    y_values = data[:, 1]  # Extract y-values (intensity)
    spectra_data.append(y_values)
    lengths.append(len(y_values))

# Check if all spectra have the same length (default=0 covers "no files found")
min_length = min(lengths, default=0)
max_length = max(lengths, default=0)

if not spectra_data:
    print(f"Warning: no files matched {folder_path!r}")
elif min_length != max_length:
    print(f"Warning: Spectra lengths vary! Min: {min_length}, Max: {max_length}")

    # Option 1: Trim all spectra to the shortest length
    # NOTE(review): trimming by index assumes every file starts at the same
    # x-value with the same spacing - confirm against the first column.
    spectra_data = trim_to_shortest(spectra_data)

    # Option 2: Pad shorter spectra (uncomment if preferred)
    # spectra_data = [np.pad(s, (0, max_length - len(s)), mode='constant') for s in spectra_data]

# Convert to 2D NumPy array; an empty list becomes an explicit (0, 0) array so
# that shape[1] below is always defined.
spectra_array = np.array(spectra_data) if spectra_data else np.empty((0, 0))

print(f"Loaded {len(spectra_array)} spectra, each with {spectra_array.shape[1]} data points.")


Loaded 3516 spectra, each with 568 data points.


Load the spectra again, this time from the CSV file (`ILSdata.csv`) instead of the individual text files.

In [9]:
import pandas as pd
import numpy as np

# Read the whole CSV into a DataFrame
file_path = "../data/dataset/ILSdata.csv"
data = pd.read_csv(file_path)

# Metadata columns; every other column is treated as part of the spectrum
metadata_columns = ['labcode', 'substrate', 'laser', 'method', 'sample', 'type', 'conc', 'batch', 'replica']

# Drop the metadata, then coerce what is left to numbers
# (values that cannot be parsed become NaN).
spectra_data = (
    data
    .drop(columns=metadata_columns)
    .apply(pd.to_numeric, errors='coerce')
)

# One row per spectrum, one column per spectral point
spectra_array = spectra_data.to_numpy()

n_spectra, n_points = spectra_array.shape
print(f"Loaded {n_spectra} spectra, each with {n_points} data points.")


Loaded 3516 spectra, each with 534 data points.


In [10]:
import pandas as pd

# Load the CSV file
file_path = "../data/dataset/ILSdata.csv"
data = pd.read_csv(file_path)

# Print the first few column names
print("First few column names:", data.columns[:10])

# Extract all spectral column names (wavenumbers).
# The metadata columns are removed by name rather than by position, so the
# result stays correct if they are not exactly the first nine columns; a
# missing metadata column raises KeyError instead of silently shifting labels.
metadata_columns = ['labcode', 'substrate', 'laser', 'method', 'sample', 'type', 'conc', 'batch', 'replica']
wavenumbers = data.columns.drop(metadata_columns)
print("Wavenumber range:", wavenumbers[0], "to", wavenumbers[-1])


First few column names: Index(['labcode', 'substrate', 'laser', 'method', 'sample', 'type', 'conc',
       'batch', 'replica', '400'],
      dtype='object')
Wavenumber range: 400 to 1999


In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Train a small dense network that predicts the 'substrate' label from a spectrum.

# Load the CSV file
file_path = "../data/dataset/ILSdata.csv"
data = pd.read_csv(file_path)

# Separate metadata and spectra
metadata_columns = ['labcode', 'substrate', 'laser', 'method', 'sample', 'type', 'conc', 'batch', 'replica']
spectra_data = data.drop(columns=metadata_columns)  # Remove metadata columns to get spectra
spectra_data = spectra_data.apply(pd.to_numeric, errors='coerce')  # Convert to numeric, with NAs where necessary

# Convert spectra to a NumPy array
spectra_array = spectra_data.to_numpy()

# Replace NaN values with 0 (np.nan_to_num also maps +/-inf to large finite values)
spectra_array = np.nan_to_num(spectra_array, nan=0.0)

# Extract the target variable (substrate)
target = data['substrate'].values

# Encode the substrate labels into integers
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Convert the encoded labels to one-hot encoding
num_classes = len(label_encoder.classes_)
target_one_hot = to_categorical(target_encoded, num_classes=num_classes)

# Split the data into training (80%), validation (10%), and test (10%) sets.
# The second call halves the 20% hold-out into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(spectra_array, target_one_hot, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize the data. The scaler is fitted on the training split only, so
# no statistics from the validation/test splits leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Build the neural network model
model = Sequential()

# Explicit input specification. Passing input_dim= to the first Dense layer
# is deprecated in current Keras and emits a UserWarning.
model.add(Input(shape=(X_train.shape[1],)))

# Hidden layers
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))  # Dropout for regularization
model.add(Dense(32, activation='relu'))

# Output layer (for multi-class classification)
model.add(Dense(num_classes, activation='softmax'))

# Compile the model with Adam (learning_rate=0.001 is the Keras default)
# and per-element gradient clipping at 1.0
model.compile(optimizer=Adam(learning_rate=0.001, clipvalue=1.0), loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    verbose=1
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Make predictions on the test set
predictions = model.predict(X_test)
predicted_labels = np.argmax(predictions, axis=1)  # Convert probabilities to class labels

# Decode the predicted and the true labels back to substrate names
predicted_substrates = label_encoder.inverse_transform(predicted_labels)
actual_substrates = label_encoder.inverse_transform(np.argmax(y_test, axis=1))

# Compare the first few predictions with the actual values. Slicing, unlike
# range(5), cannot run past the end of a test set with fewer than five rows.
for predicted, actual in zip(predicted_substrates[:5], actual_substrates[:5]):
    print(f"Predicted: {predicted}, Actual: {actual}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6344 - loss: 1.1096 - val_accuracy: 0.7869 - val_loss: 0.7260
Epoch 2/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7278 - loss: 0.8078 - val_accuracy: 0.7812 - val_loss: 0.6567
Epoch 3/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7772 - loss: 0.6657 - val_accuracy: 0.8267 - val_loss: 0.5566
Epoch 4/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7793 - loss: 0.6224 - val_accuracy: 0.8267 - val_loss: 0.5001
Epoch 5/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8038 - loss: 0.5361 - val_accuracy: 0.8040 - val_loss: 0.5263
Epoch 6/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8054 - loss: 0.5603 - val_accuracy: 0.8239 - val_loss: 0.5018
Epoch 7/50
[1m88/88[0m [32m━━━━━━━━━━