Import necessary libraries

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

Load the FTIR data

In [None]:
# Location of the FTIR table on disk. Update this to point at your own CSV file.
# NOTE(review): the path below ends in "\csv" with no file extension, so it looks
# like a folder rather than a file — confirm and append the actual file name.
file_path = r"C:\Users\Abdul\OneDrive - Chulalongkorn University\fadhli nitip\csv"  # Update with your file path

# Fail early with an actionable message instead of an opaque pandas/OS error
# when the path is missing or refers to a directory.
if not Path(file_path).is_file():
    raise FileNotFoundError(
        f"FTIR CSV file not found at {file_path!r}. "
        "Set file_path to the full path of the .csv file (not its folder)."
    )

ftir_data = pd.read_csv(file_path)

Preprocessing - Standardize the data (zero mean and unit variance for each column)

In [None]:
# Standardize the table column by column: StandardScaler removes each column's
# mean and divides by its standard deviation.
# NOTE(review): assumes every column of ftir_data is numeric and that each row
# is one spectrum — confirm against the CSV layout.
ftir_values = ftir_data.values
scaler = StandardScaler()
scaled_data = scaler.fit_transform(ftir_values)

Preprocessing - Smooth each row with a Savitzky-Golay filter to reduce noise

In [None]:
# Savitzky-Golay smoothing parameters: an 11-point window fitted with a
# quadratic polynomial. The filter runs along axis 1, i.e. across the columns
# of every row, so each row needs at least SAVGOL_WINDOW points.
SAVGOL_WINDOW = 11
SAVGOL_POLYORDER = 2

smoothed_data = savgol_filter(
    scaled_data,
    SAVGOL_WINDOW,
    SAVGOL_POLYORDER,
    axis=1,
)

Baseline correction (simple method: subtract the minimum value in each spectrum)

In [None]:
# Simple baseline correction: shift every row so that its smallest value is 0.
# keepdims=True keeps the minima as a column vector so the subtraction
# broadcasts row-wise.
row_minima = smoothed_data.min(axis=1, keepdims=True)
baseline_corrected_data = smoothed_data - row_minima

PCA for dimensionality reduction

In [None]:
# Project the corrected data onto its first two principal components so the
# samples can be clustered and plotted in 2-D.
N_PCA_COMPONENTS = 2

pca = PCA(n_components=N_PCA_COMPONENTS)
pca_data = pca.fit_transform(baseline_corrected_data)

Clustering - Determine the optimal number of clusters using the elbow method

In [None]:
# Elbow method: fit K-means for k = 1..10 and record the inertia
# (within-cluster sum of squared distances) of each fit.

# KMeans raises a ValueError when n_clusters exceeds the number of samples,
# so cap the sweep for small data sets.
max_clusters = min(10, len(pca_data))

inertia = []
for n_clusters in range(1, max_clusters + 1):
    # n_init is set explicitly because its default differs between
    # scikit-learn releases (10 vs. 'auto'); pinning it keeps the curve
    # reproducible across versions and avoids the FutureWarning.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(pca_data)
    inertia.append(kmeans.inertia_)

Plot the elbow curve

In [None]:
# Plot inertia against the number of clusters. The x values are derived from
# the length of `inertia` rather than a hard-coded range, so the plot stays in
# sync with however many cluster counts were actually evaluated.
cluster_counts = range(1, len(inertia) + 1)

fig, ax = plt.subplots()
ax.plot(cluster_counts, inertia, marker='o')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal Number of Clusters')
plt.show()

Clustering - Apply K-means with the optimal number of clusters (assume 3 based on elbow plot)

In [None]:
# Number of clusters chosen by reading the elbow plot; adjust if the elbow
# for your data sits elsewhere.
optimal_clusters = 3

# n_init is set explicitly because its default differs between scikit-learn
# releases (10 vs. 'auto'); pinning it keeps the labels reproducible across
# versions. random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(pca_data)

Validation - Calculate silhouette score

In [None]:
# Mean silhouette coefficient over all samples, computed in the 2-D PCA space
# that was used for clustering.
sil_score = silhouette_score(
    X=pca_data,
    labels=cluster_labels,
)
print(f'Silhouette Score: {sil_score}')

Visualization - Plot the clusters

In [None]:
# Scatter the samples in PC1/PC2 space, one colour per K-means cluster.
fig, ax = plt.subplots()

for cluster in range(optimal_clusters):
    # Rows of the PCA projection assigned to this cluster.
    cluster_data = pca_data[cluster_labels == cluster]
    ax.scatter(
        cluster_data[:, 0],
        cluster_data[:, 1],
        label=f'Cluster {cluster + 1}',  # 1-based label for the legend
    )

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA of FTIR Data with K-means Clustering')
ax.legend()
plt.show()