In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.signal import savgol_filter
from sklearn.ensemble import IsolationForest

# Read the source CSV into the starting frame
df_0 = pd.read_csv(r"C:\Users\pingk\Downloads\fadhli nitip\asik.csv")

# Split the column labels by dtype: numeric columns vs. everything else
numeric_cols = df_0.select_dtypes(include="number").columns.to_list()
string_cols = df_0.select_dtypes(exclude="number").columns.to_list()


In [14]:
# Baseline Correction (Example using polynomial fitting)
def baseline_correction(df, numeric_cols, degree=2):
    """Subtract a least-squares polynomial baseline from each listed column.

    For every column in ``numeric_cols`` a polynomial of order ``degree`` is
    fitted to the column's values against their row position (0, 1, 2, ...)
    and the fitted curve is subtracted, leaving the residual.

    The input frame is not modified; a corrected copy is returned. Writing
    into ``df`` directly would overwrite the caller's data and make a second
    call (e.g. re-running a notebook cell) correct already-corrected values.

    NOTE(review): the fit runs down each column, i.e. across rows. If each row
    is one spectrum and the columns are spectral positions, a per-spectrum
    baseline would need the fit along each row instead - confirm the layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to correct.
    numeric_cols : iterable
        Labels of the columns to correct; other columns are copied unchanged.
    degree : int, default 2
        Order of the fitted polynomial.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns replaced by their residuals.
    """
    corrected = df.copy()
    # Row positions are the same for every column, so build them once.
    positions = np.arange(len(corrected))
    for col in numeric_cols:
        coeffs = np.polyfit(positions, corrected[col], degree)
        corrected[col] = corrected[col] - np.polyval(coeffs, positions)
    return corrected

# Smoothing (Example using Savitzky-Golay filter)
def smooth_spectra(df, numeric_cols, window_length=11, polyorder=2):
    """Apply a Savitzky-Golay filter to each listed column.

    Each column in ``numeric_cols`` is passed through
    ``scipy.signal.savgol_filter`` with the given window and polynomial
    order, and the filtered values replace the column in the result.

    The input frame is not modified; a smoothed copy is returned, so calling
    the function again on the same input gives the same output.

    NOTE(review): the filter runs down each column, i.e. across rows. If each
    row is one spectrum, smoothing along the spectral axis would need the
    filter applied per row instead - confirm the layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to smooth.
    numeric_cols : iterable
        Labels of the columns to smooth; other columns are copied unchanged.
    window_length : int, default 11
        Filter window size, forwarded to ``savgol_filter``. Per the SciPy
        documentation it must not exceed the number of rows in the default
        ``mode='interp'``.
    polyorder : int, default 2
        Polynomial order, forwarded to ``savgol_filter``; must be smaller
        than ``window_length``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns smoothed.
    """
    smoothed = df.copy()
    for col in numeric_cols:
        smoothed[col] = savgol_filter(smoothed[col], window_length, polyorder)
    return smoothed

# Normalization (Example using vector normalization)
def normalize_spectra(df, numeric_cols):
    """Scale each listed column to unit Euclidean (L2) norm.

    Every column in ``numeric_cols`` is divided by ``np.linalg.norm`` of that
    column. A column whose norm is zero is left as it is, because dividing
    would fill it with NaN.

    The input frame is not modified; a normalised copy is returned.

    NOTE(review): the norm is taken down each column, i.e. across rows. If
    each row is one spectrum, per-spectrum vector normalisation would need
    the norm of each row instead - confirm the layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to normalise.
    numeric_cols : iterable
        Labels of the columns to normalise; other columns are copied
        unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns scaled.
    """
    normalized = df.copy()
    for col in numeric_cols:
        norm = np.linalg.norm(normalized[col])
        if norm == 0:
            # All-zero (or empty) column: nothing to scale, and 0/0 would be NaN.
            continue
        normalized[col] = normalized[col] / norm
    return normalized

# Outlier Detection and Removal (Example using Isolation Forest)
def remove_outliers(df, numeric_cols, contamination=0.05, random_state=0):
    """Drop the rows that an Isolation Forest labels as outliers.

    An ``IsolationForest`` is fitted on ``df[numeric_cols]``; rows predicted
    as ``-1`` are removed and the remaining rows are returned with a fresh
    0..n-1 index. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; all of its columns are kept in the result.
    numeric_cols : iterable
        Labels of the columns used as features for the forest.
    contamination : float, default 0.05
        Forwarded to ``IsolationForest``.
    random_state : int or None, default 0
        Seed forwarded to ``IsolationForest``. A fixed seed makes the set of
        removed rows reproducible between runs; pass ``None`` for an
        unseeded forest.

    Returns
    -------
    pandas.DataFrame
        The retained rows, index reset. If ``df`` has no rows or no feature
        columns are given there is nothing to score, so all rows are
        returned (index reset).
    """
    feature_cols = list(numeric_cols)
    if len(df) == 0 or len(feature_cols) == 0:
        # Nothing to fit on; pass the rows through instead of failing in fit.
        return df.reset_index(drop=True)

    iso = IsolationForest(contamination=contamination, random_state=random_state)
    labels = iso.fit_predict(df[feature_cols])
    inliers = labels != -1  # fit_predict marks outliers with -1
    return df.loc[inliers].reset_index(drop=True)

# Remove region between 2700 and 1870
def remove_spectral_region(df, start, end):
    """Drop every column whose label, read as a number, lies in a closed range.

    Each column label is converted with ``float``; columns whose value falls
    within ``[start, end]`` (both ends included) are removed. Labels that
    cannot be converted to a number (for example text columns) are kept
    rather than raising. The bounds may be given in either order, so
    ``(1870, 2700)`` and ``(2700, 1870)`` select the same region.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filtered; it is not modified.
    start, end : float
        Limits of the region to remove, in the units of the column labels.

    Returns
    -------
    pandas.DataFrame
        New frame without the columns inside the region.
    """
    low, high = sorted((start, end))

    def _in_region(label):
        # Non-numeric labels are never part of the region.
        try:
            position = float(label)
        except (TypeError, ValueError):
            return False
        return low <= position <= high

    columns_to_remove = [col for col in df.columns if _in_region(col)]
    # Labels come from df.columns, so drop() cannot hit a missing label.
    return df.drop(columns=columns_to_remove)

In [15]:
# Apply preprocessing steps
# The chain starts from a copy of df_0, so df_0 always holds the data as
# loaded and re-running this cell starts from the same input, whether or not
# the helper functions modify the frame they are given.
df_1 = baseline_correction(df_0.copy(), numeric_cols)
df_2 = smooth_spectra(df_1, numeric_cols)
df_3 = normalize_spectra(df_2, numeric_cols)
df_4 = remove_outliers(df_3, numeric_cols)
# Only the numeric columns are passed on, so df_5 has no non-numeric columns.
df_5 = remove_spectral_region(df_4[numeric_cols], 1870, 2700)

In [16]:
# Recompute the dtype-based column lists for df_5 (after the region removal)
numeric_cols_df_5 = df_5.select_dtypes(include="number").columns.to_list()
string_cols_df_5 = df_5.select_dtypes(exclude="number").columns.to_list()