In [None]:
import pandas as pd
import os

# The input CSV sits in the ``raw_data`` folder one level above the current
# working directory; the path is assembled from the folder and the file name.
raw_data_dir = "./../raw_data"
file_name = "DATA NIR KERING KADAR PROTEIN.csv"
file_path = os.path.join(raw_data_dir, file_name)

# Load the full table; every later cell works from this ``data`` frame.
data = pd.read_csv(file_path)

# Show the first rows as the cell output to sanity-check the load.
data.head()

In [None]:
import matplotlib.pyplot as plt

# Drop the 'SAMPEL' and 'PROTEIN' columns so that only the remaining
# (presumably spectral) columns are drawn.
data_plot = data.drop(columns=['SAMPEL', 'PROTEIN'])

# DataFrame.plot opens a figure of its own unless it is given an Axes, which
# would leave a separately created 16x6 figure empty. Create the figure and
# axes explicitly and draw into them so the requested size is actually used.
fig, ax = plt.subplots(figsize=(16, 6))

# Transposing turns every row of the table into one line, with the column
# labels along the x-axis.
data_plot.T.plot(ax=ax, legend=False, alpha=0.5, linewidth=1)
ax.set_xlabel('Wavelength')
ax.set_ylabel('Absorbance')
ax.set_title('NIR Spectra')
fig.tight_layout()
plt.show()

In [None]:
# Columns that take no part in the outlier screening.
exclude_cols = ['SAMPEL', 'PROTEIN']

# Tunable parameters of the screening.
# NOTE(review): the bounds are the 35th and 65th percentiles, which is a
# narrower band than the conventional quartiles (0.25 / 0.75), so this filter
# is stricter than a textbook 1.5 * IQR rule. Values are kept as they were;
# confirm that this choice is intentional.
LOWER_QUANTILE = 0.35
UPPER_QUANTILE = 0.65
FENCE_MULTIPLIER = 1.5

# Screen only columns that really hold numeric data and are not excluded, so a
# stray text column cannot break the quantile computation below.
numeric_cols = [
    col
    for col in data.select_dtypes(include='number').columns
    if col not in exclude_cols
]
if not numeric_cols:
    # Without this guard an empty selection would silently keep every row.
    raise ValueError(
        "No numeric columns left to screen for outliers; "
        "check the dtypes of the loaded data."
    )

# Per-column lower/upper quantiles and their spread. The names Q1, Q3 and IQR
# are kept for compatibility even though the bounds are not true quartiles.
spectra = data[numeric_cols]
Q1 = spectra.quantile(LOWER_QUANTILE)
Q3 = spectra.quantile(UPPER_QUANTILE)
IQR = Q3 - Q1

# Acceptance band for each column.
lower_fence = Q1 - FENCE_MULTIPLIER * IQR
upper_fence = Q3 + FENCE_MULTIPLIER * IQR

# A row is kept only if none of its screened values lies outside the band.
is_outlier = (spectra < lower_fence) | (spectra > upper_fence)
mask = ~is_outlier.any(axis=1)

# Filtered copy of the table (all columns) with a fresh 0..n-1 index.
data_no_outliers = data[mask].reset_index(drop=True)

In [None]:
# Drop the columns listed in ``exclude_cols`` so that only the remaining
# (presumably spectral) columns of the filtered table are drawn.
data_no_outliers_plot = data_no_outliers.drop(columns=exclude_cols)

# DataFrame.plot opens a figure of its own unless it is given an Axes, which
# would leave a separately created 16x6 figure empty. Create the figure and
# axes explicitly and draw into them so the requested size is actually used.
fig, ax = plt.subplots(figsize=(16, 6))

# Transposing turns every row of the table into one line, with the column
# labels along the x-axis.
data_no_outliers_plot.T.plot(ax=ax, legend=False, alpha=0.5, linewidth=1)
ax.set_xlabel('Wavelength')
ax.set_ylabel('Absorbance')
ax.set_title('NIR Spectra (No Outliers)')
fig.tight_layout()
plt.show()

In [None]:
# Show the outlier-filtered table, without the excluded columns, as this
# cell's output.
data_no_outliers_plot

In [None]:
# Save the filtered table (all columns, row index omitted) as a CSV file in
# the current working directory.
output_csv = "653_RAW_NIR_KERING_PROTEIN.csv"
data_no_outliers.to_csv(output_csv, index=False)