In [None]:
from pathlib import Path

import pandas as pd

def summarize_datasets(data_cal: pd.DataFrame, data_val: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Build a summary-statistics table for the calibration set, the validation
    set, and the two sets combined.

    Parameters
    ----------
    data_cal : pd.DataFrame
        Calibration dataset.
    data_val : pd.DataFrame
        Validation dataset.
    column_name : str
        Name of the numeric column to analyse. It must exist in both frames.

    Returns
    -------
    pd.DataFrame
        One row per dataset ("Calibration", "Validation", "All data") with
        the columns
        ['Column', 'Dataset', 'Number of Samples', 'Range', 'Mean', 'SD'].
        'Range' is a "min – max" string formatted with one decimal place;
        'Mean' and 'SD' are rounded to two decimal places. Missing values
        are dropped before any statistic is computed.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``data_cal`` or ``data_val``.
    """

    def summarize(values: pd.Series, label: str) -> dict:
        # Statistics are computed on the non-missing values only.
        col = values.dropna()
        return {
            "Column": column_name,
            "Dataset": label,
            "Number of Samples": len(col),
            # \u2013 is an en dash; written as an escape so the separator
            # cannot be garbled by a wrong file encoding again.
            "Range": f"{col.min():.1f} \u2013 {col.max():.1f}",
            "Mean": round(col.mean(), 2),
            "SD": round(col.std(), 2),
        }

    # Select the column once per frame; this also fails fast with a KeyError
    # when the column is missing from either input.
    cal_values = data_cal[column_name]
    val_values = data_val[column_name]

    rows = [
        summarize(cal_values, "Calibration"),
        summarize(val_values, "Validation"),
        # Only the analysed column is concatenated, not the whole frames.
        summarize(pd.concat([cal_values, val_values], ignore_index=True), "All data"),
    ]

    return pd.DataFrame(rows)


In [None]:
import pandas as pd

# CSV locations of the calibration and validation splits (kadar_air folder)
cal_file_path_kadar_air = '../../splited_wo_outliers_curated_data_2/kering/nir/kadar_air/cal.csv'
val_file_path_kadar_air = '../../splited_wo_outliers_curated_data_2/kering/nir/kadar_air/val.csv'

# Load both splits into DataFrames, calibration first
cal_df_kadar_air, val_df_kadar_air = (
    pd.read_csv(csv_path)
    for csv_path in (cal_file_path_kadar_air, val_file_path_kadar_air)
)

In [None]:
import pandas as pd

# CSV locations of the calibration and validation splits (fenol folder)
cal_file_path_fenol = '../../splited_wo_outliers_curated_data_2/kering/nir/fenol/cal.csv'
val_file_path_fenol = '../../splited_wo_outliers_curated_data_2/kering/nir/fenol/val.csv'

# Load both splits into DataFrames, calibration first
cal_df_fenol, val_df_fenol = (
    pd.read_csv(csv_path)
    for csv_path in (cal_file_path_fenol, val_file_path_fenol)
)

In [None]:
import pandas as pd

# CSV locations of the calibration and validation splits (gula_reduksi folder)
cal_file_path_gula_reduksi = '../../splited_wo_outliers_curated_data_2/kering/nir/gula_reduksi/cal.csv'
val_file_path_gula_reduksi = '../../splited_wo_outliers_curated_data_2/kering/nir/gula_reduksi/val.csv'

# Load both splits into DataFrames, calibration first
cal_df_gula_reduksi, val_df_gula_reduksi = (
    pd.read_csv(csv_path)
    for csv_path in (cal_file_path_gula_reduksi, val_file_path_gula_reduksi)
)

In [None]:
import pandas as pd

# CSV locations of the calibration and validation splits (lemak folder)
cal_file_path_lemak = '../../splited_wo_outliers_curated_data_2/kering/nir/lemak/cal.csv'
val_file_path_lemak = '../../splited_wo_outliers_curated_data_2/kering/nir/lemak/val.csv'

# Load both splits into DataFrames, calibration first
cal_df_lemak, val_df_lemak = (
    pd.read_csv(csv_path)
    for csv_path in (cal_file_path_lemak, val_file_path_lemak)
)

In [None]:
import pandas as pd

# CSV locations of the calibration and validation splits (protein folder)
cal_file_path_protein = '../../splited_wo_outliers_curated_data_2/kering/nir/protein/cal.csv'
val_file_path_protein = '../../splited_wo_outliers_curated_data_2/kering/nir/protein/val.csv'

# Load both splits into DataFrames, calibration first
cal_df_protein, val_df_protein = (
    pd.read_csv(csv_path)
    for csv_path in (cal_file_path_protein, val_file_path_protein)
)

summary

In [None]:
# One summary table per analysed column; each call pairs the calibration and
# validation frames loaded from the matching folder with its column name.
summary_fenol = summarize_datasets(
    data_cal=cal_df_fenol, data_val=val_df_fenol, column_name='FENOL'
)
summary_gula_reduksi = summarize_datasets(
    data_cal=cal_df_gula_reduksi, data_val=val_df_gula_reduksi, column_name='Gula Reduksi'
)
summary_lemak = summarize_datasets(
    data_cal=cal_df_lemak, data_val=val_df_lemak, column_name='LEMAK'
)
summary_protein = summarize_datasets(
    data_cal=cal_df_protein, data_val=val_df_protein, column_name='PROTEIN'
)
summary_kadar_air = summarize_datasets(
    data_cal=cal_df_kadar_air, data_val=val_df_kadar_air, column_name='Kadar Air'
)

In [None]:
# Display the 'FENOL' summary table as this cell's output (quick visual check).
summary_fenol

In [None]:
# Stack the five per-column summary tables into a single table; the index is
# renumbered so rows run 0..n-1 instead of repeating 0..2 per table.
summary = pd.concat(
    [
        summary_fenol,
        summary_gula_reduksi,
        summary_lemak,
        summary_protein,
        summary_kadar_air,
    ],
    ignore_index=True,
)

In [None]:
# Write the combined summary table to disk without the index column.
# `to_csv` does not create missing directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
summary_csv_path = Path('data_summary/data_summary_nir_kering.csv')
summary_csv_path.parent.mkdir(parents=True, exist_ok=True)
summary.to_csv(summary_csv_path, index=False)