In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [7]:
import os
import pandas as pd

# Path to the data folder.
data_dir = '../../../prepossed_data/basah/nir/gula_reduksi'

# Names of all CSV files in the folder. os.listdir() returns entries in
# arbitrary order, so sort them to make the load order reproducible.
files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))

# Load every file into an explicit registry keyed by the file name without its
# ".csv" extension. splitext() strips only the final extension, so two files
# whose names differ after an inner dot no longer collapse onto the same key.
datasets = {}
for file in files:
    df_name = os.path.splitext(file)[0]
    datasets[df_name] = pd.read_csv(os.path.join(data_dir, file))

# Backward compatibility: the following cells refer to the frames by bare name
# (e.g. cal_sg1 / val_sg1), so each one is also published as a module-level
# variable. New code should prefer datasets["<name>"].
globals().update(datasets)

# Report how many DataFrames were actually created.
print(f"Total DataFrame yang dibuat: {len(datasets)}")

Total DataFrame yang dibuat: 22


In [8]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_max_norm and evaluate on val_max_norm; 'Gula Reduksi' is the target
# column and every remaining column is used as a predictor.
X_train = cal_max_norm.drop(columns=['Gula Reduksi'])
y_train = cal_max_norm['Gula Reduksi']
X_test = val_max_norm.drop(columns=['Gula Reduksi'])
y_test = val_max_norm['Gula Reduksi']

# Every predictor column goes through the numeric pipeline with min-max scaling.
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(scaling="minmax"), X_train.columns),
])

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', SVR(max_iter=500)),
])

# scoring='r2' is spelled out so this cell matches the other SVR cells in the
# notebook. R^2 is also what SVR.score() returns, so the reported numbers are
# the same as with the implicit default scorer.
model = GridSearchCV(pipeline, gsp.svm_params, cv=10, scoring='r2', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring='r2', n_jobs=-1)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits




Train Score: 0.43192251593226905
Test Score: 0.11011891726966583
CV Accuracy Scores: [ 0.00941364  0.05388692  0.30548161  0.14842259  0.22975059  0.09167412
  0.33877438 -0.08431004  0.31638017  0.15375028]
Mean CV Accuracy: 0.15632242624781112


In [9]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_mean_norm and evaluate on val_mean_norm; 'Gula Reduksi' is the
# target column and every remaining column is used as a predictor.
y_train = cal_mean_norm['Gula Reduksi']
X_train = cal_mean_norm.drop(columns=['Gula Reduksi'])
y_test = val_mean_norm['Gula Reduksi']
X_test = val_mean_norm.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with min-max scaling.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(scaling="minmax"), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.45124189471529197
Test Score: 0.08747396774370186
CV Accuracy Scores: [ 0.07812017 -0.06384052  0.06405293 -0.04240612  0.13682539  0.26199816
  0.31077499 -0.20085518  0.04575765  0.30618074]
Mean CV Accuracy: 0.08966082238998209


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [10]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_msc and evaluate on val_msc; 'Gula Reduksi' is the target column
# and every remaining column is used as a predictor.
y_train = cal_msc['Gula Reduksi']
X_train = cal_msc.drop(columns=['Gula Reduksi'])
y_test = val_msc['Gula Reduksi']
X_test = val_msc.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.844988479424453
Test Score: 0.06486445945120956
CV Accuracy Scores: [-0.09639543  0.23209494  0.05482051  0.06895923  0.36106091  0.15495549
  0.3199967   0.34931781  0.29946867  0.26204853]
Mean CV Accuracy: 0.20063273580922614


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished


In [11]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_range_norm and evaluate on val_range_norm; 'Gula Reduksi' is the
# target column and every remaining column is used as a predictor.
y_train = cal_range_norm['Gula Reduksi']
X_train = cal_range_norm.drop(columns=['Gula Reduksi'])
y_test = val_range_norm['Gula Reduksi']
X_test = val_range_norm.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits
Train Score: 0.4765443340593364
Test Score: -0.021247493566952214


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


CV Accuracy Scores: [-0.08999244 -0.04585111  0.06088666 -0.40923986  0.12375677  0.21653541
  0.3058311  -0.00772162  0.27480568  0.035962  ]
Mean CV Accuracy: 0.046497259546405066


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.2min finished


In [12]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_sg1 and evaluate on val_sg1; 'Gula Reduksi' is the target column
# and every remaining column is used as a predictor.
y_train = cal_sg1['Gula Reduksi']
X_train = cal_sg1.drop(columns=['Gula Reduksi'])
y_test = val_sg1['Gula Reduksi']
X_test = val_sg1.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.6570333576729731
Test Score: 0.5324742825302968
CV Accuracy Scores: [-0.18601239  0.3146341   0.34808264  0.34463754  0.63590739  0.2046214
  0.56652108  0.45731381  0.52269392  0.67808539]
Mean CV Accuracy: 0.38864848781784933


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [13]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_sg2 and evaluate on val_sg2; 'Gula Reduksi' is the target column
# and every remaining column is used as a predictor.
y_train = cal_sg2['Gula Reduksi']
X_train = cal_sg2.drop(columns=['Gula Reduksi'])
y_test = val_sg2['Gula Reduksi']
X_test = val_sg2.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.9911139997921846
Test Score: 0.49831132260407784
CV Accuracy Scores: [0.63327711 0.48722097 0.67861468 0.51097501 0.74423583 0.42500392
 0.70238719 0.32166115 0.7050282  0.70173369]
Mean CV Accuracy: 0.5910137756860128


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [14]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_smoothing_mean and evaluate on val_smoothing_mean; 'Gula Reduksi'
# is the target column and every remaining column is used as a predictor.
y_train = cal_smoothing_mean['Gula Reduksi']
X_train = cal_smoothing_mean.drop(columns=['Gula Reduksi'])
y_test = val_smoothing_mean['Gula Reduksi']
X_test = val_smoothing_mean.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.9985600799105225
Test Score: 0.008462925067549198
CV Accuracy Scores: [ 0.04268078 -0.09215136 -0.14156883  0.05091029 -0.06269589 -0.17416032
  0.11283301  0.03596657  0.04739193  0.13740677]
Mean CV Accuracy: -0.004338704570552232


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [15]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_snv and evaluate on val_snv; 'Gula Reduksi' is the target column
# and every remaining column is used as a predictor.
y_train = cal_snv['Gula Reduksi']
X_train = cal_snv.drop(columns=['Gula Reduksi'])
y_test = val_snv['Gula Reduksi']
X_test = val_snv.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.6730356888887554
Test Score: 0.2501073415513132
CV Accuracy Scores: [-0.02088661  0.26961697  0.29745813  0.08344487  0.52066262  0.28779899
  0.3699524   0.34830215  0.42803293  0.42882147]
Mean CV Accuracy: 0.3013203930622058


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [16]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_fd1 and evaluate on val_fd1; 'Gula Reduksi' is the target column
# and every remaining column is used as a predictor.
y_train = cal_fd1['Gula Reduksi']
X_train = cal_fd1.drop(columns=['Gula Reduksi'])
y_test = val_fd1['Gula Reduksi']
X_test = val_fd1.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.9960832187284286
Test Score: 0.52396172520663
CV Accuracy Scores: [0.64491176 0.51898636 0.71767224 0.52674761 0.76998255 0.65292026
 0.68544799 0.60057211 0.70775843 0.69874765]
Mean CV Accuracy: 0.6523746985229695


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [17]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_fd2 and evaluate on val_fd2; 'Gula Reduksi' is the target column
# and every remaining column is used as a predictor.
y_train = cal_fd2['Gula Reduksi']
X_train = cal_fd2.drop(columns=['Gula Reduksi'])
y_test = val_fd2['Gula Reduksi']
X_test = val_fd2.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.9770316849743943
Test Score: 0.6568772872621469
CV Accuracy Scores: [0.73282421 0.53991739 0.7500901  0.72718175 0.77728355 0.70677048
 0.76157726 0.69009117 0.79857404 0.72674552]
Mean CV Accuracy: 0.7211055478819708


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


In [18]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVR
from jcopml.tuning import grid_search_params as gsp

# Fit on cal_ori and evaluate on val_ori; 'Gula Reduksi' is the target column
# and every remaining column is used as a predictor.
y_train = cal_ori['Gula Reduksi']
X_train = cal_ori.drop(columns=['Gula Reduksi'])
y_test = val_ori['Gula Reduksi']
X_test = val_ori.drop(columns=['Gula Reduksi'])

# Every predictor column goes through the numeric pipeline with its defaults.
preprocessor = ColumnTransformer(
    transformers=[('numeric', num_pipe(), X_train.columns)],
)

# SVR with the solver limited to 500 iterations.
pipeline = Pipeline(steps=[('prep', preprocessor), ('algo', SVR(max_iter=500))])

# 10-fold grid search over jcopml's SVM parameter grid, ranked by R^2.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.svm_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

print("Train Score:", model.score(X_train, y_train))
print("Test Score:", model.score(X_test, y_test))

# Passing the GridSearchCV object makes cross_val_score refit the whole grid
# search on each of the 10 outer folds (nested cross-validation).
# NOTE: the labels below say "Accuracy", but the values are R^2 scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print("CV Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Fitting 10 folds for each of 49 candidates, totalling 490 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Train Score: 0.9985768015224623
Test Score: 0.01821391154337959
CV Accuracy Scores: [ 0.05067416 -0.04240976  0.06297846  0.02805082 -0.03723106 -0.106291
  0.10311207  0.013612    0.03805553  0.12223968]
Mean CV Accuracy: 0.02327908933019084


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished
