In [40]:
import os
import cv2
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [2]:
AMBING_PATH = '../datasets/Ambing.csv'

In [3]:
raw_df = pd.read_csv(AMBING_PATH)
raw_df.head()

Unnamed: 0,No,T Ambing dr Belakang,L Ambing dr Belakang,Panjang Ambing dr samping,Vol
0,1,,,,4.0
1,2,45.0,36.0,40.0,10.5
2,3,,,,3.0
3,4,,,,4.0
4,5,40.0,35.0,38.0,9.0


In [4]:
raw_df.shape

(202, 5)

In [5]:
ambing_df = raw_df[~(raw_df['T Ambing dr Belakang'].isna() & raw_df['L Ambing dr Belakang'].isna())]
ambing_df.shape

(54, 5)

In [6]:
nan_ambing_df = raw_df[raw_df['T Ambing dr Belakang'].isna() & raw_df['L Ambing dr Belakang'].isna()]
nan_ambing_df.shape

(148, 5)

# Image

In [7]:
IMAGE_BELAKANG_PATH = '../datasets/crop_belakang'
IMAGE_SAMPING_PATH = '../datasets/crop_samping'

In [8]:
INPUT_SHAPE = (224, 224, 3)

In [9]:
belakang_images = os.listdir(IMAGE_BELAKANG_PATH)
len(belakang_images)

202

In [10]:
samping_images = os.listdir(IMAGE_SAMPING_PATH)
len(samping_images)

202

In [11]:
vgg_model = tf.keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=INPUT_SHAPE)

In [12]:
belakang_features = []
for path in belakang_images:
    image = cv2.imread(os.path.join(IMAGE_BELAKANG_PATH, path))
    image = cv2.resize(image, (INPUT_SHAPE[0], INPUT_SHAPE[1]))
    image = np.expand_dims(image, axis=0)
    image = tf.keras.applications.vgg16.preprocess_input(image)
    feature = vgg_model.predict(image)
    belakang_features.append([int(path.lower().replace('a.jpg', '')), feature.flatten()])



In [13]:
belakang_features_df = pd.DataFrame(belakang_features, columns=['No', 'belakang'])
belakang_features_df.head()

Unnamed: 0,No,belakang
0,100,"[2.0196407, 0.0, 0.0, 3.115536, 0.0, 0.0, 0.0,..."
1,101,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,102,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,103,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,104,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [14]:
samping_features = []
for path in samping_images:
    image = cv2.imread(os.path.join(IMAGE_SAMPING_PATH, path))
    image = cv2.resize(image, (INPUT_SHAPE[0], INPUT_SHAPE[1]))
    image = np.expand_dims(image, axis=0)
    image = tf.keras.applications.vgg16.preprocess_input(image)
    feature = vgg_model.predict(image)
    samping_features.append([int(path.lower().replace('b.jpg', '')), feature.flatten()])



In [15]:
samping_features_df = pd.DataFrame(samping_features, columns=['No', 'samping'])
samping_features_df.head()

Unnamed: 0,No,samping
0,100,"[7.4379587, 0.0, 13.921858, 0.0, 0.0, 0.0, 0.0..."
1,101,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,102,"[0.0, 0.0, 6.067517, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,103,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.78..."
4,104,"[1.0351293, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.0..."


In [16]:
features_df = pd.merge(belakang_features_df, samping_features_df, on='No')
features_df.head()

Unnamed: 0,No,belakang,samping
0,100,"[2.0196407, 0.0, 0.0, 3.115536, 0.0, 0.0, 0.0,...","[7.4379587, 0.0, 13.921858, 0.0, 0.0, 0.0, 0.0..."
1,101,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,102,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 6.067517, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,103,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.78..."
4,104,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0351293, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.0..."


In [17]:
def concatenate_arrays(row):
    return np.concatenate((row['belakang'], row['samping']))

In [18]:
features_df['combinated'] = features_df.apply(concatenate_arrays, axis=1)
features_df = features_df.drop(columns=['belakang', 'samping'])
features_df.head()

Unnamed: 0,No,combinated
0,100,"[2.0196407, 0.0, 0.0, 3.115536, 0.0, 0.0, 0.0,..."
1,101,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,102,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,103,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,104,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [19]:
ambing_df.head()

Unnamed: 0,No,T Ambing dr Belakang,L Ambing dr Belakang,Panjang Ambing dr samping,Vol
1,2,45.0,36.0,40.0,10.5
4,5,40.0,35.0,38.0,9.0
6,7,38.0,32.0,34.0,7.0
8,9,39.0,36.0,43.0,11.0
16,17,39.0,32.0,40.0,9.5


In [20]:
train_df = pd.merge(features_df, raw_df, on='No')
train_df.head()

Unnamed: 0,No,combinated,T Ambing dr Belakang,L Ambing dr Belakang,Panjang Ambing dr samping,Vol
0,100,"[2.0196407, 0.0, 0.0, 3.115536, 0.0, 0.0, 0.0,...",,,,3.5
1,101,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,,,2.5
2,102,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,,,6.0
3,103,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,,,2.5
4,104,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",34.0,17.0,34.0,2.5


In [21]:
X = np.array(train_df['combinated'].to_list())
y = train_df['Vol'].to_list()

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

## Random Forest

In [23]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=10)
rf_model.fit(X_train, y_train)

In [24]:
y_pred = rf_model.predict(X_test)

In [25]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 4.68092987804878
R-squared: 0.5673717217396086


## Decision Tree

In [26]:
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

In [27]:
y_pred = dt_model.predict(X_test)

In [28]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 8.634146341463415
R-squared: 0.20200131955135248


## Support Vector

In [29]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [31]:
svr_model = SVR(kernel='rbf', C=100, epsilon=0.1)
svr_model.fit(X_train, y_train)

In [32]:
y_pred = svr_model.predict(X_test)

In [33]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 6.241484920833798
R-squared: 0.34486817658934654


# XGBoost

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [37]:
xgb_reg = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_reg.fit(X_train, y_train)

In [38]:
y_pred = xgb_reg.predict(X_test)

In [39]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 4.549979263058602
R-squared: 0.5794746458543265


# LightGBM

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [64]:
lgb_reg = LGBMRegressor(n_estimators=100, 
                        learning_rate=0.1,
                        max_depth=6,
                        num_leaves=40, 
                        min_child_samples=10,
                        min_gain_to_split=0.0,  
                        force_col_wise=True)
lgb_reg.fit(X_train, y_train)

[LightGBM] [Info] Total Bins 429714
[LightGBM] [Info] Number of data points in the train set: 161, number of used features: 31130
[LightGBM] [Info] Start training from score 5.437888


In [65]:
y_pred = lgb_reg.predict(X_test)



In [66]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 4.40191373572824
R-squared: 0.5931593913701797
