In [23]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras_tuner import Hyperband

In [24]:
# Load the dataset from the Kaggle input directory and preview its shape and first rows.
url = '/kaggle/input/sdfgbffasdf/Modified_DatasetCapstoneRianco.csv'  # a local file path, despite the name `url`
df = pd.read_csv(url)
print(df.shape)
print(df.head())


(12163, 6)
    HARGA     LT     LB   JKT  JKM     Kota
0  2100.0  137.0  170.0   3.0  2.0  Bandung
1  4100.0  202.0  300.0   3.0  2.0  Bandung
2  3300.0  350.0  258.0   5.0  2.0  Bandung
3   580.0   30.0   80.0   2.0  2.0  Bandung
4  1300.0  176.0  176.0  11.0  3.0  Bandung


In [25]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12163 entries, 0 to 12162
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   HARGA   12163 non-null  float64
 1   LT      12161 non-null  float64
 2   LB      12161 non-null  float64
 3   JKT     12129 non-null  float64
 4   JKM     12134 non-null  float64
 5   Kota    12163 non-null  object 
dtypes: float64(5), object(1)
memory usage: 570.3+ KB


In [26]:
# Count missing values per column.
print(df.isnull().sum())


HARGA     0
LT        2
LB        2
JKT      34
JKM      29
Kota      0
dtype: int64


In [27]:
# Rewrite the city label 'JAKSEL' as 'Jakarta Selatan' (whole-value match only).
df['Kota'] = df['Kota'].replace('JAKSEL', 'Jakarta Selatan')


In [28]:
# Re-inspect dtypes and non-null counts after the city-label replacement.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12163 entries, 0 to 12162
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   HARGA   12163 non-null  float64
 1   LT      12161 non-null  float64
 2   LB      12161 non-null  float64
 3   JKT     12129 non-null  float64
 4   JKM     12134 non-null  float64
 5   Kota    12163 non-null  object 
dtypes: float64(5), object(1)
memory usage: 570.3+ KB


In [29]:
# Multiply every HARGA value below 300 by 100.
# NOTE(review): presumably this rescales prices that were recorded in a different
# unit — confirm the threshold (300) and the factor (100) against the data source.
# Not idempotent for values below 3: they remain under 300 after the multiplication
# and would be multiplied again if this cell is re-run.
df.loc[df['HARGA'] < 300, 'HARGA'] *= 100


In [30]:
# Show the 20 rows with the lowest HARGA.
df.nsmallest(20,"HARGA")

Unnamed: 0,HARGA,LT,LB,JKT,JKM,Kota
230,300.0,140.0,100.0,4.0,3.0,Bandung
2405,300.0,55.0,40.0,2.0,1.0,Bandung
3118,300.0,55.0,40.0,2.0,1.0,Bandung
3374,300.0,67.0,31.0,2.0,1.0,Bandung
3428,300.0,72.0,36.0,2.0,1.0,Bandung
3497,300.0,67.0,31.0,2.0,1.0,Bandung
3619,300.0,60.0,30.0,2.0,1.0,Bandung
3642,300.0,60.0,40.0,2.0,1.0,Bandung
6584,300.0,37.0,60.0,5.0,4.0,Bandung
7091,300.0,60.0,40.0,2.0,1.0,Bandung


In [31]:
# Drop rows whose HARGA is NaN (the loaded data reports no missing HARGA, so this
# acts as a safeguard).
df = df.dropna(subset=['HARGA'])

# Keep only rows whose HARGA is at most 80,000; anything larger is discarded.
df = df[df['HARGA'] <= 80000]

In [32]:
# Show the 30 rows with the highest HARGA.
df.nlargest(30,'HARGA')

Unnamed: 0,HARGA,LT,LB,JKT,JKM,Kota
6947,80000.0,2000.0,400.0,5.0,5.0,Bandung
8408,80000.0,800.0,2000.0,7.0,6.0,Jakarta Selatan
4454,79500.0,2122.0,901.0,23.0,10.0,Bandung
4455,79500.0,2122.0,901.0,23.0,10.0,Bandung
4218,78500.0,2000.0,600.0,7.0,4.0,Bandung
8585,77500.0,1148.0,1200.0,8.0,6.0,Jakarta Selatan
1841,77000.0,2200.0,1000.0,7.0,5.0,Bandung
1732,75000.0,4550.0,300.0,5.0,3.0,Bandung
2006,75000.0,712.0,400.0,5.0,3.0,Bandung
6940,75000.0,2000.0,2000.0,2.0,2.0,Bandung


In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import keras_tuner as kt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Drop every row that still contains a NaN. The explicit copy makes the column
# assignments below write to an independent frame rather than a slice.
# NOTE: this cell replaces 'Kota' with dummy columns in `df`, so re-running it
# without re-running the earlier cells raises a KeyError on 'Kota'.
df = df.dropna().copy()

# Validate LT and LB before they are used for feature engineering.
if df['LT'].isnull().any() or df['LB'].isnull().any():
    raise ValueError("LT and LB must not contain NaN values")
if df['LT'].ndim != 1 or df['LB'].ndim != 1:
    raise ValueError("LT and LB must be 1 dimensional")

# Encode city names as integer codes. The fitted encoder is kept so the
# code -> city-name mapping can be recovered later.
label_encoder = LabelEncoder()
df['Kota'] = label_encoder.fit_transform(df['Kota'])

# Feature engineering: interaction term and log-transformed areas.
df['LT_LB'] = df['LT'] * df['LB']
df['Log_LT'] = np.log1p(df['LT'])
df['Log_LB'] = np.log1p(df['LB'])

# Bin the continuous area columns. Intervals are right-closed, so a value of
# exactly 0 falls outside the first bin and ends up with no bin assigned.
df['LT_bin'] = pd.cut(df['LT'], bins=[0, 100, 200, 300, 400, np.inf], labels=['0-100', '100-200', '200-300', '300-400', '400+'])
df['LB_bin'] = pd.cut(df['LB'], bins=[0, 50, 100, 150, 200, np.inf], labels=['0-50', '50-100', '100-150', '150-200', '200+'])

# One-hot encode the categorical columns (city code and both bin columns).
df = pd.get_dummies(df, columns=['Kota', 'LT_bin', 'LB_bin'])

X = df.drop(columns=['HARGA'])
y = df['HARGA']

# Split BEFORE scaling so that the scaler statistics are learned from the
# training rows only; fitting on the full data would leak test-set mean and
# variance into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: fit on train, apply the same transform to test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Kept for backward compatibility with code that expects the full scaled
# matrix; it now uses the training-set statistics as well.
X_scaled = scaler.transform(X)


In [34]:
from sklearn.impute import SimpleImputer
# Median-impute the numeric columns of df in place.
# NOTE(review): when the cells run in order, df has already been through
# df.dropna() and X / X_train_scaled / X_test_scaled have already been built, so
# this step has nothing left to fill and does not reach the model inputs. If
# imputation is wanted instead of dropping rows, it has to run before the
# dropna()/feature-engineering cell — confirm the intent.
imputer = SimpleImputer(strategy='median')  # or 'mean'
df[['LT', 'LB', 'JKT', 'JKM']] = imputer.fit_transform(df[['LT', 'LB', 'JKT', 'JKM']])


In [35]:
def build_model(hp):
    """Build and compile a regression network sized by tuner hyperparameters.

    Args:
        hp: Hyperparameter container supplied by the tuner; ``hp.Int`` and
            ``hp.Float`` are called on it to declare the search space.

    Returns:
        The compiled ``keras.Sequential`` model (MSE loss, MAE metric).
    """
    # Hyperparameters are declared inline, in the same order as the layers they
    # configure: units1, dropout1, units2, units3, dropout2, then learning_rate.
    network = keras.Sequential([
        # Input width follows the number of scaled training features.
        keras.layers.Input(shape=(X_train_scaled.shape[1],)),
        keras.layers.Dense(
            units=hp.Int('units1', min_value=32, max_value=512, step=32),
            activation='relu',
        ),
        keras.layers.Dropout(
            rate=hp.Float('dropout1', min_value=0.0, max_value=0.5, step=0.1),
        ),
        keras.layers.Dense(
            units=hp.Int('units2', min_value=32, max_value=256, step=32),
            activation='relu',
        ),
        keras.layers.Dense(
            units=hp.Int('units3', min_value=16, max_value=128, step=16),
            activation='relu',
        ),
        keras.layers.Dropout(
            rate=hp.Float('dropout2', min_value=0.0, max_value=0.5, step=0.1),
        ),
        # Single linear unit for the regression target.
        keras.layers.Dense(1, activation='linear'),
    ])

    # The Adam learning rate is searched on a log scale.
    network.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log'),
        ),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )

    return network


In [36]:
from tensorflow import keras
import keras_tuner as kt
# Random search over the build_model hyperparameters with validation MAE as
# the objective: up to 10 trials, one training run per trial.
# NOTE(review): the recorded output "Reloading Tuner from my_dir1/intro_to_kt/tuner0.json"
# shows this run picked up tuner state saved under directory/project_name.
# TODO confirm that state matches the current features; otherwise clear the
# directory or pass overwrite=True to search from scratch.
tuner = kt.RandomSearch(
    build_model,
    objective='val_mean_absolute_error',
    max_trials=10,
    executions_per_trial=1,
    directory='my_dir1',
    project_name='intro_to_kt'
)

# Train each trial for 40 epochs, holding out 20% of the training data for validation.
tuner.search(X_train_scaled, y_train, epochs=40, validation_split=0.2)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the best value found for each tuned hyperparameter.
print(f"""
The optimal number of units in the first densely-connected layer is {best_hps.get('units1')}.
The optimal number of units in the second densely-connected layer is {best_hps.get('units2')}.
The optimal number of units in the third densely-connected layer is {best_hps.get('units3')}.
The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.
The optimal dropout rate for the first dropout layer is {best_hps.get('dropout1')}.
The optimal dropout rate for the second dropout layer is {best_hps.get('dropout2')}.
""")


Reloading Tuner from my_dir1/intro_to_kt/tuner0.json

The optimal number of units in the first densely-connected layer is 288.
The optimal number of units in the second densely-connected layer is 224.
The optimal number of units in the third densely-connected layer is 96.
The optimal learning rate for the optimizer is 0.0025000464748982493.
The optimal dropout rate for the first dropout layer is 0.2.
The optimal dropout rate for the second dropout layer is 0.4.



In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

# Final network: the layer sizes and dropout rates reported by the tuner, with
# L2 weight regularisation (0.001) added on every hidden Dense layer.
model = Sequential()
model.add(tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)))
model.add(Dense(288, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.2))
model.add(Dense(224, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dense(96, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.4))
# Single linear output unit for the regression target.
model.add(Dense(1, activation='linear'))




In [38]:
# Compile the model.
# Uses the learning rate reported by the tuner, rounded to 0.0025.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0025),
              loss='mean_squared_error',
              metrics=['mean_absolute_error'])

# Train for 100 epochs on the scaled training features.
# NOTE(review): the held-out test split is passed as validation_data, so the
# val_* curves and the later test metrics are computed on the same rows —
# confirm this is intended.
history = model.fit(X_train_scaled, y_train, batch_size=32 ,epochs=100,validation_data=(X_test_scaled, y_test))

Epoch 1/100
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 61721804.0000 - mean_absolute_error: 3783.5039 - val_loss: 138156112.0000 - val_mean_absolute_error: 3163.9883
Epoch 2/100
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 32317162.0000 - mean_absolute_error: 2756.9202 - val_loss: 203748176.0000 - val_mean_absolute_error: 2846.7507
Epoch 3/100
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 31476422.0000 - mean_absolute_error: 2641.5466 - val_loss: 255261856.0000 - val_mean_absolute_error: 2906.4104
Epoch 4/100
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 27638474.0000 - mean_absolute_error: 2541.0083 - val_loss: 294433984.0000 - val_mean_absolute_error: 3122.7336
Epoch 5/100
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 28972788.0000 - mean_absolute_error: 2633.7117 - val_loss: 298934912.0000 - val_m

In [39]:
# Map each original city name to the integer code the label encoder assigned to it.
city_names = label_encoder.classes_
kota_labels = {
    city: code
    for city, code in zip(city_names, label_encoder.transform(city_names))
}
print(kota_labels)

{'Bandung': 0, 'Bekasi': 1, 'Bogor': 2, 'Depok': 3, 'Jakarta Barat': 4, 'Jakarta Pusat': 5, 'Jakarta Selatan': 6, 'Jakarta Timur': 7, 'Jakarta Utara': 8, 'Tangerang': 9}


In [40]:
# Predict on the held-out test features.
y_pred = model.predict(X_test_scaled)

# Print actual vs. predicted values for the first ten test samples.
first_actuals = y_test[:10]
first_predictions = y_pred.flatten()[:10]
for actual, predicted in zip(first_actuals, first_predictions):
    print(f"Actual: {actual}, Predicted: {predicted:.2f}")

# Compute the mean absolute error (MAE) over the whole test set.
mae = mean_absolute_error(y_test, y_pred)

print("MaE on Test Data:", mae)


[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Actual: 2000.0, Predicted: 2757.29
Actual: 2700.0, Predicted: 1987.67
Actual: 1200.0, Predicted: 1267.10
Actual: 3250.0, Predicted: 2791.04
Actual: 2300.0, Predicted: 1323.31
Actual: 5900.0, Predicted: 6005.57
Actual: 1350.0, Predicted: 1350.06
Actual: 1450.0, Predicted: 1051.21
Actual: 1090.0, Predicted: 1376.56
Actual: 3900.0, Predicted: 9217.47
MaE on Test Data: 2783.11502118089


In [41]:
from sklearn.metrics import mean_absolute_error
import numpy as np
# Baseline model: always predict the mean target value of the training set.
mean_y_train = np.mean(y_train)

# Constant prediction vector with one entry per test sample.
baseline_predictions = np.full(y_test.shape, mean_y_train)

# MAE of the constant baseline against the true test targets.
baseline_mae = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE:", baseline_mae)

Baseline MAE: 5274.020274931766


In [42]:

# Save `model` (assumed to be the trained Keras model) to an .h5 file at a path
# relative to the current working directory.
model.save("my_modelLastTTT.h5")

In [None]:
import tensorflow as tf

# Destination of the converted model inside the Kaggle working directory.
tflite_path = '/kaggle/working/model.tflite'

# Reload the saved Keras model from disk (this rebinds the global `model`).
model = tf.keras.models.load_model("/kaggle/working/my_modelLastTTT.h5")

# Build a converter for the loaded model and enable the default optimizations.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Run the conversion and write the resulting bytes to the destination file.
tflite_model = converter.convert()
with open(tflite_path, 'wb') as f:
    f.write(tflite_model)

# Print the path for reference
print("Model saved to:", tflite_path)
