In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
! kaggle datasets download shivam2503/diamonds

Dataset URL: https://www.kaggle.com/datasets/shivam2503/diamonds
License(s): unknown
diamonds.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
! unzip /content/diamonds.zip

Archive:  /content/diamonds.zip
replace diamonds.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: diamonds.csv            


In [4]:
df = pd.read_csv('/content/diamonds.csv')

In [5]:
# Menghapus Outlier
def hapus_outlier_iqr(df):
    # Memilih fitur float saja
    kolom_float = df.select_dtypes(include=['float64']).columns

    # Menghitung kuartil pertama (Q1)
    q1 = df[kolom_float].quantile(0.25)
    # Menghitung kuartil ketiga (Q3)
    q3 = df[kolom_float].quantile(0.75)
    # Menghitung IQR (Interquartile Range)
    iqr = q3 - q1

    # Mengidentifikasi outlier
    masker_outlier = ((df[kolom_float] < (q1 - 1.5 * iqr)) | (df[kolom_float] > (q3 + 1.5 * iqr)))
    # Menghitung outlier di setiap kolom
    jumlah_outlier = masker_outlier.sum()

    # Menampilkan jumlah outlier
    print("Jumlah outlier yang terdeteksi:\n")
    print(jumlah_outlier)

    # Menghapus outlier
    df_cleaned = df[~masker_outlier.any(axis=1)]

    # Menghitung outlier setelah dihapus
    jumlah_outlier_cleaned = ((df_cleaned[kolom_float] < (q1 - 1.5 * iqr)) | (df_cleaned[kolom_float] > (q3 + 1.5 * iqr))).sum()
    print("Jumlah outlier setelah dihapus:\n")
    print(jumlah_outlier_cleaned)

    return df_cleaned

# Menjalankan Fungsi
df = hapus_outlier_iqr(df)

Jumlah outlier yang terdeteksi:

carat    1889
depth    2545
table     605
x          32
y          29
z          49
dtype: int64
Jumlah outlier setelah dihapus:

carat    0
depth    0
table    0
x        0
y        0
z        0
dtype: int64


In [6]:
from sklearn.preprocessing import LabelEncoder

# Mendefinisikan kolom kategorikal yang akan di-encode
CAT_COL_E = ['cut','color','clarity']

# Fungsi untuk meng-encode kolom menggunakan Label Encoding
def LabelEncode(data, columns):
    # Inisialisasi LabelEncoder
    L_E = LabelEncoder()

    # Encode kolom kategorikal
    for col in columns:
        data[col] = L_E.fit_transform(data[col])

    # Mengembalikan data yang telah di-encode
    return data

# Menerapkan Label Encoding ke DataFrame 'df'
df = LabelEncode(df, CAT_COL_E)

# Menampilkan DataFrame yang telah diupdate
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
3,4,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
5,6,0.24,4,6,7,62.8,57.0,336,3.94,3.96,2.48


In [10]:
df = df.drop('Unnamed: 0',axis=1)

In [11]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
5,0.24,4,6,7,62.8,57.0,336,3.94,3.96,2.48


In [12]:
X = df.drop('price',axis=1)
y = df['price']

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Inisialisasi MinMaxScaler
scaler = MinMaxScaler()

# Sesuaikan scaler dengan data X dan transformasikan data
X = scaler.fit_transform(X)


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l1

#Sequential Model
model = Sequential()

#Imput Layer
model.add(Dense(10,activation='relu',input_dim=9, kernel_regularizer=l1(0.01))) # Now l1 is defined
#Hidden Layers
model.add(Dense(10,activation='relu'))
#OutputLayer
model.add(Dense(1,activation='linear'))

#Compile
model.compile(loss='huber',optimizer='Adam',metrics=['mse'])

# Summary
model.summary()

#Fit
history = model.fit(X_train,
                    y_train,
                    epochs=15,
                    validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m987/987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 3391.6470 - mse: 23148308.0000 - val_loss: 2376.2554 - val_mse: 14824774.0000
Epoch 2/15
[1m987/987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 2188.5999 - mse: 12161758.0000 - val_loss: 1978.4126 - val_mse: 9110880.0000
Epoch 3/15
[1m987/987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1880.6993 - mse: 8232110.0000 - val_loss: 1658.4557 - val_mse: 7180736.0000
Epoch 4/15
[1m987/987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1509.8828 - mse: 6299168.0000 - val_loss: 1133.6659 - val_mse: 4553562.5000
Epoch 5/15
[1m987/987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 1063.0530 - mse: 3978396.2500 - val_loss: 975.7216 - val_mse: 3322000.5000
Epoch 6/15
[1m987/987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 938.4849 - mse: 3016643.7500 - val_loss: 891.7003 - va

In [23]:
pred = model.predict(X_test)

[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [28]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Menghitung Mean Squared Error (MSE)
mse = mean_squared_error(y_test, pred)

# Menghitung Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Menghitung Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, pred)

# Menghitung R-squared (R2)
r2 = r2_score(y_test, pred)

# Menampilkan hasil metrik
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")

MSE: 1136664.5
RMSE: 1066.1446899928733
MAE: 580.6182861328125
R2: 0.8984094858169556
