In [None]:
!pip install xgboost

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# Location of the dataset CSV on Google Drive; edit this to match where your copy lives.
file_path = '/content/drive/MyDrive/Dataset/RegresiUTSTelkom.csv'

# Make Google Drive visible to the Colab runtime before reading from it.
drive.mount('/content/drive')

# Read the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer=file_path)
data.head()

In [None]:
# Give every column a positional name (col_0, col_1, ...) so columns are easy to reference.
data.columns = [f"col_{i}" for i in range(len(data.columns))]

# Inspect dtypes and non-null counts after the rename.
data.info()

# Drop rows containing missing values (switch to fillna if imputation is preferred).
data = data.dropna()

# Rescale every column with MinMaxScaler.
# NOTE(review): the scaler is fit on the full dataset, before the train/test split, and it
# also rescales the column later used as the target, so downstream MSE values are expressed
# in scaled units rather than the target's original units — TODO confirm this is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data)
data[data.columns] = scaled_values

# Preview the first rows after preprocessing.
data.head()

In [None]:
# Basic summary statistics for every column of the preprocessed data
data.describe()

In [None]:
# Histograms (with KDE overlay) for the first six columns, laid out on a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
fig.suptitle('Distribusi Kolom Utama')

# axes.flat walks the grid row by row, so each column gets the next free panel.
for ax, col in zip(axes.flat, data.columns[:6]):
    sns.histplot(data[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribusi {col}')

# Reserve the top 5% of the figure for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

In [None]:
# Correlation heatmap restricted to the first 23 columns.
subset_corr = data.iloc[:, :23].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(subset_corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Heatmap Korelasi untuk Subset Kolom')
plt.show()

In [None]:
# Horizontal box plots of the first five columns to eyeball outliers.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(data=data.iloc[:, :5], orient='h', ax=ax)
ax.set_title('Visualisasi Outliers Kolom Utama')
plt.show()

In [None]:
# Separate the features (X) from the target (y).
# NOTE(review): the original comment was a template placeholder ("replace 'col_target' with
# your target column") — confirm that col_20 really is the intended target.
target_column = 'col_20'
y = data[target_column]
X = data.drop(columns=target_column)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Keep the 20 features that score highest under f_regression against the target.
# The selector is fit on the training split only and then applied to both splits.
selector = SelectKBest(score_func=f_regression, k=20)
selector.fit(X_train, y_train)
X_train_reduced = selector.transform(X_train)
X_test_reduced = selector.transform(X_test)

# Polynomial-regression pipeline: expand features, standardize, then fit ordinary least squares.
# The degree starts at 1 here.
poly_steps = [
    ('poly_features', PolynomialFeatures(degree=1)),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
]
poly_pipeline_reduced = Pipeline(steps=poly_steps)

# Train on the reduced training features.
poly_pipeline_reduced.fit(X_train_reduced, y_train)

# Score on the reduced test features.
y_pred_poly_reduced = poly_pipeline_reduced.predict(X_test_reduced)
mse_poly_reduced = mean_squared_error(y_test, y_pred_poly_reduced)
print("Mean Squared Error (Polynomial Regression with reduced features):", mse_poly_reduced)

In [None]:
# Baseline Decision Tree regression: standardize the features, then fit a depth-limited tree.
# random_state is pinned (same seed as the train/test split) so that re-running the
# notebook reproduces the same tree and therefore the same MSE.
tree_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', DecisionTreeRegressor(max_depth=20, random_state=42))  # adjust max_depth as needed
])

# Train on the training split
tree_pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred_tree = tree_pipeline.predict(X_test)
mse_tree = mean_squared_error(y_test, y_pred_tree)
print("Mean Squared Error (Decision Tree Regression):", mse_tree)

In [None]:
# Baseline k-Nearest Neighbors regression: standardize the features (k-NN is distance-based),
# then predict from the 5 nearest training samples.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', KNeighborsRegressor(n_neighbors=5))  # adjust n_neighbors as needed
])

# Train on the training split
knn_pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred_knn = knn_pipeline.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
print("Mean Squared Error (k-NN Regression):", mse_knn)

In [None]:
# Baseline XGBoost regression: standardized features feeding a gradient-boosted regressor.
xgb_regressor = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
)
xgb_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', xgb_regressor),
])

# Train on the training split.
xgb_pipeline.fit(X_train, y_train)

# Score on the held-out test split.
y_pred_xgb = xgb_pipeline.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print("Mean Squared Error (XGBoost Regression):", mse_xgb)

In [None]:
# Recap of the baseline test-set MSE for each model, printed in a fixed order.
baseline_results = [
    ("Mean Squared Error (Polynomial Regression with reduced features):", mse_poly_reduced),
    ("Mean Squared Error (Decision Tree Regression):", mse_tree),
    ("Mean Squared Error (k-NN Regression):", mse_knn),
    ("Mean Squared Error (XGBoost Regression):", mse_xgb),
]
for label, value in baseline_results:
    print(label, value)

In [None]:
# Import Library
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression

# Step 1: Decision Tree pipeline — standardize the features, then fit a seeded tree.
# Note that this rebinds tree_pipeline, replacing the baseline pipeline defined earlier.
tree_steps = [
    ('scaler', StandardScaler()),
    ('regressor', DecisionTreeRegressor(random_state=42)),
]
tree_pipeline = Pipeline(steps=tree_steps)

# Step 2: candidate hyperparameter values for the randomized search.
param_grid_tree = dict(
    regressor__max_depth=[5, 10, 20],          # maximum tree depth
    regressor__min_samples_split=[2, 5, 10],   # minimum samples required to split a node
    regressor__min_samples_leaf=[1, 2, 4],     # minimum samples required in a leaf
)

# Step 3: randomized search — 5 sampled combinations, 3-fold CV, scored by negative MSE.
random_search_tree = RandomizedSearchCV(
    estimator=tree_pipeline,
    param_distributions=param_grid_tree,
    n_iter=5,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
)

# Step 4: run the search on the training split.
random_search_tree.fit(X_train, y_train)

# Step 5: evaluate the best estimator on the test split.
# y_pred_tree and mse_tree are overwritten with the tuned model's values.
best_tree_model = random_search_tree.best_estimator_
best_params_tree = random_search_tree.best_params_
y_pred_tree = best_tree_model.predict(X_test)
mse_tree = mean_squared_error(y_test, y_pred_tree)

print("Optimal Parameters:", best_params_tree)
print("Mean Squared Error (Best Decision Tree):", mse_tree)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for k-NN: neighbourhood size and neighbour weighting scheme.
param_grid_knn = dict(
    regressor__n_neighbors=[3, 5, 7, 10],
    regressor__weights=['uniform', 'distance'],
)

# Exhaustive grid search over the k-NN pipeline, 3-fold CV, scored by negative MSE.
knn_search = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid_knn,
    cv=3,
    scoring='neg_mean_squared_error',
)

# Run the search on the training split.
knn_search.fit(X_train, y_train)

# Evaluate the best estimator on the test split (overwrites the baseline y_pred_knn / mse_knn).
best_knn_model = knn_search.best_estimator_
best_params_knn = knn_search.best_params_
y_pred_knn = best_knn_model.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)

print("Optimal Parameters (k-NN):", best_params_knn)
print("Mean Squared Error (Best k-NN):", mse_knn)

In [None]:
# Candidate hyperparameters for XGBoost: ensemble size, tree depth and learning rate.
param_grid_xgb = dict(
    regressor__n_estimators=[50, 100, 200],
    regressor__max_depth=[3, 5, 7],
    regressor__learning_rate=[0.01, 0.1, 0.2],
)

# Randomized search over the XGBoost pipeline: 5 sampled combinations, 3-fold CV, negative MSE.
xgb_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_grid_xgb,
    n_iter=5,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
)

# Run the search on the training split.
xgb_search.fit(X_train, y_train)

# Evaluate the best estimator on the test split (overwrites the baseline y_pred_xgb / mse_xgb).
best_xgb_model = xgb_search.best_estimator_
best_params_xgb = xgb_search.best_params_
y_pred_xgb = best_xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

print("Optimal Parameters (XGBoost):", best_params_xgb)
print("Mean Squared Error (Best XGBoost):", mse_xgb)

In [None]:
# Hyperparameter search for the polynomial-regression pipeline: try polynomial degrees 1-3.
param_grid_poly = {
    'poly_features__degree': [1, 2, 3]
}

# Exhaustive grid search, 3-fold cross-validation, scored by negative MSE.
poly_search = GridSearchCV(
    poly_pipeline_reduced,
    param_grid=param_grid_poly,
    scoring='neg_mean_squared_error',
    cv=3
)

# Run the search on the SelectKBest-reduced training features.
poly_search.fit(X_train_reduced, y_train)

# Evaluate the best estimator on the reduced test features.
best_params_poly = poly_search.best_params_
y_pred_poly = poly_search.best_estimator_.predict(X_test_reduced)
mse_poly = mean_squared_error(y_test, y_pred_poly)

print("Optimal Parameters (Polynomial Regression):", best_params_poly)
print("Mean Squared Error (Best Polynomial Regression):", mse_poly)


In [None]:
import pandas as pd

# Collect each model's test-set MSE into one table for side-by-side comparison.
model_names = ['Decision Tree', 'k-NN', 'XGBoost', 'Polynomial Regression']
model_mse = [mse_tree, mse_knn, mse_xgb, mse_poly]

# Order the rows from lowest to highest MSE.
comparison_df = pd.DataFrame({
    'Model': model_names,
    'MSE': model_mse,
}).sort_values(by='MSE', ascending=True)
print(comparison_df)


In [None]:
# Scatter the actual test targets and the XGBoost predictions against the sample position.
sample_index = range(len(y_test))

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(sample_index, y_test, alpha=0.7, label='Actual')
ax.scatter(sample_index, y_pred_xgb, alpha=0.7, color='red', label='Predicted (XGBoost)')
ax.set_title("Actual vs Predicted (XGBoost)")
ax.set_xlabel("Sample Index")
ax.set_ylabel("Target Value")
ax.legend()
plt.show()
