# **1. Import Library**

In [None]:
#import library for clasification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# **2. Memuat Dataset dari Hasil Clustering**

In [None]:
#load dataset
df = pd.read_csv('ds_salaries_labeled.csv')
df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,remote_ratio,company_size,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT,job_title_freq,employee_residence_encoded,company_location_encoded,cluster
0,2023,3,85847,100,3,False,False,True,False,8,47,44,1
1,2023,2,30000,100,1,True,False,False,False,34,1893,1929,2
2,2023,2,25500,100,1,True,False,False,False,34,1893,1929,2
3,2023,3,175000,100,2,False,False,True,False,538,81,83,1
4,2023,3,120000,100,2,False,False,True,False,538,81,83,1


# **3. Data Splitting**

In [None]:
#separate features(X) and target(y ='cluster')
X = df.drop(['cluster'], axis=1)
y = df['cluster']

#split data training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#verify the split
print(f"Jumlah data latih: {len(X_train)}")
print(f"Jumlah data uji: {len(X_test)}")

Jumlah data latih: 2067
Jumlah data uji: 517


# **4. Membangun Model Klasifikasi**


## **a. Membangun Model Klasifikasi**

In [None]:
#initialize model
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

Tulis narasi atau penjelasan algoritma yang Anda gunakan.

## **b. Evaluasi Model Klasifikasi**

In [None]:
#train and evaluate model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"Model: {name}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", conf_matrix)
    print("="*50)

Model: Logistic Regression
Accuracy: 0.9477756286266924
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       386
           1       0.90      0.89      0.89       119
           2       0.00      0.00      0.00        12

    accuracy                           0.95       517
   macro avg       0.62      0.63      0.62       517
weighted avg       0.93      0.95      0.94       517

Confusion Matrix:
 [[384   2   0]
 [ 13 106   0]
 [  2  10   0]]
Model: Decision Tree
Accuracy: 0.9806576402321083
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       386
           1       0.95      0.97      0.96       119
           2       1.00      1.00      1.00        12

    accuracy                           0.98       517
   macro avg       0.98      0.98      0.98       517
weighted avg       0.98      0.98      0.98       517

Confusion Matrix:
 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Random Forest
Accuracy: 0.988394584139265
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       386
           1       0.98      0.97      0.97       119
           2       1.00      1.00      1.00        12

    accuracy                           0.99       517
   macro avg       0.99      0.99      0.99       517
weighted avg       0.99      0.99      0.99       517

Confusion Matrix:
 [[384   2   0]
 [  4 115   0]
 [  0   0  12]]
Model: K-Nearest Neighbors
Accuracy: 0.9361702127659575
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97       386
           1       0.91      0.84      0.87       119
           2       0.00      0.00      0.00        12

    accuracy                           0.94       517
   macro avg       0.62      0.61      0.61       517
weighted avg       0.91      0.94      0.92       517

Confusion Matrix:
 [

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Tulis hasil evaluasi algoritma yang digunakan, jika Anda menggunakan 2 algoritma, maka bandingkan hasilnya.

## **c. Tuning Model Klasifikasi (Optional)**

Gunakan GridSearchCV, RandomizedSearchCV, atau metode lainnya untuk mencari kombinasi hyperparameter terbaik

In [None]:
#GridSearchCV for Logistic Regression
param_grid = {
    'penalty': ['l2', 'none'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}

# initial model and GridSearchCV
log_reg = LogisticRegression(max_iter=1000)
grid_search_log_reg = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')

# Fit model to data
grid_search_log_reg.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search_log_reg.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search_log_reg.best_score_}")



Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.9733960299914612


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

In [None]:
# GridSearchCV untuk Decision Tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize model and GridSearchCV
dt = DecisionTreeClassifier()
grid_search_dt = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy')

# Fit model to data
grid_search_dt.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search_dt.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search_dt.best_score_}")

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Cross-Validation Accuracy: 0.9883940999637388


In [None]:
# GridSearchCV untuk Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# initialize model and GridSearchCV
rf = RandomForestClassifier()
grid_search_rf = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')

# Fit model to data
grid_search_rf.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search_rf.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search_rf.best_score_}")


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Cross-Validation Accuracy: 0.9903299762548106


In [None]:
# RandomizedSearchCV untuk K-Nearest Neighbors
param_dist = {
    'n_neighbors': np.arange(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# initialize model and RandomizedSearchCV
knn = KNeighborsClassifier()
random_search_knn = RandomizedSearchCV(knn, param_dist, n_iter=100, cv=5, scoring='accuracy')

# Fit model to data
random_search_knn.fit(X_train, y_train)

print(f"Best Hyperparameters: {random_search_knn.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search_knn.best_score_}")

Best Hyperparameters: {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan'}
Best Cross-Validation Accuracy: 0.9651787907499034


In [None]:
# print all best hyperparamaters and best cross - validation
print("Best Hyperparameters and Best Cross-Validation Accuracy for model logistic regression")
print(f"Best Hyperparameters: {grid_search_log_reg.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search_log_reg.best_score_}")
print("="*50)
print("Best Hyperparameters and Best Cross-Validation Accuracy for model decision tree")
print(f"Best Hyperparameters: {grid_search_dt.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search_dt.best_score_}")
print("="*50)
print("Best Hyperparameters and Best Cross-Validation Accuracy for model random forest")
print(f"Best Hyperparameters: {grid_search_rf.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search_rf.best_score_}")
print("="*50)
print("Best Hyperparameters and Best Cross-Validation Accuracy for model k-nearest neighbors")
print(f"Best Hyperparameters: {random_search_knn.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search_knn.best_score_}")

Best Hyperparameters and Best Cross-Validation Accuracy for model logistic regression
Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.9733960299914612
Best Hyperparameters and Best Cross-Validation Accuracy for model decision tree
Best Hyperparameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Cross-Validation Accuracy: 0.9883940999637388
Best Hyperparameters and Best Cross-Validation Accuracy for model random forest
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Cross-Validation Accuracy: 0.9903299762548106
Best Hyperparameters and Best Cross-Validation Accuracy for model k-nearest neighbors
Best Hyperparameters: {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan'}
Best Cross-Validation Accuracy: 0.9651787907499034


## **d. Evaluasi Model Klasifikasi setelah Tuning (Optional)**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Model logistic regression with best hyperparameter
log_reg_best = LogisticRegression(C=0.1, penalty='l2', solver='liblinear')
log_reg_best.fit(X_train, y_train)
log_reg_predictions = log_reg_best.predict(X_test)

# Model decision tree with best hyperparameter
decision_tree_best = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_leaf=2, min_samples_split=5)
decision_tree_best.fit(X_train, y_train)
decision_tree_predictions = decision_tree_best.predict(X_test)

# Model random forest with best hyperparameter
random_forest_best = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300)
random_forest_best.fit(X_train, y_train)
random_forest_predictions = random_forest_best.predict(X_test)

# Model k-nearest neighbors with best hyperparameter
knn_best = KNeighborsClassifier(weights='distance', n_neighbors=3, metric='manhattan')
knn_best.fit(X_train, y_train)
knn_predictions = knn_best.predict(X_test)

# Evaluation performance each model
models = {
    "Logistic Regression": log_reg_best,
    "Decision Tree": decision_tree_best,
    "Random Forest": random_forest_best,
    "K-Nearest Neighbors": knn_best
}

for model_name, model in models.items():
    print(f"Model: {model_name}")
    predictions = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("="*50)


Model: Logistic Regression
Accuracy: 0.9458413926499033
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       386
           1       0.89      0.89      0.89       119
           2       0.00      0.00      0.00        12

    accuracy                           0.95       517
   macro avg       0.62      0.63      0.62       517
weighted avg       0.92      0.95      0.93       517

Confusion Matrix:
 [[383   3   0]
 [ 13 106   0]
 [  2  10   0]]
Model: Decision Tree
Accuracy: 0.9787234042553191
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       386
           1       0.95      0.96      0.95       119
           2       1.00      1.00      1.00        12

    accuracy                           0.98       517
   macro avg       0.98      0.98      0.98       517
weighted avg       0.98      0.98      0.98       517

Confusion Matrix:
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 0.9342359767891683
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97       386
           1       0.89      0.87      0.88       119
           2       0.25      0.08      0.12        12

    accuracy                           0.93       517
   macro avg       0.70      0.65      0.66       517
weighted avg       0.92      0.93      0.93       517

Confusion Matrix:
 [[378   7   1]
 [ 13 104   2]
 [  5   6   1]]


## **e. Analisis Hasil Evaluasi Model Klasifikasi**

**Perbandingan Hasil Evaluasi Sebelum dan Setelah Hyperparameter Tuning**

1. Logistic Regression
  - Sebelum Hyperparameter Tuning:
    - Cross-Validation Accuracy: 0.9734
  - Setelah Hyperparameter Tuning:
    -  Accuracy: 0.9458
    - Precision & Recall: Menurun pada kelas 2, terlihat dari f1-score = 0.00
    - Confusion Matrix: Kesalahan pada kelas 2 meningkat, model tidak memprediksi kelas 2 dengan baik.
2. Decision Tree
  - Sebelum Hyperparameter Tuning:
    - Cross-Validation Accuracy: 0.9884
  - Setelah Hyperparameter Tuning:
    - Accuracy: 0.9787
    - Precision & Recall: Hampir sempurna di semua kelas, terutama kelas 2 (f1-score = 1.00).
    - Confusion Matrix: Kesalahan prediksi lebih sedikit dibandingkan logistic regression, akurasi sangat mendekati hasil cross-validation.
3. Random Forest
  - Sebelum Hyperparameter Tuning:
    - Cross-Validation Accuracy: 0.9903
  - Setelah Hyperparameter Tuning:
    - Accuracy: 0.9865
    - Precision & Recall: Hampir sempurna untuk semua kelas, termasuk kelas 2 (f1-score = 1.00).
    - Confusion Matrix: Prediksi sangat akurat, hanya sedikit kesalahan.
4. K-Nearest Neighbors
  - Sebelum Hyperparameter Tuning:
    - Cross-Validation Accuracy: 0.9652
  - Setelah Hyperparameter Tuning:
    - Accuracy: 0.93
    - Precision & Recall: Menurun pada kelas 2 (f1-score = 0.12) dan kelas 1 sedikit lebih rendah dari random forest.
    - Confusion Matrix: Kesalahan pada kelas 2 cukup tinggi, model sering salah mengklasifikasikan kelas 2 ke kelas lainnya.

---

Random Forest menjadi model terbaik karena:
Akurasi tinggi pada cross-validation dan evaluasi akhir dan
Menangani semua kelas, termasuk kelas minoritas (kelas 2), dengan sangat baik.
