In [1]:
pip install scikit-learn==1.4.2



In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [3]:
df = pd.read_csv('/content/train.csv', index_col= 0)
df.head()

Unnamed: 0_level_0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
542,1 - 200 DM,12,critical,car,2366,500 - 1000 DM,4 - 7 years,3,3,36,none,own,1,management,1,yes,0
441,1 - 200 DM,12,good,furniture/appliances,2930,< 100 DM,4 - 7 years,2,1,27,none,own,1,skilled,1,no,0
483,unknown,12,critical,furniture/appliances,1240,unknown,> 7 years,4,2,38,none,own,2,skilled,1,yes,0
423,1 - 200 DM,18,good,car,6260,< 100 DM,4 - 7 years,3,3,28,none,rent,1,unskilled,1,no,0
779,unknown,12,good,furniture/appliances,1262,< 100 DM,1 - 4 years,3,2,25,none,own,1,skilled,1,no,0


In [4]:
df.describe()

Unnamed: 0,months_loan_duration,amount,percent_of_income,years_at_residence,age,existing_loans_count,dependents,default
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,20.835714,3285.072857,2.97,2.805714,35.31,1.414286,1.162857,0.315714
std,11.922623,2847.08997,1.124808,1.106346,11.140906,0.595473,0.369499,0.465132
min,4.0,276.0,1.0,1.0,19.0,1.0,1.0,0.0
25%,12.0,1371.75,2.0,2.0,27.0,1.0,1.0,0.0
50%,18.0,2326.0,3.0,3.0,33.0,1.0,1.0,0.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0,1.0
max,72.0,18424.0,4.0,4.0,75.0,4.0,2.0,1.0


In [5]:
df.isnull().sum()

Unnamed: 0,0
checking_balance,0
months_loan_duration,0
credit_history,0
purpose,0
amount,0
savings_balance,0
employment_duration,0
percent_of_income,0
years_at_residence,0
age,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 700 entries, 542 to 103
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      700 non-null    object
 1   months_loan_duration  700 non-null    int64 
 2   credit_history        700 non-null    object
 3   purpose               700 non-null    object
 4   amount                700 non-null    int64 
 5   savings_balance       700 non-null    object
 6   employment_duration   700 non-null    object
 7   percent_of_income     700 non-null    int64 
 8   years_at_residence    700 non-null    int64 
 9   age                   700 non-null    int64 
 10  other_credit          700 non-null    object
 11  housing               700 non-null    object
 12  existing_loans_count  700 non-null    int64 
 13  job                   700 non-null    object
 14  dependents            700 non-null    int64 
 15  phone                 700 non-null    objec

In [7]:
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    print(f"Unique values in column '{col}': {df[col].unique()}")

Unique values in column 'checking_balance': ['1 - 200 DM' 'unknown' '< 0 DM' '> 200 DM']
Unique values in column 'credit_history': ['critical' 'good' 'very good' 'poor' 'perfect']
Unique values in column 'purpose': ['car' 'furniture/appliances' 'education' 'business' 'renovations' 'car0']
Unique values in column 'savings_balance': ['500 - 1000 DM' '< 100 DM' 'unknown' '100 - 500 DM' '> 1000 DM']
Unique values in column 'employment_duration': ['4 - 7 years' '> 7 years' '1 - 4 years' 'unemployed' '< 1 year']
Unique values in column 'other_credit': ['none' 'bank' 'store']
Unique values in column 'housing': ['own' 'rent' 'other']
Unique values in column 'job': ['management' 'skilled' 'unskilled' 'unemployed']
Unique values in column 'phone': ['yes' 'no']


In [8]:
df['checking_balance'] = df['checking_balance'].map({
    'unknown': -1,
    '< 0 DM': 0,
    '1 - 200 DM': 1,
    '> 200 DM': 2,
})

df['credit_history'] = df['credit_history'].map({
    'critical' : 1,
    'poor' : 2,
    'good' : 3,
    'very good' : 4,
    'perfect' : 5

})

df['savings_balance'] = df['savings_balance'].map({
    '< 100 DM': 1,
    '100 - 500 DM': 2,
    '500 - 1000 DM': 3,
    '> 1000 DM': 4,
    'unknown': -1
})

df['employment_duration'] = df['employment_duration'].map({
    'unemployed': 0,
    '< 1 year': 1,
    '1 - 4 years': 2,
    '4 - 7 years': 3,
    '> 7 years': 4
})

df['checking_balance'] = df['checking_balance'].fillna(0)
df['credit_history'] = df['credit_history'].fillna(0)
df['savings_balance'] = df['savings_balance'].fillna(0)
df['employment_duration'] = df['employment_duration'].fillna(0)

In [9]:
df = pd.get_dummies(df, columns=['phone', 'purpose', 'other_credit', 'housing', 'job'])

In [10]:
columns = df[['months_loan_duration',	'amount',	'percent_of_income',	'years_at_residence',	'age',	'existing_loans_count',	'dependents']]
scaler = MinMaxScaler()
scaled_columns = scaler.fit_transform(columns)
df[['months_loan_duration', 'amount', 'percent_of_income',
    'years_at_residence', 'age', 'existing_loans_count', 'dependents']] = scaled_columns


In [11]:
df.head()

Unnamed: 0_level_0,checking_balance,months_loan_duration,credit_history,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,existing_loans_count,...,other_credit_bank,other_credit_none,other_credit_store,housing_other,housing_own,housing_rent,job_management,job_skilled,job_unemployed,job_unskilled
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
542,1,0.117647,1,0.115164,3,3,0.666667,0.666667,0.303571,0.0,...,False,True,False,False,True,False,True,False,False,False
441,1,0.117647,3,0.146242,1,3,0.333333,0.0,0.142857,0.0,...,False,True,False,False,True,False,False,True,False,False
483,-1,0.117647,1,0.053119,-1,4,1.0,0.333333,0.339286,0.333333,...,False,True,False,False,True,False,False,True,False,False
423,1,0.205882,3,0.329733,1,3,0.666667,0.666667,0.160714,0.0,...,False,True,False,False,False,True,False,False,False,True
779,-1,0.117647,3,0.054331,1,2,0.666667,0.333333,0.107143,0.0,...,False,True,False,False,True,False,False,True,False,False


In [12]:
X = df.drop('default', axis=1)
y = df['default']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [14]:
X_train.head()

Unnamed: 0_level_0,checking_balance,months_loan_duration,credit_history,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,existing_loans_count,...,other_credit_bank,other_credit_none,other_credit_store,housing_other,housing_own,housing_rent,job_management,job_skilled,job_unemployed,job_unskilled
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
793,0,0.029412,3,0.02127,1,1,0.666667,1.0,0.392857,0.0,...,False,True,False,False,True,False,False,False,False,True
275,-1,0.073529,3,0.064635,1,3,0.666667,0.333333,0.142857,0.333333,...,False,True,False,False,True,False,False,True,False,False
357,0,0.294118,3,0.162993,1,1,1.0,1.0,0.071429,0.0,...,False,True,False,False,False,True,False,False,False,True
277,0,0.294118,4,0.16933,3,1,1.0,1.0,0.196429,0.0,...,False,True,False,True,False,False,False,True,False,False
728,0,0.647059,3,0.370454,1,3,0.0,0.0,0.267857,0.333333,...,False,True,False,False,True,False,False,True,False,False


In [15]:
!pip install pycaret



In [16]:
pip install pycaret[full]

Collecting shap~=0.44.0 (from pycaret[full])
  Using cached shap-0.44.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting interpret>=0.2.7 (from pycaret[full])
  Using cached interpret-0.6.8-py3-none-any.whl.metadata (1.0 kB)
Collecting umap-learn>=0.5.2 (from pycaret[full])
  Using cached umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting ydata-profiling>=4.3.1 (from pycaret[full])
  Using cached ydata_profiling-4.12.1-py2.py3-none-any.whl.metadata (20 kB)
Collecting explainerdashboard>=0.3.8 (from pycaret[full])
  Using cached explainerdashboard-0.4.7-py3-none-any.whl.metadata (3.8 kB)
Collecting fairlearn==0.7.0 (from pycaret[full])
  Using cached fairlearn-0.7.0-py3-none-any.whl.metadata (7.3 kB)
Collecting kmodes>=0.11.1 (from pycaret[full])
  Using cached kmodes-0.12.2-py2.py3-none-any.whl.metadata (8.1 kB)
Collecting statsforecast<1.6.0,>=0.5.5 (from pycaret[full])
  Using cached statsfore

In [17]:
import pycaret
print(pycaret.__version__)

3.3.2


In [18]:
from pycaret.classification import *

In [19]:
s = setup(df,target='default',session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,default
2,Target type,Binary
3,Original data shape,"(700, 30)"
4,Transformed data shape,"(700, 30)"
5,Transformed train set shape,"(489, 30)"
6,Transformed test set shape,"(211, 30)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


In [20]:
best=s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7466,0.7428,0.5142,0.6139,0.553,0.3812,0.3871,0.649
catboost,CatBoost Classifier,0.7465,0.7742,0.4558,0.6372,0.5246,0.3613,0.3733,2.493
et,Extra Trees Classifier,0.7342,0.7283,0.41,0.6114,0.4806,0.3181,0.3314,0.326
knn,K Neighbors Classifier,0.7299,0.7199,0.4988,0.584,0.5312,0.3455,0.351,0.15
xgboost,Extreme Gradient Boosting,0.7281,0.7537,0.5271,0.5759,0.5434,0.353,0.358,0.223
gbc,Gradient Boosting Classifier,0.7198,0.7618,0.4733,0.5637,0.5108,0.3179,0.3221,0.193
rf,Random Forest Classifier,0.7157,0.7558,0.3646,0.5629,0.4365,0.2654,0.2756,0.258
ada,Ada Boost Classifier,0.7014,0.7118,0.4212,0.5315,0.4659,0.2647,0.2697,0.151
lda,Linear Discriminant Analysis,0.6932,0.6881,0.2988,0.5352,0.3791,0.1966,0.214,0.048
lr,Logistic Regression,0.6912,0.6984,0.2796,0.4992,0.3547,0.1797,0.1906,1.738


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [21]:
print(best)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)


In [22]:
tuned_model = tune_model(best)
print(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7551,0.8157,0.6,0.6,0.6,0.4235,0.4235
1,0.7143,0.751,0.4667,0.5385,0.5,0.3014,0.3029
2,0.7551,0.6706,0.4,0.6667,0.5,0.351,0.3711
3,0.7143,0.7725,0.4,0.5455,0.4615,0.2733,0.2794
4,0.7347,0.6569,0.3333,0.625,0.4348,0.2818,0.3056
5,0.7551,0.7898,0.5625,0.6429,0.6,0.4247,0.4266
6,0.7143,0.6913,0.4375,0.5833,0.5,0.3057,0.3119
7,0.7551,0.7519,0.4375,0.7,0.5385,0.3836,0.4033
8,0.6939,0.7311,0.3125,0.5556,0.4,0.2156,0.2317
9,0.7083,0.7556,0.4,0.5455,0.4615,0.268,0.274


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)


In [23]:
pip install lightgbm



In [24]:
import lightgbm as lgb

In [25]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb

# Inisialisasi model LightGBM dengan parameter terbaik
lgb_model = lgb.LGBMClassifier(
    learning_rate=0.1,
    max_depth=10,
    min_data_in_leaf=20,
    n_estimators=300,
    num_leaves=31,
    class_weight='balanced',
    random_state=123
)

# Train dan evaluasi LightGBM
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)
print("LightGBM Accuracy:", accuracy_score(y_test, y_pred_lgb))
print("Classification Report for LightGBM:\n", classification_report(y_test, y_pred_lgb))

[LightGBM] [Info] Number of positive: 168, number of negative: 392
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 560, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LightGBM Accuracy: 0.7642857142857142
Classification Report for LightGBM:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82        87
           1       0.72      0.62      0.67        53

    accuracy                           0.76       140
   macro avg       0.75      0.74      0.74       140
weighted avg       0.76      0.76      0.76       140



In [26]:
# Evaluasi akurasi pada training data
train_accuracy = accuracy_score(y_train, lgb_model.predict(X_train))

# Evaluasi akurasi pada testing data
test_accuracy = accuracy_score(y_test, y_pred_lgb)

# Cetak hasil
print("LightGBM Training Accuracy:", train_accuracy)
print("LightGBM Test Accuracy:", test_accuracy)

# Analisis apakah ada overfitting
if train_accuracy - test_accuracy > 0.1:  # Threshold 10% untuk overfitting
    print("LightGBM might be overfitting.")
else:
    print("LightGBM does not show significant overfitting.")

LightGBM Training Accuracy: 1.0
LightGBM Test Accuracy: 0.7642857142857142
LightGBM might be overfitting.


In [27]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb

In [29]:
import joblib

# Simpan model
joblib.dump(lgb_model, 'final_model.pkl')

# Muat model
loaded_model = joblib.load('final_model.pkl')

In [30]:
# Dataset asli tanpa kolom target
original_dataset = pd.read_csv("/content/Teeestt.csv", index_col= 0)

# Periksa dataset
print(original_dataset.head())

# Pastikan kolom target 'default' belum ada
assert 'default' not in original_dataset.columns

# Dataset untuk prediksi (tanpa kolom target)
X_original = original_dataset.copy()  # Gunakan semua kolom untuk prediksi


    checking_balance  months_loan_duration credit_history  \
id                                                          
522          unknown                    24       critical   
738          unknown                    12           good   
741          unknown                    20       critical   
661          unknown                    18           good   
412          unknown                     6        perfect   

                  purpose  amount savings_balance employment_duration  \
id                                                                      
522                   car    2022        < 100 DM         1 - 4 years   
738  furniture/appliances    2141    100 - 500 DM         4 - 7 years   
741                   car    3485         unknown            < 1 year   
661  furniture/appliances    1453        < 100 DM            < 1 year   
412                   car    1204    100 - 500 DM         1 - 4 years   

     percent_of_income  years_at_residence  age other_credit

In [31]:
X_original['checking_balance'] = X_original['checking_balance'].map({
    'unknown': -1,
    '< 0 DM': 0,
    '1 - 200 DM': 1,
    '> 200 DM': 2,
})

X_original['credit_history'] = X_original['credit_history'].map({
    'critical' : 1,
    'poor' : 2,
    'good' : 3,
    'very good' : 4,
    'perfect' : 5

})

X_original['savings_balance'] = X_original['savings_balance'].map({
    '< 100 DM': 1,
    '100 - 500 DM': 2,
    '500 - 1000 DM': 3,
    '> 1000 DM': 4,
    'unknown': -1
})

X_original['employment_duration'] = X_original['employment_duration'].map({
    'unemployed': 0,
    '< 1 year': 1,
    '1 - 4 years': 2,
    '4 - 7 years': 3,
    '> 7 years': 4
})

X_original['checking_balance'] = X_original['checking_balance'].fillna(0)
X_original['credit_history'] = X_original['credit_history'].fillna(0)
X_original['savings_balance'] = X_original['savings_balance'].fillna(0)
X_original['employment_duration'] = X_original['employment_duration'].fillna(0)

In [32]:
X_original = pd.get_dummies(X_original, columns=['phone', 'purpose', 'other_credit', 'housing', 'job'])

In [33]:
columns = X_original[['months_loan_duration',	'amount',	'percent_of_income',	'years_at_residence',	'age',	'existing_loans_count',	'dependents']]
scaler = MinMaxScaler()
scaled_columns = scaler.fit_transform(columns)
X_original[['months_loan_duration', 'amount', 'percent_of_income',
    'years_at_residence', 'age', 'existing_loans_count', 'dependents']] = scaled_columns

In [34]:
X_original.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 522 to 156
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   checking_balance              300 non-null    int64  
 1   months_loan_duration          300 non-null    float64
 2   credit_history                300 non-null    int64  
 3   amount                        300 non-null    float64
 4   savings_balance               300 non-null    int64  
 5   employment_duration           300 non-null    int64  
 6   percent_of_income             300 non-null    float64
 7   years_at_residence            300 non-null    float64
 8   age                           300 non-null    float64
 9   existing_loans_count          300 non-null    float64
 10  dependents                    300 non-null    float64
 11  phone_no                      300 non-null    bool   
 12  phone_yes                     300 non-null    bool   
 13  purpose_

In [43]:
# Prediksi nilai target dengan model
original_dataset['default'] = lgb_model .predict(X_original)

# Lihat hasil prediksi
print(original_dataset[['default']].head())

# Simpan hasil ke file
original_dataset.to_csv("dataset_asli_with_predictions.csv", index=True)

     default
id          
522        0
738        0
741        0
661        0
412        0


In [44]:
dpred = pd.read_csv('/content/dataset_asli_with_predictions.csv')
dpred.head()

Unnamed: 0,id,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,522,unknown,24,critical,car,2022,< 100 DM,1 - 4 years,4,4,37,none,own,1,skilled,1,yes,0
1,738,unknown,12,good,furniture/appliances,2141,100 - 500 DM,4 - 7 years,3,1,35,none,own,1,skilled,1,no,0
2,741,unknown,20,critical,car,3485,unknown,< 1 year,2,4,44,none,own,2,skilled,1,yes,0
3,661,unknown,18,good,furniture/appliances,1453,< 100 DM,< 1 year,3,1,26,none,own,1,skilled,1,no,0
4,412,unknown,6,perfect,car,1204,100 - 500 DM,1 - 4 years,4,1,35,bank,rent,1,skilled,1,no,0


In [37]:
dfori = pd.read_csv('/content/solution kalsifikasi.csv')
dfori.head()

Unnamed: 0,id,default,Usage
0,522,0,Public
1,738,0,Public
2,741,0,Public
3,661,0,Public
4,412,0,Public


In [41]:
dfori.drop(columns = ['Usage'], inplace = True)

In [45]:
# Gabungkan berdasarkan kolom 'id'
merged = pd.merge(dpred, dfori, on='id', suffixes=('_df1', '_df2'))
print(merged)

      id checking_balance  months_loan_duration credit_history  \
0    522          unknown                    24       critical   
1    738          unknown                    12           good   
2    741          unknown                    20       critical   
3    661          unknown                    18           good   
4    412          unknown                     6        perfect   
..   ...              ...                   ...            ...   
295  469       1 - 200 DM                    24       critical   
296  936           < 0 DM                    12           good   
297  429          unknown                    48       critical   
298    8         > 200 DM                    12           good   
299  156           < 0 DM                    24           poor   

                  purpose  amount savings_balance employment_duration  \
0                     car    2022        < 100 DM         1 - 4 years   
1    furniture/appliances    2141    100 - 500 DM         4 -

In [46]:
# Tambahkan kolom untuk membandingkan
merged['match'] = merged['default_df1'] == merged['default_df2']

# Hitung akurasi
accuracy = merged['match'].mean() * 100
print(f"Akurasi: {accuracy:.2f}%")


Akurasi: 75.00%


In [48]:
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Inisialisasi model dasar LightGBM
lgb_model = LGBMClassifier()

# Kombinasi LightGBM dengan Bagging
bagging_lgb = BaggingClassifier(
    estimator=lgb_model,    # Ubah ke estimator
    n_estimators=10,
    max_samples=0.8,
    max_features=1.0,
    n_jobs=-1,
    random_state=42
)

# Training model pada data training
bagging_lgb.fit(X_train, y_train)

# Prediksi
y_pred_train_bagging = bagging_lgb.predict(X_train)
y_pred_test_bagging = bagging_lgb.predict(X_test)

# Evaluasi akurasi
train_accuracy = accuracy_score(y_train, y_pred_train_bagging)
test_accuracy = accuracy_score(y_test, y_pred_test_bagging)

print("Bagging LightGBM Training Accuracy:", train_accuracy)
print("Bagging LightGBM Test Accuracy:", test_accuracy)

if train_accuracy - test_accuracy > 0.1:
    print("Bagging LightGBM might be overfitting.")
else:
    print("Bagging LightGBM does not show significant overfitting.")

Bagging LightGBM Training Accuracy: 0.9589285714285715
Bagging LightGBM Test Accuracy: 0.7714285714285715
Bagging LightGBM might be overfitting.


In [50]:
import joblib

# Simpan model
joblib.dump(bagging_lgb, 'final_model2.pkl')

# Muat model
loaded_model = joblib.load('final_model2.pkl')

In [51]:
# Prediksi nilai target dengan model
original_dataset['default'] = bagging_lgb .predict(X_original)

# Lihat hasil prediksi
print(original_dataset[['default']].head())

# Simpan hasil ke file
original_dataset.to_csv("dataset_asli_with_predictions2.csv", index=True)

     default
id          
522        0
738        0
741        0
661        0
412        0


In [52]:
dfori2 = pd.read_csv('/content/dataset_asli_with_predictions2.csv')
dfori2.head()

Unnamed: 0,id,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,522,unknown,24,critical,car,2022,< 100 DM,1 - 4 years,4,4,37,none,own,1,skilled,1,yes,0
1,738,unknown,12,good,furniture/appliances,2141,100 - 500 DM,4 - 7 years,3,1,35,none,own,1,skilled,1,no,0
2,741,unknown,20,critical,car,3485,unknown,< 1 year,2,4,44,none,own,2,skilled,1,yes,0
3,661,unknown,18,good,furniture/appliances,1453,< 100 DM,< 1 year,3,1,26,none,own,1,skilled,1,no,0
4,412,unknown,6,perfect,car,1204,100 - 500 DM,1 - 4 years,4,1,35,bank,rent,1,skilled,1,no,0


In [53]:
# Gabungkan berdasarkan kolom 'id'
merged = pd.merge(dpred, dfori2, on='id', suffixes=('_df1', '_df2'))
print(merged)

      id checking_balance_df1  months_loan_duration_df1 credit_history_df1  \
0    522              unknown                        24           critical   
1    738              unknown                        12               good   
2    741              unknown                        20           critical   
3    661              unknown                        18               good   
4    412              unknown                         6            perfect   
..   ...                  ...                       ...                ...   
295  469           1 - 200 DM                        24           critical   
296  936               < 0 DM                        12               good   
297  429              unknown                        48           critical   
298    8             > 200 DM                        12               good   
299  156               < 0 DM                        24               poor   

              purpose_df1  amount_df1 savings_balance_df1  \
0 

In [54]:
# Tambahkan kolom untuk membandingkan
merged['match'] = merged['default_df1'] == merged['default_df2']

# Hitung akurasi
accuracy = merged['match'].mean() * 100
print(f"Akurasi: {accuracy:.2f}%")

Akurasi: 92.00%
