In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pickle

data = pd.read_csv('data_1D.csv').drop(columns=['Unnamed: 0'])  


# **Pre-Processing**

In [None]:
X = data.drop(columns=['y'])
y = data['y']

In [20]:
numeric_features = ['age', 'duration', 'campaign', 'pdays', 'previous']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [23]:
X_preprocessed = preprocessor.fit_transform(X)

In [24]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# **Split Data**

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y_encoded, test_size=0.2, random_state=42)

# **Train Data**

In [29]:
#Logistic Regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

report_logreg = classification_report(y_test, y_pred_logreg)
print("Classification Report for Logistic Regression:\n", report_logreg)


Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0       0.91      0.98      0.95      1435
           1       0.75      0.35      0.47       213

    accuracy                           0.90      1648
   macro avg       0.83      0.66      0.71      1648
weighted avg       0.89      0.90      0.88      1648



In [30]:
#Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

report_rf = classification_report(y_test, y_pred_rf)
print("Classification Report for Random Forest:\n", report_rf)


Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      1435
           1       0.72      0.33      0.46       213

    accuracy                           0.90      1648
   macro avg       0.82      0.66      0.70      1648
weighted avg       0.88      0.90      0.88      1648



In [28]:
logreg_f1 = classification_report(y_test, y_pred_logreg, output_dict=True)['weighted avg']['f1-score']
rf_f1 = classification_report(y_test, y_pred_rf, output_dict=True)['weighted avg']['f1-score']

if rf_f1 > logreg_f1:
    best_model = rf
    print("Random Forest selected as the best model")
else:
    best_model = logreg
    print("Logistic Regression selected as the best model")

Logistic Regression selected as the best model


# **Analysis**

**Logistic Regression**
Akurasi: 90%
Precision untuk Kelas 1 ("yes"): 0.75
Recall untuk Kelas 1 ("yes"): 0.35
F1-score untuk Kelas 1 ("yes"): 0.47

**Random Forest**
Akurasi: 90%
Precision untuk Kelas 1 ("yes"): 0.72
Recall untuk Kelas 1 ("yes"): 0.33
F1-score untuk Kelas 1 ("yes"): 0.46

Logistic Regression memiliki recall yang lebih tinggi untuk Kelas 1, yang berarti lebih baik dalam mengidentifikasi pelanggan yang benar-benar akan berlangganan. Akurasi keseluruhan juga sedikit lebih tinggi pada Logistic Regression. Maka dari itu, Logistic Regression adalah model terbaik.

# **Save Best Model in Pickle**

In [31]:
with open('BestModel.pkl', 'wb') as f:
    pickle.dump(rf, f)
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)
with open('labelEncoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)