In [27]:
# Optimized parameters for Wine Quality Dataset
# {
#     "Decision Tree": {"max_depth": 20, "min_samples_leaf": 1, "min_samples_split": 2},
#     "SVM": {"C": 10, "kernel": "rbf"},
#     "Logistic Regression": {
#         "C": 100,
#         "l1_ratio": None,
#         "penalty": "l2",
#         "solver": "saga",
#     },
#     "Naive Bayes": {"var_smoothing": 1e-06},
# }

In [28]:
%pip install pandas
%pip install matplotlib
%pip install imbalanced-learn
%pip install numpy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [29]:
# Decision Tree - SMOTEENN - all/auto/not minority
# SVM - SMOTEENN - Minority Class
# Logistic Regression - SMOTEENN - Minority Class
# Naive Bayes - SMOTEENN - Minority Class

In [30]:
import pandas as pd

In [31]:
wine_data = pd.read_csv("winequalityN.csv")

# Handle missing values by filling them with the mean of each column
numeric_mean = wine_data.select_dtypes(include=[float, int]).mean()

# Fill missing values in the DataFrame with the computed means
wine_data_filled = wine_data.fillna(numeric_mean)

# Find rows with NaN values
rows_with_nan = wine_data_filled.isna().any(axis=1)

# Display rows that contain at least one NaN value
nan_rows = wine_data_filled[rows_with_nan]

nan_rows

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality


In [32]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [33]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE


label_encoder = LabelEncoder()
wine_data_filled["type"] = label_encoder.fit_transform(wine_data_filled["type"])

X = wine_data_filled.drop("quality", axis=1)
y = wine_data_filled["quality"]

In [34]:
smote = SMOTE(sampling_strategy="auto", random_state=10, k_neighbors=4)
enn = EditedNearestNeighbours(sampling_strategy="auto", n_neighbors=4)

In [35]:
# Create SMOTEENN instance with dynamic n_neighbors
smote_enn = SMOTEENN(sampling_strategy='auto', random_state=10, smote=smote, enn=enn)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.3, random_state=10
)

In [36]:
# Decision Tree
clf = DecisionTreeClassifier(max_depth=20, min_samples_leaf=1, min_samples_split=2)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="weighted")
print(f"F1 Score for Decision Tree: {f1:.4f}")
print(f"Accuracy for Decision Tree: {accuracy:.4f}")
print(
    f"Classification Report for Decision Tree:\n",
    classification_report(y_test, predictions),
)
print("-" * 50)

F1 Score for Decision Tree: 0.9248
Accuracy for Decision Tree: 0.9253
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           3       0.96      0.98      0.97       873
           4       0.91      0.92      0.92       683
           5       0.78      0.74      0.76       216
           6       0.59      0.60      0.60        70
           7       0.88      0.84      0.86       507
           8       0.92      0.93      0.92       732
           9       0.99      1.00      1.00       803

    accuracy                           0.93      3884
   macro avg       0.86      0.86      0.86      3884
weighted avg       0.92      0.93      0.92      3884

--------------------------------------------------


In [37]:
# Logistic Regression
clf = LogisticRegression(C=100, l1_ratio=None, penalty="l2", solver="saga")
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="weighted")
print(f"F1 Score for Decision Tree: {f1:.4f}")
print(f"Accuracy for Decision Tree: {accuracy:.4f}")
print(
    f"Classification Report for Decision Tree:\n",
    classification_report(y_test, predictions),
)
print("-" * 50)

F1 Score for Decision Tree: 0.4099
Accuracy for Decision Tree: 0.4704
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           3       0.43      0.61      0.50       873
           4       0.55      0.53      0.54       683
           5       0.00      0.00      0.00       216
           6       0.00      0.00      0.00        70
           7       0.00      0.00      0.00       507
           8       0.45      0.37      0.41       732
           9       0.48      0.83      0.61       803

    accuracy                           0.47      3884
   macro avg       0.27      0.33      0.29      3884
weighted avg       0.38      0.47      0.41      3884

--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
# NB
clf = GaussianNB(var_smoothing=1e-06)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="weighted")
print(f"F1 Score for Decision Tree: {f1:.4f}")
print(f"Accuracy for Decision Tree: {accuracy:.4f}")
print(
    f"Classification Report for Decision Tree:\n",
    classification_report(y_test, predictions),
)
print("-" * 50)

F1 Score for Decision Tree: 0.4753
Accuracy for Decision Tree: 0.5054
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           3       0.77      0.42      0.54       873
           4       0.53      0.56      0.55       683
           5       0.20      0.19      0.19       216
           6       0.13      0.27      0.18        70
           7       0.31      0.08      0.13       507
           8       0.39      0.43      0.41       732
           9       0.57      0.99      0.72       803

    accuracy                           0.51      3884
   macro avg       0.41      0.42      0.39      3884
weighted avg       0.51      0.51      0.48      3884

--------------------------------------------------


In [39]:
# SVM
clf = SVC(C=10, kernel="rbf")
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="weighted")
print(f"F1 Score for Decision Tree: {f1:.4f}")
print(f"Accuracy for Decision Tree: {accuracy:.4f}")
print(
    f"Classification Report for Decision Tree:\n",
    classification_report(y_test, predictions),
)
print("-" * 50)

F1 Score for Decision Tree: 0.5130
Accuracy for Decision Tree: 0.5765
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           3       0.54      0.73      0.62       873
           4       0.57      0.55      0.56       683
           5       0.25      0.00      0.01       216
           6       0.00      0.00      0.00        70
           7       0.61      0.03      0.06       507
           8       0.50      0.58      0.54       732
           9       0.67      0.97      0.79       803

    accuracy                           0.58      3884
   macro avg       0.45      0.41      0.37      3884
weighted avg       0.55      0.58      0.51      3884

--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# Optimized parameters for Good Customer Dataset
# {
#     "Decision Tree": {"max_depth": None, "min_samples_leaf": 2, "min_samples_split": 5},
#     "SVM": {"C": 10, "kernel": "rbf"},
#     "Logistic Regression": {
#         "C": 0.1,
#         "l1_ratio": 0.6,
#         "penalty": "elasticnet",
#         "solver": "saga",
#     },
#     "Naive Bayes": {"var_smoothing": 1e-09},
# }

In [41]:
df = pd.read_csv("good_customer.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1723 entries, 0 to 1722
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   month                1723 non-null   int64 
 1   credit_amount        1723 non-null   int64 
 2   credit_term          1723 non-null   int64 
 3   age                  1723 non-null   int64 
 4   sex                  1723 non-null   object
 5   education            1723 non-null   object
 6   product_type         1723 non-null   object
 7   having_children_flg  1723 non-null   int64 
 8   region               1723 non-null   int64 
 9   income               1723 non-null   int64 
 10  family_status        1723 non-null   object
 11  phone_operator       1723 non-null   int64 
 12  is_client            1723 non-null   int64 
 13  bad_client_target    1723 non-null   int64 
dtypes: int64(10), object(4)
memory usage: 188.6+ KB


In [42]:
df.head(10)

Unnamed: 0,month,credit_amount,credit_term,age,sex,education,product_type,having_children_flg,region,income,family_status,phone_operator,is_client,bad_client_target
0,1,7000,12,39,male,Secondary special education,Cell phones,0,2,21000,Another,0,0,0
1,1,19000,6,20,male,Secondary special education,Household appliances,1,2,17000,Another,3,1,0
2,1,29000,12,23,female,Secondary special education,Household appliances,0,2,31000,Another,2,0,0
3,1,10000,12,30,male,Secondary special education,Cell phones,1,2,31000,Unmarried,3,1,0
4,1,14500,12,25,female,Higher education,Cell phones,0,2,26000,Married,0,1,0
5,1,32500,24,47,female,Secondary special education,Furniture,0,2,26000,Married,0,1,0
6,1,8000,3,23,male,Higher education,Computers,0,2,21000,Another,0,1,0
7,1,20000,10,25,female,Higher education,Household appliances,0,0,33000,Married,2,1,0
8,1,26000,6,21,female,Secondary special education,Cell phones,0,0,31000,Another,2,1,0
9,1,15000,24,25,female,Secondary special education,Household appliances,1,2,26000,Another,3,0,0


In [43]:
rows_with_nan = df.isna().any(axis=1)

# Display rows that contain at least one NaN value
nan_rows = df[rows_with_nan]

nan_rows

Unnamed: 0,month,credit_amount,credit_term,age,sex,education,product_type,having_children_flg,region,income,family_status,phone_operator,is_client,bad_client_target


In [44]:
label_encoder = LabelEncoder()
df["sex"] = label_encoder.fit_transform(df["sex"])
df["education"] = label_encoder.fit_transform(df["education"])
df["product_type"] = label_encoder.fit_transform(df["product_type"])
df["family_status"] = label_encoder.fit_transform(df["family_status"])


X = df.drop("bad_client_target", axis=1)
y = df["bad_client_target"]

In [45]:
# Decision Tree - SMOTEENN - not minority
# SVM - SMOTE - not minority
# Logistic Regression - SMOTEENN - not minority
# Naive Bayes - SMOTEENN - not minority

In [46]:
smote = SMOTE(sampling_strategy="auto", random_state=10, k_neighbors=4)
enn = EditedNearestNeighbours(sampling_strategy="auto", n_neighbors=4)

In [47]:
# Apply SMOTEENN and fit Decision Tree with best hyperparameters
smote_enn = SMOTEENN(sampling_strategy='not minority', random_state=10, smote=smote, enn=enn)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

clf = DecisionTreeClassifier(max_depth=None, min_samples_leaf=2, min_samples_split=5)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="weighted")
print(f"F1 Score for Decision Tree: {f1:.4f}")
print(f"Accuracy for Decision Tree: {accuracy:.4f}")
print(
    f"Classification Report for Decision Tree:\n",
    classification_report(y_test, predictions),
)
print("-" * 50)

F1 Score for Decision Tree: 0.9033
Accuracy for Decision Tree: 0.9040
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           3       0.94      0.98      0.96       873
           4       0.88      0.91      0.89       683
           5       0.75      0.64      0.69       216
           6       0.47      0.54      0.50        70
           7       0.81      0.80      0.80       507
           8       0.93      0.89      0.91       732
           9       0.99      1.00      1.00       803

    accuracy                           0.90      3884
   macro avg       0.83      0.82      0.82      3884
weighted avg       0.90      0.90      0.90      3884

--------------------------------------------------


In [48]:
# Apply SMOTE to fit other models with best hyperparameters
smote = SMOTE(sampling_strategy="not minority", random_state=10, k_neighbors=4)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [49]:
# fit SVM with best hyperparameters
clf = SVC(C=10, kernel="rbf")
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="weighted")
print(f"F1 Score for SVM: {f1:.4f}")
print(f"Accuracy for SVM: {accuracy:.4f}")
print(
    f"Classification Report for SVM:\n",
    classification_report(y_test, predictions),
)
print("-" * 50)

F1 Score for SVM: 0.5130
Accuracy for SVM: 0.5765
Classification Report for SVM:
               precision    recall  f1-score   support

           3       0.54      0.73      0.62       873
           4       0.57      0.55      0.56       683
           5       0.25      0.00      0.01       216
           6       0.00      0.00      0.00        70
           7       0.61      0.03      0.06       507
           8       0.50      0.58      0.54       732
           9       0.67      0.97      0.79       803

    accuracy                           0.58      3884
   macro avg       0.45      0.41      0.37      3884
weighted avg       0.55      0.58      0.51      3884

--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
# fit Naive Bayes with best hyperparameters
clf = GaussianNB(var_smoothing=1e-09)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="weighted")
print(f"F1 Score for Naive Bayes: {f1:.4f}")
print(f"Accuracy for Naive Bayes: {accuracy:.4f}")
print(
    f"Classification Report for Naive Bayes:\n",
    classification_report(y_test, predictions),
)
print("-" * 50)

F1 Score for Naive Bayes: 0.4753
Accuracy for Naive Bayes: 0.5033
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

           3       0.76      0.44      0.56       873
           4       0.53      0.51      0.52       683
           5       0.18      0.18      0.18       216
           6       0.12      0.33      0.18        70
           7       0.24      0.06      0.10       507
           8       0.39      0.44      0.41       732
           9       0.59      1.00      0.74       803

    accuracy                           0.50      3884
   macro avg       0.40      0.42      0.38      3884
weighted avg       0.50      0.50      0.48      3884

--------------------------------------------------


In [51]:
# fit Logistic Regression with best hyperparameters
clf = LogisticRegression(C=0.1, l1_ratio=0.6, penalty="elasticnet", solver="saga")
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="weighted")
print(f"F1 Score for Logistic Regression: {f1:.4f}")
print(f"Accuracy for Logistic Regression: {accuracy:.4f}")
print(
    f"Classification Report for Logistic Regression:\n",
    classification_report(y_test, predictions),
)
print("-" * 50)

F1 Score for Logistic Regression: 0.4094
Accuracy for Logistic Regression: 0.4699
Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           3       0.43      0.61      0.50       873
           4       0.55      0.53      0.54       683
           5       0.00      0.00      0.00       216
           6       0.00      0.00      0.00        70
           7       0.00      0.00      0.00       507
           8       0.45      0.37      0.41       732
           9       0.48      0.83      0.61       803

    accuracy                           0.47      3884
   macro avg       0.27      0.33      0.29      3884
weighted avg       0.38      0.47      0.41      3884

--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
