In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [4]:
# Read the dataset from a CSV file located in the working directory
# (change csv_path if your downloaded file has a different name).
csv_path = "data.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [5]:
# Report how many missing values each column contains.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Discard every row that has at least one missing value
# (forward-filling with df.ffill() is an alternative, if appropriate).
df = df.dropna(axis=0, how="any")


checking_status           0
duration                  0
credit_history            0
purpose                   0
credit_amount             0
savings_status            0
employment                0
installment_commitment    0
personal_status           0
other_parties             0
residence_since           0
property_magnitude        0
age                       0
other_payment_plans       0
housing                   0
existing_credits          0
job                       0
num_dependents            0
own_telephone             0
foreign_worker            0
class                     0
dtype: int64


In [6]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")


In [7]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

# The target is handled separately from the features so that it is ALWAYS
# label-encoded into a single 'class' column. If it went through the
# nunique() == 2 test below and ever had more than two labels, it would be
# one-hot expanded and a later df.drop('class', axis=1) / df['class'] would
# raise KeyError.
target_col = 'class'
feature_cat_cols = [col for col in categorical_cols if col != target_col]

# Binary features are label-encoded in place; all other categorical features
# are one-hot encoded.
binary_cols = [col for col in feature_cat_cols if df[col].nunique() == 2]
multi_cols = [col for col in feature_cat_cols if df[col].nunique() != 2]

# A single encoder is refit for every column, so after this cell it only
# remembers the mapping of the last column it was fitted on.
label_enc = LabelEncoder()
for col in binary_cols:
    df[col] = label_enc.fit_transform(df[col])

# Encode the target last (only if it is still categorical).
if target_col in categorical_cols:
    df[target_col] = label_enc.fit_transform(df[target_col])

# One get_dummies call for all multi-valued features: the remaining columns
# keep their order and the dummy columns are appended in the order of
# multi_cols. drop_first=True omits the first level of each feature.
if multi_cols:
    df = pd.get_dummies(df, columns=multi_cols, drop_first=True)


In [9]:
# Feature matrix: every column except the target.
X = df.drop(columns=['class'])

# Target vector as integer codes. LabelEncoder numbers the labels in sorted
# order, so string labels 'bad'/'good' would become 0/1; if the column was
# already encoded by an earlier step the codes are left as they are.
# Note: refitting label_enc here replaces any mapping it held before.
target = df['class']
y = label_enc.fit_transform(target)


In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature scaling: the scaler is fitted on the training rows only, and the
# same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [11]:
# Logistic regression with default hyper-parameters, fitted on the training split.
log_reg = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
y_pred_lr = log_reg.predict(X_test)


In [12]:
# k-nearest-neighbours classifier using the 5 nearest training points.
n_neighbors = 5
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
y_pred_knn = knn.predict(X_test)


In [13]:
# Fit one support-vector classifier per kernel on the same training data.
svm_by_kernel = {
    kernel: SVC(kernel=kernel).fit(X_train, y_train)
    for kernel in ('linear', 'rbf')
}
svm_linear = svm_by_kernel['linear']
svm_rbf = svm_by_kernel['rbf']

# Test-set predictions for each kernel.
y_pred_svm_linear = svm_linear.predict(X_test)
y_pred_svm_rbf = svm_rbf.predict(X_test)


In [14]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a short evaluation summary for one classifier.

    Prints a header containing ``model_name`` followed by the accuracy, the
    confusion matrix and the classification report computed from ``y_true``
    and ``y_pred``. Nothing is returned.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : labels predicted by the model.
    model_name : text shown in the header line.
    """
    print(f"\n--- {model_name} ---")

    # (label printed before the value, metric function) in output order.
    sections = (
        ("Accuracy:", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    )
    for heading, metric in sections:
        print(heading, metric(y_true, y_pred))


In [15]:
# Print the same evaluation summary for every fitted model, in a fixed order.
predictions_by_model = [
    ("Logistic Regression", y_pred_lr),
    ("KNN Classifier", y_pred_knn),
    ("SVM Linear Kernel", y_pred_svm_linear),
    ("SVM RBF Kernel", y_pred_svm_rbf),
]
for model_label, model_predictions in predictions_by_model:
    evaluate_model(y_test, model_predictions, model_label)



--- Logistic Regression ---
Accuracy: 0.805
Confusion Matrix:
 [[ 35  24]
 [ 15 126]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.59      0.64        59
           1       0.84      0.89      0.87       141

    accuracy                           0.81       200
   macro avg       0.77      0.74      0.75       200
weighted avg       0.80      0.81      0.80       200


--- KNN Classifier ---
Accuracy: 0.705
Confusion Matrix:
 [[ 24  35]
 [ 24 117]]
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.41      0.45        59
           1       0.77      0.83      0.80       141

    accuracy                           0.70       200
   macro avg       0.63      0.62      0.62       200
weighted avg       0.69      0.70      0.70       200


--- SVM Linear Kernel ---
Accuracy: 0.785
Confusion Matrix:
 [[ 32  27]
 [ 16 125]]
Classification Report:
               precis