In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
import pandas as pd

df=pd.read_csv('/content/Assignment-2_Data.csv')
df.head()

Unnamed: 0,Id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1001,999.0,management,married,tertiary,no,2143.0,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,1002,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,1003,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,1004,47.0,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,1005,33.0,unknown,single,unknown,no,1.0,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
df.shape

(45211, 18)

In [9]:
df.drop(columns=['Id'], inplace=True)

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.loc[df['age'] > 100, 'age'] = df['age'].median()

In [12]:
for col in df.select_dtypes(include='number'):
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [13]:
for col in df.select_dtypes(include='object'):
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [14]:
#handle outliers
import numpy as np
num_cols = df.select_dtypes(include=np.number).columns
for col in num_cols:
  Q1=df[col].quantile(0.25)
  Q3=df[col].quantile(0.75)
  IQR=Q3-Q1
  lower_bound=Q1-1.5*IQR
  upper_bound=Q3+1.5*IQR
  df=df[(df[col]>=lower_bound)&(df[col]<=upper_bound)]

In [15]:
df['y'] = df['y'].map({'yes':1, 'no':0})


In [16]:
df_encoded = pd.get_dummies(df, drop_first=True)


In [17]:
X = df_encoded.drop(columns=['y'])
y = df_encoded['y']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [18]:
smote = SMOTE(random_state=42)

X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)


In [19]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_sm)
X_test_scaled = scaler.transform(X_test)


In [20]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train_sm)

y_pred_log = log_reg.predict(X_test_scaled)

print("Logistic Regression Accuracy:",
      accuracy_score(y_test, y_pred_log))


Logistic Regression Accuracy: 0.9232276451727823


In [21]:
dt = DecisionTreeClassifier(
    max_depth=8,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42
)

dt.fit(X_train_sm, y_train_sm)

y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:",
      accuracy_score(y_test, y_pred_dt))


Decision Tree Accuracy: 0.8865336658354115


In [22]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:\n",
      classification_report(y_test, y_pred_dt))


Confusion Matrix:
 [[4838  463]
 [ 174  139]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.91      0.94      5301
           1       0.23      0.44      0.30       313

    accuracy                           0.89      5614
   macro avg       0.60      0.68      0.62      5614
weighted avg       0.92      0.89      0.90      5614



In [23]:
from sklearn.metrics import roc_auc_score

y_prob_log_reg = log_reg.predict_proba(X_test_scaled)[:, 1]
roc_auc_log_reg = roc_auc_score(y_test, y_prob_log_reg)

print("Logistic Regression ROC-AUC:", roc_auc_log_reg)

Logistic Regression ROC-AUC: 0.7858261718055486


In [24]:
y_prob_dt = dt.predict_proba(X_test)[:,1]
roc_auc_dt = roc_auc_score(y_test,y_prob_dt)
print("Decision Tree ROC-AUC:",roc_auc_dt)

Decision Tree ROC-AUC: 0.8274959272860086
