In [78]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20, 10)

import seaborn as sns
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix

## Reading the full data and cleaning it (e.g. aligning columns by specifying the separator)
After loading the data it was clear that it was not aligned well: each row was read into a single column. So I specified the separator when reading the file, which aligns the data properly for easy reading and cleaning.

In [None]:
# First attempt: read the CSV with pandas' default comma separator and preview
# the first rows to check how the file was parsed.
data = pd.read_csv("Data/bank-additional-full.csv")
data.head()

Unnamed: 0,"age;""job"";""marital"";""education"";""default"";""housing"";""loan"";""contact"";""month"";""day_of_week"";""duration"";""campaign"";""pdays"";""previous"";""poutcome"";""emp.var.rate"";""cons.price.idx"";""cons.conf.idx"";""euribor3m"";""nr.employed"";""y"""
0,"56;""housemaid"";""married"";""basic.4y"";""no"";""no"";..."
1,"57;""services"";""married"";""high.school"";""unknown..."
2,"37;""services"";""married"";""high.school"";""no"";""ye..."
3,"40;""admin."";""married"";""basic.6y"";""no"";""no"";""no..."
4,"56;""services"";""married"";""high.school"";""no"";""no..."


In [None]:
# Re-read the same file with ';' as the field separator so that every field
# lands in its own column; `law` is the frame used by the rest of the notebook.
law = pd.read_csv("Data/bank-additional-full.csv", sep=';')
law.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
law.describe()

In [None]:
# Column names, dtypes and non-null counts.
law.info()

In [None]:
# Number of missing (NaN) values in each column.
law.isnull().sum()

Exploratory Data Analysis

In [None]:
# Class balance of the target: fraction of rows for each value of `y`.
law["y"].value_counts(normalize = True)

Changing month values into numerical values

In [None]:
# Three-letter month abbreviations in calendar order; a month's number is its
# 1-based position in this list (jan -> 1, ..., dec -> 12).
month_to_nv = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ["jan", "feb", "mar", "apr", "may", "jun",
         "jul", "aug", "sep", "oct", "nov", "dec"],
        start=1,
    )
}

for df in [law]:
    # Skip frames without a textual 'month' column. This also makes the cell
    # safe to re-run: once the column is numeric it is not mapped again.
    if 'month' not in df.columns or df['month'].dtype != 'object':
        continue
    df['month'] = df['month'].map(month_to_nv)

In [None]:
# Preview only the first rows instead of rendering the whole frame; this is
# enough to confirm that `month` now holds numbers.
law.head()

In [None]:
# Changing Yes and No to 1 and 0 (Where Yes is 1 and No is 0)
# Only columns containing at least one 'yes' or 'no' are selected; any other
# value found in those columns is left exactly as it is.
yes_no_columns = law.columns[law.isin(['yes', 'no']).any()]
# DataFrame.applymap is deprecated since pandas 2.1, so each selected column is
# mapped element-wise with Series.map instead; the resulting values are the same.
law[yes_no_columns] = law[yes_no_columns].apply(
    lambda col: col.map(lambda x: 1 if x == 'yes' else (0 if x == 'no' else x))
)

In [None]:
# Check the result of the yes/no -> 1/0 conversion.
law.head()

In [None]:
# Features to plot against the target `y`, one panel per feature.
P_values = ['age', 'default', 'housing', 'loan', 'day_of_week', 'month', 'duration', 'campaign', 'previous']

# 5 x 2 grid: nine panels are used, the tenth is hidden below.
fig, axes = plt.subplots(5, 2, figsize=(15, 20))
panels = axes.ravel()

# The loop variable has its own name so the list `P_values` is not overwritten
# and the cell can be re-run.
for ax, feature in zip(panels, P_values):
    if law[feature].dtype == 'object':
        # Text-valued feature: jittered strip plot per category.
        sns.stripplot(x=feature, y='y', data=law, jitter=True, alpha=0.6, ax=ax)
    else:
        ax.scatter(law[feature], law['y'], alpha=0.5, color="red")
    # Label every panel, whichever plot type was drawn.
    ax.set_xlabel(feature)
    ax.set_ylabel('y')
    ax.set_title(f'{feature} vs y')

# Hide grid slots that have no feature assigned.
for ax in panels[len(P_values):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

Converting Columns into Binary

In [None]:
# Convert text columns whose only values are "yes" and "no" to 1 / 0.
# A column is converted only when its set of values is exactly {"yes", "no"}.
# Checking just "has two distinct values" would also catch two-valued columns
# with other labels and turn every entry into 0, erasing the feature; such
# columns are left as text here so the one-hot encoding step can handle them.
for column in law.select_dtypes(include = "object").columns:
    if column == 'month':
        continue
    if set(law[column].unique()) == {"yes", "no"}:
        law[column] = law[column].map({"yes": 1, "no": 0})

Encoding Columns

In [None]:
# One-hot encode every remaining text column.
obj_columns = law.select_dtypes(include = "object").columns
law = pd.get_dummies(law, columns = obj_columns)

# Store any boolean columns (such as indicator columns produced above) as 0/1 integers.
bool_columns = law.select_dtypes(include = "bool").columns
law[bool_columns] = law[bool_columns].astype(int)

Feature Extraction (Numerical and Categorical Extraction)

In [None]:
# Split the feature columns into two name lists by cardinality:
# at most 10 distinct values -> categorical, otherwise numerical.
numerical_feature = []
categorical_feature = []

for column in law.columns:
    # The target column in this frame is "y" (it is what the model predicts
    # later on), so it must not be listed as a feature.
    if column == "y":
        continue
    if law[column].nunique(dropna=False) <= 10:
        categorical_feature.append(column)
    else:
        numerical_feature.append(column)

Clearing Outliers

In [None]:
# Build an outlier-free copy of `law` using the 1.5 * IQR rule, one numerical
# column at a time. Each column's quartiles are computed on the rows that
# survived the previous columns' filters, so the column order matters.
no_o_law = law.copy()
for column in numerical_feature:
    q1 = no_o_law[column].quantile(0.25)
    q3 = no_o_law[column].quantile(0.75)
    IQR = q3 - q1
    lower_bound = q1 - 1.5 * IQR
    upper_bound = q3 + 1.5 * IQR
    # Keep rows inside [lower_bound, upper_bound], both ends inclusive.
    no_o_law = no_o_law[no_o_law[column].between(lower_bound, upper_bound)]

In [None]:
# NOTE(review): this previews `law`, not the outlier-filtered `no_o_law` built
# in the previous cell — confirm which frame was meant to be inspected.
law.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Target vector and feature matrix (every column except the target).
y = law["y"]
X = law.drop(columns="y")

# List any feature columns that are still text-valued.
non_numeric = X.select_dtypes(include="object").columns
print("Non-numeric columns:", non_numeric)

In [None]:
# One-hot encode any remaining text columns of X, dropping the first level of each.
# NOTE(review): the split and model cells below use `X`, not `X_all` — confirm
# whether this result was meant to be used.
X_all = pd.get_dummies(X, drop_first=True)

In [None]:
# Preview of the encoded frame that X and y were taken from.
law.head()

In [None]:
# Hold out 10% of the rows for evaluation.
# random_state fixes the shuffle so the split, and every score computed from
# it, is the same on each run; stratify=y keeps the class proportions of the
# target equal in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
# Logistic regression classifier; max_iter is raised to 1000 to give the
# solver more iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Predicted class label for every row of the test set.
model.predict(X_test)

In [None]:
# True labels of the test rows, for comparison with the predictions above.
y_test

In [None]:
# Mean accuracy of the classifier on the held-out test set.
model.score(X_test, y_test)

In [None]:

# Per-class precision / recall / F1 on the test set.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# Counts of actual labels, then of predicted labels.
actual_counts = Counter(y_test)
predicted_counts = Counter(y_pred)
print("\n", actual_counts, "\n", predicted_counts)