In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
df=pd.read_csv('bank-full.csv',sep=';')
df.head()

Unnamed: 0,"age"";""job"";""marital"";""education"";""default"";""balance"";""housing"";""loan"";""contact"";""day"";""month"";""duration"";""campaign"";""pdays"";""previous"";""poutcome"";""y"
0,"58;""management"";""married"";""tertiary"";""no"";2143..."
1,"44;""technician"";""single"";""secondary"";""no"";29;""..."
2,"33;""entrepreneur"";""married"";""secondary"";""no"";2..."
3,"47;""blue-collar"";""married"";""unknown"";""no"";1506..."
4,"33;""unknown"";""single"";""unknown"";""no"";1;""no"";""n..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 1 columns):
 #   Column                                                                                                                                                Non-Null Count  Dtype 
---  ------                                                                                                                                                --------------  ----- 
 0   age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y  45211 non-null  object
dtypes: object(1)
memory usage: 353.3+ KB


In [4]:
df.describe()

Unnamed: 0,"age"";""job"";""marital"";""education"";""default"";""balance"";""housing"";""loan"";""contact"";""day"";""month"";""duration"";""campaign"";""pdays"";""previous"";""poutcome"";""y"
count,45211
unique,45211
top,"37;""entrepreneur"";""married"";""secondary"";""no"";2..."
freq,1


In [5]:
print("\n--- Categorical Columns Statistics (df.describe(include='object')) ---")
print(df.describe(include='object'))


--- Categorical Columns Statistics (df.describe(include='object')) ---
       age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y
count                                               45211                                                                                                  
unique                                              45211                                                                                                  
top     37;"entrepreneur";"married";"secondary";"no";2...                                                                                                  
freq                                                    1                                                                                                  


In [6]:
print("\n--- Target Variable 'y' Distribution (Class Balance) ---")
# This is important for a classification problem
print(df['y'].value_counts(normalize=True))


--- Target Variable 'y' Distribution (Class Balance) ---


KeyError: 'y'

In [None]:
#preprocessing 

#binary coloums 
binary_cols = ['default', 'housing', 'loan', 'y']

for col in binary_cols:
    df[col]=df[col].map({'yes':1,'no':0})

print("one hot encoding for categorical columns")
df_processed=pd.get_dummies(df,drop_first=True)


X = df_processed.drop('y', axis=1)
Y = df_processed['y'] # Using your uppercase 'Y'


#scaling 
numerical_columns=['previous','pdays','campaign','duration','age','balance','day']
numerical_columns_to_scale=[col for col in numerical_columns if col in X.columns]


one hot encoding for categorical columns


In [None]:


class StandardScaler:
    def __init__(self):
        self.mean_ = None
        self.std_ = None 
    def fit(self, X):
        X_np = X.values # Convert DataFrame to np.array
        self.mean_ = np.mean(X_np, axis=0)
        self.std_ = np.std(X_np, axis=0)
    def transform(self, X):
        X_np = X.values # Convert DataFrame to np.array
        return (X_np - self.mean_) / (self.std_ + 1e-8)
    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

In [None]:
#train test split from scratch 

def train_test_split_scratch(X,y,test_size=0.2,random_state=None):

    if random_state:
        np.random.seed(random_state)

    samples=X.shape[0]
    indices=np.random.permutation(samples)
    test_split_idx=int(samples*test_size)

    test_indices=indices[:test_split_idx]
    train_indices=indices[test_split_idx:]

    X_train = X.iloc[train_indices]
    Y_train = y.iloc[train_indices]
    X_test=y.iloc[test_indices]
    Y_test=y.iloc[test_indices]

    return X_train, X_test, Y_train, Y_test

In [None]:
#logistic regression from scratch

class LogisticRegressionScratch:
    def __init__(self,learning_rate=0.01,n_iterations=1000):
        self.learning_rate=learning_rate
        self.n_iterations=n_iterations
        self.weights=None
        self.bias=None

    def _sigmoid(self,z):
        z_clip=np.clip(z,-500,500)

        return 1/(1+np.exp(-z_clip))

    def fit(self,X,y):
        samples,features=X.shape

        self.weights=np.zeros(features)
        self.bias=0

        for i in range(self.n_iterations):
            linear_model=np.dot(X,self.weights)+self.bias

            y_predicted=self._sigmoid(linear_model)

            dw=(1/samples)*np.dot(X.T,(y_predicted-y))
            db=(1/samples)*np.sum(y_predicted-y)

            self.weights-=self.learning_rate*dw
            self.bias-=self.learning_rate*db

    def predict_prob(self,X):
        linear_model=np.dot(X,self.weights)+self.bias
        y_pred=self._sigmoid(linear_model)
        return y_pred

    def predict(self,X,threshold=0.5):
        y_prob=self.predict_prob(X)
        
        return np.array([1 if i>threshold else 0 for i in y_prob])

In [None]:
#evaluation matrics

def evaluation_metrics(y_true,y_pred):

    tp=np.sum((y_true==1) & (y_pred==1))
    tn=np.sum((y_true==0) & (y_pred==0))
    fp=np.sum((y_true==0) & (y_pred==1))
    fn=np.sum((y_true==1) & (y_pred==0))

    e = 1e-8

    accuracy=(tp+tn)/(tp+tn+fp+fn+e)
    precision=tp/(tp+fp+e)
    recall=tp/(tp+fn+e)
    f1_score=2*(precision*recall)/(precision+recall)


    # 1. Confusion Matrix
    print("\nConfusion Matrix (from scratch):")
    print(f"     Predicted 0  | Predicted 1")
    print(f"True 0:  {tn:^13} | {fp:^13}")
    print(f"True 1:  {fn:^13} | {tp:^13}")
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    

In [None]:
# #main code 

# --- (Your classes and data loading/splitting are all correct) ---

# --- Convert to NumPy for the Model ---
# Our "from scratch" model is built to work with NumPy arrays

# --- THE FIX IS HERE ---
# We MUST explicitly cast to 'float64' to prevent an 'object' dtype array
X_train_np = X_train_scaled.values.astype(np.float64)
X_test_np = X_test_scaled.values.astype(np.float64)
# --- END FIX ---

y_train_np = Y_train.values
y_test_np = Y_test.values

print(f"Data types for model: X_train_np is {X_train_np.dtype}, y_train_np is {y_train_np.dtype}")

model=LogisticRegressionScratch(learning_rate=0.05, n_iterations=1000)

# This line will now work
model.fit(X_train_np, y_train_np)
print("Model training completed.")

print("\n--- Making Predictions on Test Set ---")
y_pred_np = model.predict(X_test_np)

# (Assuming your evaluation_metrics function is defined)
evaluation_metrics(y_test_np, y_pred_np)

print("\n--- Assignment Script Finished ---")

Data types for model: X_train_np is float64, y_train_np is int64
Model training completed.

--- Making Predictions on Test Set ---

Confusion Matrix (from scratch):
     Predicted 0  | Predicted 1
True 0:      7802      |      149     
True 1:       855      |      236     

Accuracy: 0.8890
Precision: 0.6130
Recall: 0.2163
F1 Score: 0.3198

--- Assignment Script Finished ---
