In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [46]:
# Load the dataset
df = pd.read_csv('loan_approval_v2.csv')
# Display first few rows of the dataset
df.head()

Unnamed: 0,age,gender,occupation,education_level,marital_status,income,credit_score,loan_status
0,32,Male,Engineer,Bachelor's,Married,85000,720,Approved
1,45,Female,Teacher,Master's,Single,62000,680,Approved
2,28,Male,Student,High School,Single,25000,590,Denied
3,51,Female,Manager,Bachelor's,Married,105000,780,Approved
4,36,Male,Accountant,Bachelor's,Married,75000,710,Approved


In [47]:
# Exploring the dataset
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              61 non-null     int64 
 1   gender           61 non-null     object
 2   occupation       61 non-null     object
 3   education_level  61 non-null     object
 4   marital_status   61 non-null     object
 5   income           61 non-null     int64 
 6   credit_score     61 non-null     int64 
 7   loan_status      61 non-null     object
dtypes: int64(3), object(5)
memory usage: 3.9+ KB
None
             age         income  credit_score
count  61.000000      61.000000     61.000000
mean   37.081967   78983.606557    709.836066
std     8.424755   33772.025802     72.674888
min    24.000000   25000.000000    560.000000
25%    30.000000   52000.000000    650.000000
50%    36.000000   78000.000000    720.000000
75%    43.000000   98000.000000    770.000000
max    55.000000  180000.

In [49]:
df.columns

Index(['age', 'gender', 'occupation', 'education_level', 'marital_status',
       'income', 'credit_score', 'loan_status'],
      dtype='object')

In [50]:
# Identifying categorical and numeric features
categorical_features = ['gender', 'occupation', 'education_level', 'marital_status']
numeric_features = ['age','income','credit_score']

In [51]:
# Select categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns

Index(['gender', 'occupation', 'education_level', 'marital_status',
       'loan_status'],
      dtype='object')

In [52]:
# Encoding categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_categorical_features = encoder.fit_transform(df[categorical_features])
encoded_categorical_features



array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 1.],
       [1., 0., 0., ..., 1., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 1.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [53]:
# Create DataFrame for encoded categorical features
encoded_categorical_df = pd.DataFrame(encoded_categorical_features, 
                                      columns=encoder.get_feature_names_out(categorical_features))


In [54]:
# Scaling numeric features
scaler = StandardScaler()
scaled_numeric_features = scaler.fit_transform(df[numeric_features])

# Create DataFrame for scaled numeric features
scaled_numeric_df = pd.DataFrame(scaled_numeric_features, columns=numeric_features)
scaled_numeric_df.head()

Unnamed: 0,age,income,credit_score
0,-0.608224,0.179626,0.141015
1,0.947653,-0.507063,-0.413949
2,-1.086956,-1.611737,-1.662618
3,1.66575,0.776746,0.973462
4,-0.129493,-0.118935,0.002274


In [55]:
# Combine all features
processed_df = pd.concat([scaled_numeric_df, encoded_categorical_df], axis=1)
processed_df['loan_status'] = df['loan_status'].apply(lambda x: 1 if x == "Approved" else 0)  # Encoding target variable

In [56]:
processed_df['loan_status'].value_counts()

1    45
0    16
Name: loan_status, dtype: int64

In [57]:
# Display first few rows of processed features
processed_df.head()

Unnamed: 0,age,income,credit_score,gender_Male,occupation_Analyst,occupation_Architect,occupation_Artist,occupation_Banker,occupation_Chef,occupation_Consultant,...,occupation_Stylist,occupation_Teacher,occupation_Veterinarian,occupation_Writer,education_level_Bachelor's,education_level_Doctoral,education_level_High School,education_level_Master's,marital_status_Single,loan_status
0,-0.608224,0.179626,0.141015,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1,0.947653,-0.507063,-0.413949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2,-1.086956,-1.611737,-1.662618,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
3,1.66575,0.776746,0.973462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
4,-0.129493,-0.118935,0.002274,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1


In [59]:
# Splitting the dataset
X = processed_df.drop('loan_status', axis=1)
y = processed_df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
# Training a simple classification model (Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [64]:
# Predicting and evaluating the model
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9230769230769231
0.8888888888888888
1.0
0.9411764705882353
