In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [32]:
dataset = pd.read_csv(r"C:\Users\vinny\Downloads\loan_approval_dataset.csv")
dataset.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [33]:
dataset.shape # it checks the size of my data set

(4269, 13)

In [34]:
dataset.nunique() # it tells how many unique values exist in each column

loan_id                      4269
 no_of_dependents               6
 education                      2
 self_employed                  2
 income_annum                  98
 loan_amount                  378
 loan_term                     10
 cibil_score                  601
 residential_assets_value     278
 commercial_assets_value      188
 luxury_assets_value          379
 bank_asset_value             146
 loan_status                    2
dtype: int64

In [35]:
#duplicates
dataset.duplicated() # The dataset.duplicated() method in pandas checks for duplication in out dataset. It only returns a boolean series(True for 
# True for duplicate rows and false for unique rows

0       False
1       False
2       False
3       False
4       False
        ...  
4264    False
4265    False
4266    False
4267    False
4268    False
Length: 4269, dtype: bool

In [36]:
dataset.duplicated().sum() # the sum method counts  the number of duplicates 

np.int64(0)

In [37]:
dataset.drop(columns="loan_id", inplace=True)

In [38]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0    no_of_dependents          4269 non-null   int64 
 1    education                 4269 non-null   object
 2    self_employed             4269 non-null   object
 3    income_annum              4269 non-null   int64 
 4    loan_amount               4269 non-null   int64 
 5    loan_term                 4269 non-null   int64 
 6    cibil_score               4269 non-null   int64 
 7    residential_assets_value  4269 non-null   int64 
 8    commercial_assets_value   4269 non-null   int64 
 9    luxury_assets_value       4269 non-null   int64 
 10   bank_asset_value          4269 non-null   int64 
 11   loan_status               4269 non-null   object
dtypes: int64(9), object(3)
memory usage: 400.3+ KB


In [39]:
dataset.columns # The dataset.columns command in pandas returns the names of all columns in the dataset.


Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [40]:

dataset.columns = [c.strip() for c in dataset.columns] # This removes extra spaces from column names in the dataset.
dataset.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [41]:
# Why X and y are used:
# X:
# In machine learning, X typically represents the input features (the independent variables). These are the columns of the dataset that we will use to predict something.

# y:
# y usually represents the target variable (the dependent variable). This is the column you are trying to predict or model based on the input features.

# The reason why X is often capitalized is to conventionally denote that it contains the input data, typically a 2D matrix (rows and columns). y, on the other hand, is lowercase because it usually represents a single target variable (often a 1D vector).

X = dataset.drop(columns=["loan_status"]) 
y = dataset['loan_status']

In [42]:
# What does X.info() do?

# When you call X.info(), it displays a summary of the DataFrame X. Specifically, it shows:

# - The number of rows and columns in the DataFrame.
# - The names of the columns.
# - The data type of each column.
# - The number of non-null values (i.e., how many missing values there are in each column).

# So, X.info() does not show the actual values of the data. Instead, it gives you an overview of the structure of the dataset.


In [43]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   no_of_dependents          4269 non-null   int64 
 1   education                 4269 non-null   object
 2   self_employed             4269 non-null   object
 3   income_annum              4269 non-null   int64 
 4   loan_amount               4269 non-null   int64 
 5   loan_term                 4269 non-null   int64 
 6   cibil_score               4269 non-null   int64 
 7   residential_assets_value  4269 non-null   int64 
 8   commercial_assets_value   4269 non-null   int64 
 9   luxury_assets_value       4269 non-null   int64 
 10  bank_asset_value          4269 non-null   int64 
dtypes: int64(9), object(2)
memory usage: 367.0+ KB


In [44]:
# Apply the domain knowledge as part of preprocessing

# Purpose: In some datasets, different types of assets are recorded in separate columns, 
# but they represent different components of a total asset value. To create a new feature, 
# we can combine those asset columns into a single 'total_assets_value' column, which might 
# be more meaningful and useful for analysis or modeling.
#
# Here, we are summing up four different columns that represent different asset values:
# 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', and 'bank_asset_value'.
# The resulting sum will be stored in a new column called 'total_assets_value'.
#X['total_assets_value'] = X['residential_assets_value'] + X['commercial_assets_value'] + X['luxury_assets_value'] + X['bank_asset_value']

# Drop the individual asset columns after creating the 'total_assets_value' column.
# Purpose: Once the 'total_assets_value' column is created, we no longer need the original individual asset columns.
# Dropping them will make the dataset cleaner and easier to work with, reducing redundancy.
# Also, if you are using machine learning models, dropping irrelevant features can improve performance.
#
# We are dropping the following columns:
# - 'residential_assets_value'
# - 'commercial_assets_value'
# - 'luxury_assets_value'
# - 'bank_asset_value'
# 
# After this operation, only the 'total_assets_value' column will remain in the dataset, 
# which represents the sum of all asset types.
# 'inplace=True' means the changes will be applied directly to the DataFrame (X) without creating a new variable.
#X.drop(columns=['residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value'], inplace=True)


In [45]:
# Apply the domain knowledge as part of preprocessing
X['total_assets_value'] = X['residential_assets_value'] + X['commercial_assets_value'] + X['luxury_assets_value'] + X['bank_asset_value']

# Drop the Columns
X.drop(columns=['residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value'], inplace=True)


In [46]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,Graduate,No,9600000,29900000,12,778,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,17000000
2,3,Graduate,No,9100000,29700000,20,506,57700000
3,3,Graduate,No,8200000,30700000,8,467,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,55000000


In [47]:
X['education'].unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [48]:
X['self_employed'].unique()

array([' No', ' Yes'], dtype=object)

In [49]:
y.unique()

array([' Approved', ' Rejected'], dtype=object)

In [50]:
X['education'] = X['education'].str.strip()
X['self_employed'] = X['self_employed'].str.strip()
y = y.str.strip()

In [51]:
X['education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [52]:
X['self_employed'].unique()

array(['No', 'Yes'], dtype=object)

In [53]:
# Transformation to Binary
def transform_categorical_to_binary(df, columns_to_transform):
    """
    Transforms specified categorical columns in a DataFrame to binary (0, 1).

    Args:
    df: The DataFrame to modify.
    columns_to_transform: A dictionary where keys are column names and values are lists of the original categorical values to map to 1.

    Returns:
    The modified DataFrame with transformed columns.
    """

    for column_name, positive_values in columns_to_transform.items():
        df[column_name] = df[column_name].apply(lambda x: 1 if x in positive_values else 0)

    return df

# Define which values should be mapped to 1
columns_to_transform = {
    'education': ['Graduate'],  # 'Graduate' will be mapped to 1, others to 0
    'self_employed': ['Yes']  # 'Yes' will be mapped to 1, others to 0
}

# Apply transformation to dataset X
transformed_X = transform_categorical_to_binary(X, columns_to_transform)
transformed_X


Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,9600000,29900000,12,778,50700000
1,0,0,1,4100000,12200000,8,417,17000000
2,3,1,0,9100000,29700000,20,506,57700000
3,3,1,0,8200000,30700000,8,467,52700000
4,5,0,1,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,7400000
4265,0,0,1,3300000,11300000,20,559,20000000
4266,2,0,0,6500000,23900000,18,457,39000000
4267,1,0,0,4100000,12800000,8,780,28800000


In [54]:
#log transformation
log_cols = ['income_annum', 'loan_amount', 'total_assets_value']
X[log_cols] = np.log(X[log_cols])



In [55]:
y = y.map({"Approved" :1,"Rejected":0})


In [56]:
y[:5]

0    1
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [57]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,16.077274,17.213369,12,778,17.741436
1,0,0,1,15.226498,16.316947,8,417,16.648724
2,3,1,0,16.023785,17.206658,20,506,17.870768
3,3,1,0,15.919645,17.239773,8,467,17.780126
4,5,0,1,16.097893,17.001863,20,382,17.822844


In [58]:
#split the dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0)


In [59]:
#Building the Model
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
y_pred_test = log.predict(X_test)

In [62]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred_test)
print(f"Accuracy is {acc}")

Accuracy is 0.9047619047619048
