In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



# 1. Data Loading
Load the dataset using pandas and show the head and tail of the dataset.


In [None]:
# Location of the loan dataset. Set the LOAN_DATA_PATH environment variable to
# point at your own copy of the CSV; the original author's local path is kept
# as the fallback so the existing setup keeps working unchanged.
DATA_PATH = os.environ.get(
    "LOAN_DATA_PATH",
    "C:/Users/باسل/OneDrive/Desktop/my_part/loan_prediction.csv",
)
df = pd.read_csv(DATA_PATH, encoding="utf-8")

# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Preview the last five rows of the dataset.
df.tail(n=5)

# 3. Data Cleaning
Handle missing values, remove duplicates, and check for outliers.


In [None]:
# Count fully duplicated rows and report the number found.
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Actually remove them, as this section promises; this is a no-op when the
# count above is zero. The index is rebuilt so it stays contiguous.
df = df.drop_duplicates().reset_index(drop=True)


In [None]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

In [None]:
# Number of missing values in each column, before any imputation.
df.isna().sum()

In [None]:
# Check for outliers with a boxen (letter-value) plot of the DataFrame.
# NOTE(review): all plotted columns share one y-axis, so columns with large
# values will visually dominate small-range ones — consider per-column plots.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxenplot(data=df, ax=ax)
ax.set_title('Outlier Check (Boxen Plot)')
# End with plt.show() like the other plotting cells, so no Axes repr is echoed.
plt.show()

In [None]:
# Fill the missing values of the numerically typed columns.
# LoanAmount is a continuous amount, so the median is used.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

# Loan_Amount_Term and Credit_History hold a small set of discrete values and
# are plotted as categorical further down. Imputing them with the mean would
# inject a value that is not a real category (e.g. a fractional credit-history
# flag), so the most frequent value (mode) is used instead.
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [None]:
# Impute each object-typed column with its most frequent value (mode).
for column in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

In [None]:
# Re-count missing values per column to verify the imputation above.
df.isna().sum()

In [None]:
# For every column treated as categorical, print its value counts and draw a
# count plot of the same distribution.
cat_cols = [
    'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area',
    'Dependents', 'Loan_Status', 'Credit_History', 'Loan_Amount_Term',
]
for feature in cat_cols:
    print(f"\nValue counts for {feature}:")
    print(df[feature].value_counts())
    ax = sns.countplot(data=df, x=feature)
    ax.set_title(f'Distribution of {feature}')
    plt.show()


In [None]:
# Class balance of the target column (Loan_Status): counts, then a count plot.
print("\nTarget variable (Loan_Status) distribution:")
target_counts = df['Loan_Status'].value_counts()
print(target_counts)
ax = sns.countplot(data=df, x='Loan_Status')
ax.set_title("Loan Approval Distribution")
plt.show()


In [None]:
# Plot histograms with KDE for the numerical features.
for col in ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']:
    # Create the figure inside the loop: plt.show() finishes the current
    # figure, so a single plt.figure() before the loop would size only the
    # first plot and leave the rest at the default size.
    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.show()


In [None]:
 # correlation for numerical data
# Keep only the int64/float64 columns, compute their pairwise correlations,
# and draw the matrix as an annotated heatmap.
df_numeric = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = df_numeric.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# 4. Feature Engineering
Create new features like Total_Income, and apply log transformations.


In [None]:
# Total_Income = applicant income plus co-applicant income, row by row.
df['Total_Income'] = df['ApplicantIncome'].add(df['CoapplicantIncome'])
df.head(n=5)


In [None]:
# Histogram + KDE of Total_Income on its raw scale (before the log transform).
ax = sns.histplot(df['Total_Income'], kde=True)
ax.set_title('Total Income Distribution (Before Log)')
ax.set_xlabel('Total Income')
plt.show()


In [None]:
 # Apply log transformation to reduce skewness
# log1p computes log(1 + x), so a zero income maps to 0 instead of -inf.
df['Log_Total_Income'] = df['Total_Income'].pipe(np.log1p)


In [None]:
# Histogram + KDE of the log-transformed total income.
ax = sns.histplot(df['Log_Total_Income'], kde=True)
ax.set_title('Total Income Distribution (After Log)')
ax.set_xlabel('Log of Total Income')
plt.show()


In [None]:
# Histogram + KDE of LoanAmount on its raw scale (before the log transform).
ax = sns.histplot(df['LoanAmount'], kde=True)
ax.set_title('LoanAmount Distribution (Before Log)')
ax.set_xlabel('LoanAmount')
plt.show()

In [None]:
# Store log(1 + LoanAmount) in a new column; the raw column is left in place.
df['Log_LoanAmount'] = df['LoanAmount'].pipe(np.log1p)


In [None]:
# Histogram + KDE of the log-transformed loan amount.
ax = sns.histplot(df['Log_LoanAmount'], kde=True)
ax.set_title('Loan Amount Distribution (After Log)')
ax.set_xlabel('Log of Loan Amount')
plt.show()


In [None]:
# Store log(1 + Loan_Amount_Term) in a new column; the raw column is left in place.
df['Log_Loan_Amount_Term'] = df['Loan_Amount_Term'].pipe(np.log1p)

In [None]:
# Histogram + KDE of the log-transformed loan term.
ax = sns.histplot(df['Log_Loan_Amount_Term'], kde=True)
ax.set_title('Loan_Amount_Term Distribution (After Log)')
ax.set_xlabel('Log of Loan_Amount_Term')
plt.show()

In [None]:
# Preview the first five rows, now including the engineered columns.
df.head(n=5)

In [None]:
# Drop the raw columns that the Log_* features above were derived from, plus
# Loan_ID (presumably a row identifier — verify it carries no signal).
cols = [
    'ApplicantIncome', 'CoapplicantIncome', 'Total_Income',
    'Loan_Amount_Term', 'LoanAmount', 'Loan_ID',
]
# Only drop names still present, so re-running this cell does not raise KeyError.
existing_cols = [name for name in cols if name in df.columns]
df = df.drop(existing_cols, axis=1)
df.head(n=5)


# 5. Encoding Categorical Variables
Convert object types to numerical values using Label Encoding.


In [None]:
# Convert the categorical columns to numerical codes with label encoding.
from sklearn.preprocessing import LabelEncoder

cols = ['Gender','Married','Education','Self_Employed','Dependents','Property_Area','Loan_Status']

# Fit a separate encoder per column and keep each one. A single shared encoder
# is re-fitted on every iteration, so only the last column's class mapping
# would survive and the other columns could not be decoded later.
encoders = {}
for col in cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le  # encoders[col].classes_ maps each code back to its label

# After the loop `le` is still the encoder fitted on Loan_Status (the last
# column), exactly as it was when one shared instance was used.
df.head()

In [None]:
# Show the dtype of every column after the label-encoding step.
df.dtypes

In [160]:
# Separate the target column from the predictor columns.
Y = df['Loan_Status']                # dependent variable (target)
X = df.drop(columns='Loan_Status')   # independent variables (features)


In [155]:
# Display the feature matrix (all columns of df except Loan_Status).
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Log_Total_Income,Log_LoanAmount,Log_Loan_Amount_Term
0,1,0,0,0,0,1.0,2,8.674197,4.859812,5.888878
1,1,1,1,0,0,1.0,0,8.714732,4.859812,5.888878
2,1,1,0,0,1,1.0,2,8.006701,4.204693,5.888878
3,1,1,0,1,0,1.0,2,8.505525,4.795791,5.888878
4,1,0,0,0,0,1.0,2,8.699681,4.955827,5.888878
...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,1.0,0,7.972811,4.276666,5.888878
610,1,1,3,0,0,1.0,0,8.320448,3.713572,5.198497
611,1,1,1,0,0,1.0,2,9.025576,5.537334,5.888878
612,1,1,2,0,0,1.0,2,8.933796,5.236442,5.888878


In [161]:
# Display the target Series (the Loan_Status column).
Y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64