In [32]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [33]:
raw_train=pd.read_csv('train.csv')
raw_test=pd.read_csv('test.csv')
raw_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [34]:
raw_test.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
357,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113,360.0,1.0,Urban
358,LP002975,Male,Yes,0,Graduate,No,4158,709,115,360.0,1.0,Urban
359,LP002980,Male,No,0,Graduate,No,3250,1993,126,360.0,,Semiurban
360,LP002986,Male,Yes,0,Graduate,No,5000,2393,158,360.0,1.0,Rural
361,LP002989,Male,No,0,Graduate,Yes,9200,0,98,180.0,1.0,Rural


In [35]:
raw_train.nunique() # shows me all the unique ids present

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [36]:
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [37]:
raw_train.shape

(614, 13)

In [38]:
# To copy all the data into a dataframe
train_df = raw_train.copy()
test_df = raw_test.copy()

In [39]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [40]:
test_df.info() # here you can see the loan_Status data is missing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            362 non-null    object 
 1   Gender             351 non-null    object 
 2   Married            362 non-null    object 
 3   Dependents         353 non-null    object 
 4   Education          362 non-null    object 
 5   Self_Employed      339 non-null    object 
 6   ApplicantIncome    362 non-null    int64  
 7   CoapplicantIncome  362 non-null    int64  
 8   LoanAmount         362 non-null    int64  
 9   Loan_Amount_Term   356 non-null    float64
 10  Credit_History     333 non-null    float64
 11  Property_Area      362 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 34.1+ KB


In [41]:
# From the baove data frame i see that there is data of loan_status for training
# the test data dosent have the loan_status so we can only use it for predicition

# Preprocessing

In [42]:
train_y = train_df['Loan_Status'].copy()
train_y

0      Y
1      N
2      Y
3      Y
4      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 614, dtype: object

In [43]:
train_df.drop(columns=['Loan_Status'], inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


# Dropping unncessary columns

In [44]:
train_df.drop(columns='Loan_ID', inplace=True)
test_df.drop(columns='Loan_ID', inplace=True)

In [45]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

# Check for dupliates


In [46]:
train_df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
609    False
610    False
611    False
612    False
613    False
Length: 614, dtype: bool

In [47]:
train_df[train_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [48]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
192,Male,No,0,Graduate,Yes,5833,0,116,360.0,1.0,Urban


In [49]:
test_df.drop_duplicates(inplace=True)
test_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720,0,110,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100,360.0,,Urban
4,Male,No,0,Not Graduate,No,3276,0,78,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...
357,Male,Yes,3+,Not Graduate,Yes,4009,1777,113,360.0,1.0,Urban
358,Male,Yes,0,Graduate,No,4158,709,115,360.0,1.0,Urban
359,Male,No,0,Graduate,No,3250,1993,126,360.0,,Semiurban
360,Male,Yes,0,Graduate,No,5000,2393,158,360.0,1.0,Rural


# Misising Value analysis

In [50]:
train_df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

# When we find missing values we can either remove them or impute values like:
# for all numeric values---> mean
# for all Categorical values ---> mode

In [51]:
# to check which are numerical and whicha re categorical
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [52]:
numeical_columns = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term'] # Removed extra space after 'LoanAmount'
cate_columns= ['Gender','Married','Dependents','Education','Self_Employed','Credit_History','Property_Area'] # Removed extra spaces in column names

In [53]:
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(train_df[cate_columns])

train_df[cate_columns] = cat_imputer.transform(train_df[cate_columns])
test_df[cate_columns] = cat_imputer.transform(test_df[cate_columns]) # Changed train_df to test_df to impute on the test data

In [54]:
num_imputer = SimpleImputer(strategy="mean")
num_imputer.fit(train_df[numeical_columns])

train_df[numeical_columns] = num_imputer.transform(train_df[numeical_columns])
test_df[numeical_columns] = num_imputer.transform(test_df[numeical_columns]) # Changed train_df to test_df to impute on the test data

In [55]:
train_df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

## **Preprocessing**


In [56]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban


In [57]:
train_df['ApplicantIncome'] = train_df['ApplicantIncome'] + train_df['CoapplicantIncome']
test_df['ApplicantIncome'] = test_df['ApplicantIncome'] + test_df['CoapplicantIncome']

#drop the co-applicant income column
train_df.drop(columns='CoapplicantIncome', inplace=True)
test_df.drop(columns='CoapplicantIncome', inplace=True)


In [58]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,146.412162,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,6091.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,4941.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,141.0,360.0,1.0,Urban


In [59]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area'],
      dtype='object')

In [60]:
test_df.tail()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
357,Male,Yes,3+,Not Graduate,Yes,5786.0,113.0,360.0,1.0,Urban
358,Male,Yes,0,Graduate,No,4867.0,115.0,360.0,1.0,Urban
359,Male,No,0,Graduate,No,5243.0,126.0,360.0,1.0,Semiurban
360,Male,Yes,0,Graduate,No,7393.0,158.0,360.0,1.0,Rural
361,Male,No,0,Graduate,Yes,9200.0,98.0,180.0,1.0,Rural


In [61]:
# Application of label_encoder
train_df.nunique()

Gender                2
Married               2
Dependents            4
Education             2
Self_Employed         2
ApplicantIncome     554
LoanAmount          204
Loan_Amount_Term     11
Credit_History        2
Property_Area         3
dtype: int64

In [62]:
train_df.Dependents.unique()

array(['0', '1', '2', '3+'], dtype=object)

In [63]:
train_df.Property_Area.unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [64]:
for col in cate_columns:
  train_df[col] = LabelEncoder().fit_transform(train_df[col])
  test_df[col] = LabelEncoder().fit_transform(test_df[col])


In [65]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,146.412162,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,4941.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


In [66]:
numeical_columns.remove('CoapplicantIncome')

In [67]:
# log transformation
train_df[numeical_columns]= np.log(train_df[numeical_columns])
test_df[numeical_columns]= np.log(test_df[numeical_columns])

# Scaling


In [68]:
min_max_scaler = MinMaxScaler()
train_df = min_max_scaler.fit_transform(train_df)
test_df = min_max_scaler.transform(test_df)

## Building the Model

In [69]:
X_train, X_test, y_train, y_test = train_test_split(train_df, train_y, test_size=0.2, random_state=42)

In [70]:
log = LogisticRegression()
log.fit(X_train, y_train)

In [71]:
y_pred_test = log.predict(X_test) # predicted values

In [72]:
acc = accuracy_score(y_test, y_pred_test)
acc

0.7886178861788617

# Create Custom data transformers

In [1]:
# Key thing --> Inherit - BaseEstimator, TransformerMixin
# implent fit and transform method
#accept input with __init__ method

from sklearn.base import BaseEstimator, TransformerMixin


class DemoTransformer(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    return X

In [2]:
# Numerical Imputation - mean
class MeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y=None):
        self.mean_dict = {}
        for col in self.variables:
            self.mean_dict[col] = X[col].mean()
        return self
    
    def transform(self, X):
        X = X.copy()
        for col in self.variables:
            X[col].fillna(self.mean_dict[col], inplace=True)
        return X

# categorical imputation - mode or most frequent

In [6]:
import numpy as np
import pandas as pd
np.random.seed(0)
df = pd.DataFrame(np.random.randint(0,100,(10,2)), columns=['A','B'])
df.iloc[1,0] = np.nan
df.iloc[2,1] = np.nan
df.iloc[3,0] = np.nan
df.loc[4, 0] = np.nan
df.loc[5, 1] = np.nan
df

Unnamed: 0,A,B,0,1
0,44.0,47.0,,
1,,67.0,,
2,67.0,,,
3,,21.0,,
4,36.0,87.0,,
5,70.0,88.0,,
6,88.0,12.0,,
7,58.0,65.0,,
8,39.0,87.0,,
9,46.0,88.0,,
