# Getting Started

Import data and get a sense of it.

In [None]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [None]:
# Read the house-loan CSV (expected next to the notebook) into a DataFrame.
loan_data = pd.read_csv(filepath_or_buffer='house_loan_updated.csv')
# Preview the first five rows to get a feel for the columns and values.
loan_data.head(n=5)

Unnamed: 0,Gender,Age,Income (USD),Income Stability,Property Age,Property Location,Property Price,Loan Sanction Amount (USD)
0,F,19,1641.25,Low,1651.25,Rural,59641.82,21026.420753
1,M,29,1989.71,Low,1990.71,Urban,179858.51,60595.183366
2,F,37,1849.91,Low,1856.91,Rural,117297.62,39181.648002
3,M,65,2735.18,High,2747.18,Rural,354417.72,128497.710865
4,F,62,4741.78,High,4740.78,Urban,82049.8,39386.919336


In [None]:
loan_data.shape  # (number of rows, number of columns) of the freshly loaded DataFrame

(47297, 8)

In [None]:
loan_data.info()  # Print each column's dtype and non-null count to spot missing values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47297 entries, 0 to 47296
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Gender                      47297 non-null  object 
 1   Age                         47297 non-null  int64  
 2   Income (USD)                47265 non-null  float64
 3   Income Stability            47285 non-null  object 
 4   Property Age                47263 non-null  float64
 5   Property Location           47294 non-null  object 
 6   Property Price              47297 non-null  float64
 7   Loan Sanction Amount (USD)  47297 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 2.9+ MB


# Cleaning data

Check whether the dataset contains null or duplicate values.

In [None]:
#checking the null data in the dataset
null_data = loan_data.isnull().sum()
null_data

Gender                         0
Age                            0
Income (USD)                  32
Income Stability              12
Property Age                  34
Property Location              3
Property Price                 0
Loan Sanction Amount (USD)     0
dtype: int64

We can see that some rows have missing feature values, so let's drop them. We also want to remove duplicate rows.

In [None]:
# Remove rows that contain any missing value, then remove exact duplicate rows.
# drop_duplicates() returns a NEW DataFrame (it does not modify in place), so the
# result must be assigned back; otherwise the duplicates silently stay in loan_data.
loan_data = loan_data.dropna().drop_duplicates()
loan_data  # Display the cleaned data.

Unnamed: 0,Gender,Age,Income (USD),Income Stability,Property Age,Property Location,Property Price,Loan Sanction Amount (USD)
0,F,19,1641.25,Low,1651.25,Rural,59641.82,21026.420753
1,M,29,1989.71,Low,1990.71,Urban,179858.51,60595.183366
2,F,37,1849.91,Low,1856.91,Rural,117297.62,39181.648002
3,M,65,2735.18,High,2747.18,Rural,354417.72,128497.710865
4,F,62,4741.78,High,4740.78,Urban,82049.80,39386.919336
...,...,...,...,...,...,...,...,...
47292,F,62,2121.48,Low,2135.48,Rural,240291.63,78800.491330
47293,M,38,1506.72,Low,1505.72,Rural,218431.37,72483.293359
47294,F,20,1595.10,Low,1582.10,Semi-Urban,154458.88,52462.861498
47295,F,50,2002.27,Low,2007.27,Semi-Urban,72277.78,22909.596763


In [None]:
loan_data.shape  # Re-check (rows, columns) to confirm the cleaning step above removed rows.

(47251, 8)

# Pre-process the data


## Determine the features and label columns:

In [None]:
# Pre-process data, determine feature x and label y
loan_data = loan_data.rename(columns={'Loan Sanction Amount (USD)':'Loan'}) #rename the label y column to 'Loan'
columns = loan_data.columns.tolist()     # Get column names.
columns.remove('Loan')                   # Remove 'Loan' (label y column)  
feature_data = loan_data[columns]        # Assign a variable to features x, including all columns except 'Loan'
label_data = loan_data.Loan            # Assign 'Price' to label y

## Split train and test sets

In [None]:
# Split the data 70/30 into training and test subsets.
# A fixed random_state makes the split (and every number reported below)
# reproducible across "Restart Kernel -> Run All".
trainX, testX, trainY, testY = train_test_split(
    feature_data, label_data, train_size=0.70, random_state=42
)
print('Training:' + str(trainX.shape))     # Count data samples in Training set.
print('Test:' + str(testX.shape))          # Count data samples in Test set.

Training:(33075, 7)
Test:(14176, 7)


## Separate categorical and numerical columns:

In [None]:
# Keep only the non-object (numerical) feature columns of each split.
trainX_num, testX_num = (
    split.select_dtypes(exclude='object') for split in (trainX, testX)
)

trainX_num.head()

Unnamed: 0,Age,Income (USD),Property Age,Property Price
23829,65,2509.64,2511.64,395101.97
29948,43,1561.9,1572.9,109118.51
36447,18,2184.74,2195.74,218801.61
29888,62,1360.05,1371.05,135481.15
26358,20,2310.81,2300.81,187682.5


In [None]:
# Keep only the object-typed (categorical) feature columns of each split.
# .copy() gives independent DataFrames: these frames are modified column-by-column
# when the categories are encoded, and writing into a selection of trainX/testX
# would otherwise raise pandas' SettingWithCopyWarning.
trainX_cat = trainX.select_dtypes(include='object').copy()
testX_cat = testX.select_dtypes(include='object').copy()

trainX_cat.head()

Unnamed: 0,Gender,Income Stability,Property Location
23829,M,High,Semi-Urban
29948,M,Low,Urban
36447,F,Low,Rural
29888,M,Low,Urban
26358,M,Low,Urban


## Encode categorical data

In [None]:
# List the distinct categories present in the 'Income Stability' feature.
pd.unique(trainX_cat['Income Stability'])

array(['High', 'Low'], dtype=object)

In [None]:
# List the distinct categories present in the 'Property Location' feature.
pd.unique(trainX_cat['Property Location'])

array(['Semi-Urban', 'Urban', 'Rural'], dtype=object)

We will deal with Income Stability and Gender first, since they are binary.

In [None]:
# One binarizer per binary feature, so each keeps its own learned class mapping.
income_bin = LabelBinarizer()
gender_bin = LabelBinarizer()

# Learn the category -> 0/1 mapping from the TRAINING split only, then apply that
# same mapping to the test split with transform(). Re-fitting on the test split
# could yield a different mapping (e.g. if a class is absent there) and leaks
# test information into preprocessing.
# LabelBinarizer returns an (n, 1) array for two classes; ravel() flattens it to
# the 1-D shape of a DataFrame column.
trainX_cat['Income Stability'] = income_bin.fit_transform(trainX_cat['Income Stability']).ravel()
testX_cat['Income Stability'] = income_bin.transform(testX_cat['Income Stability']).ravel()
trainX_cat['Gender'] = gender_bin.fit_transform(trainX_cat['Gender']).ravel()
testX_cat['Gender'] = gender_bin.transform(testX_cat['Gender']).ravel()

trainX_cat.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

Unnamed: 0,Gender,Income Stability,Property Location
23829,1,0,Semi-Urban
29948,1,1,Urban
36447,0,1,Rural
29888,1,1,Urban
26358,1,1,Urban


Now we will look at Property Location and encode it.

In [None]:
loc_encoder = LabelEncoder()

# Learn the category -> integer mapping from the TRAINING split only and reuse it
# on the test split with transform(), so both splits share exactly the same codes.
# NOTE(review): integer codes impose an ordering on a nominal feature; one-hot
# encoding may suit a linear model better - confirm whether that matters here.
trainX_cat['Property Location'] = loc_encoder.fit_transform(trainX_cat['Property Location'])
testX_cat['Property Location'] = loc_encoder.transform(testX_cat['Property Location'])

trainX_cat.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Gender,Income Stability,Property Location
23829,1,0,1
29948,1,1,2
36447,0,1,0
29888,1,1,2
26358,1,1,2


In [None]:
# Preview the first five encoded categorical rows of the test split.
testX_cat.iloc[:5]

Unnamed: 0,Gender,Income Stability,Property Location
32133,1,1,2
43140,1,1,2
37509,1,1,2
3798,1,1,2
42116,1,1,1


Property Location has been encoded as Rural: 0, Semi-Urban: 1, Urban: 2.

## Put the trainX together

In [None]:
# Re-assemble the training features: encoded categorical columns first, then the
# numerical columns, matched row-by-row on the shared index.
trainX_final = trainX_cat.join(trainX_num, how='inner')
trainX_final.head()

Unnamed: 0,Gender,Income Stability,Property Location,Age,Income (USD),Property Age,Property Price
23829,1,0,1,65,2509.64,2511.64,395101.97
29948,1,1,2,43,1561.9,1572.9,109118.51
36447,0,1,0,18,2184.74,2195.74,218801.61
29888,1,1,2,62,1360.05,1371.05,135481.15
26358,1,1,2,20,2310.81,2300.81,187682.5


## Put the testX together

In [None]:
# Re-assemble the test features the same way as the training features:
# encoded categorical columns first, then numerical columns, matched on the index.
testX_final = testX_cat.join(testX_num, how='inner')
testX_final.head()

Unnamed: 0,Gender,Income Stability,Property Location,Age,Income (USD),Property Age,Property Price
32133,1,1,2,26,1727.26,1738.26,74439.36
43140,1,1,2,59,1730.95,1736.95,127757.08
37509,1,1,2,54,1271.1,1271.1,63648.06
3798,1,1,2,50,2165.21,2162.21,115314.22
42116,1,1,1,39,1719.42,1711.42,48529.22


# Build Linear Regression model

In [None]:
# Initialize an ordinary least-squares Linear Regression model.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises a TypeError on current versions. If feature scaling is
# wanted, add a StandardScaler in a Pipeline instead; a plain LinearRegression is
# kept here so that model.intercept_ and model.coef_ remain directly accessible.
model = LinearRegression()

In [None]:
# Learn the regression parameters from the (x, y) pairs of the Training set.
model.fit(trainX_final, trainY)

# Bias term (theta_0) of the fitted model.
print(f"Model intercept: {model.intercept_!s}")
# One coefficient per feature column of trainX_final (7 features -> 7 coefficients).
print(f"Model coefficients: {model.coef_!s}")

Model intercept: 11871.877834921514
Model coefficients: [-2.50788586e+02 -1.11427726e+04 -5.74771068e+01 -7.87710322e+01
  5.52565539e-01  4.76997363e-01  3.36197454e-01]


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




# Evaluate the model

In [None]:
# Show the raw (un-encoded) features x of the first 5 samples in the Test set.
testX.head(5)

Unnamed: 0,Gender,Age,Income (USD),Income Stability,Property Age,Property Location,Property Price
32133,M,26,1727.26,Low,1738.26,Urban,74439.36
43140,M,59,1730.95,Low,1736.95,Urban,127757.08
37509,M,54,1271.1,Low,1271.1,Urban,63648.06
3798,M,50,2165.21,Low,2162.21,Urban,115314.22
42116,M,39,1719.42,Low,1711.42,Semi-Urban,48529.22


In [None]:
# Show the true labels y of the first 5 samples in the Test set.
testY.head(5)

32133    25157.645696
43140    40458.457909
37509    18845.683967
3798     37430.150086
42116    15270.139350
Name: Loan, dtype: float64

In [None]:
# Predict the loan amount for the first 5 encoded Test samples, for comparison
# with the true labels shown above.
model.predict(testX_final.iloc[:5])

array([25125.208678  , 40452.46044989, 18816.71980233, 37420.94689296,
       15430.60453343])

In [None]:
# Predict on the whole Test set.
pred = model.predict(testX_final)
# Mean absolute error between the true labels and the predictions
# (arguments are passed positionally as y_true, y_pred).
mean_absolute_error(testY, pred)

157.87217011894685

Overall, the mean absolute gap between the actual and predicted loan amounts is approximately $158.

In [None]:
# Coefficient of determination (R^2) of the model on the Test set.
R_square = model.score(X=testX_final, y=testY)
R_square

0.9980336017593532

Around 99.8% of the variability in the dependent variable (the loan amount) is explained by the model.