# Home Credit Default Risk Prediction

## 1. Set up shared libraries on path

In [1]:
import lib.initShared  # custom library
import os

In [2]:
# Capture the current working directory; later cells build their data paths from it.
cwd = os.getcwd()

In [3]:
# Initialize the environment so custom libraries can be loaded from the shared path.
# NOTE(review): presumably this extends the import search path based on cwd -- confirm in lib/initShared.
lib.initShared.initializeSharedLib(cwd)

## 2. Data Ingestion

### 2.1 Create Spark session

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Obtain the Spark session for this notebook under the application name
# 'HomeCredit'; getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('HomeCredit')
    .getOrCreate()
)

In [6]:
from Encode_Lib import EncodeLib
from Model_Fit_Lib import Model_Fit
from Feature_Importance_Lib.Feature_Importance import Feature_Importance_Transformer

  from numpy.core.umath_tests import inner1d


### 2.2 Connect to remote cluster

In [7]:
from S3serializer_Lib.S3Serializer import S3Bucket

In [8]:
# Open the connection to the remote host that serves the data set.
# The host, user name and private-key path can be overridden through environment
# variables so machine-specific connection details do not have to be edited into
# the notebook; when a variable is unset the original value is used.
s3 = S3Bucket()
s3.connect(
    host=os.environ.get('HOME_CREDIT_S3_HOST', '18.212.194.194'),
    username=os.environ.get('HOME_CREDIT_S3_USER', 'centos'),
    key=os.environ.get('HOME_CREDIT_S3_KEY', 'data/InternalPOC_Digital.pem'),
)

Connecting to host  18.212.194.194
Connected


### 2.3 Fetch data from remote cluster

In [9]:
# Fetch the 'home_credit' data set into <cwd>/data/dataset, the directory the load step reads from.
# NOTE(review): the argument roles (local target first, remote directory second) are inferred
# from how the path is used later -- confirm against S3Bucket.get_dir_remote.
s3.get_dir_remote(cwd+'/data/dataset','home_credit')

Getting data...
Getting File--> part-00000-ba0f831d-c1e9-4a84-aebd-6fe75d857d71-c000.snappy.parquet
Getting File--> part-00001-ba0f831d-c1e9-4a84-aebd-6fe75d857d71-c000.snappy.parquet
Getting File--> part-00002-ba0f831d-c1e9-4a84-aebd-6fe75d857d71-c000.snappy.parquet
Getting File--> part-00003-ba0f831d-c1e9-4a84-aebd-6fe75d857d71-c000.snappy.parquet
Getting File--> _SUCCESS
['part-00000-ba0f831d-c1e9-4a84-aebd-6fe75d857d71-c000.snappy.parquet', 'part-00001-ba0f831d-c1e9-4a84-aebd-6fe75d857d71-c000.snappy.parquet', 'part-00002-ba0f831d-c1e9-4a84-aebd-6fe75d857d71-c000.snappy.parquet', 'part-00003-ba0f831d-c1e9-4a84-aebd-6fe75d857d71-c000.snappy.parquet', '_SUCCESS']


### 2.4 Load data

In [10]:
# Load the parquet files under <cwd>/data/dataset into a Spark DataFrame.
data=spark.read.parquet(cwd+'/data/dataset')

## Data profiling

In [18]:
# Summary statistics (count, mean, stddev, min, max) for the TARGET label column.
data.describe('TARGET').show()

+-------+-------------------+
|summary|             TARGET|
+-------+-------------------+
|  count|             307511|
|   mean|0.08072881945686496|
| stddev| 0.2724186456483938|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



## 3. Data Cleansing

#### It performs the following activities:
<ul>
<li>Removes the columns with the highest share of null values (30%)</li>
<li>Imputes the remaining columns that contain nulls with the most frequently occurring value of the respective column</li>
</ul>

In [14]:
from Data_Cleaning_Lib.CleanLib import DataCleaningLib

In [15]:
# Create the cleaner instance whose cleaning() method is applied to the data set.
cln = DataCleaningLib()

In [16]:
# Run the cleaning step and rebind `data` to its result.
# NOTE(review): because `data` is overwritten, re-running this cell feeds the
# already-cleaned DataFrame back in; reload the raw data first to reproduce the
# step from scratch.
data=cln.cleaning(data)

1.Data Cleaning and Preprocessing
processing--> SK_ID_CURR
processing--> TARGET
processing--> NAME_CONTRACT_TYPE
processing--> CODE_GENDER
processing--> FLAG_OWN_CAR
processing--> FLAG_OWN_REALTY
processing--> CNT_CHILDREN
processing--> AMT_INCOME_TOTAL
processing--> AMT_CREDIT
processing--> AMT_ANNUITY
processing--> AMT_GOODS_PRICE
processing--> NAME_TYPE_SUITE
processing--> NAME_INCOME_TYPE
processing--> NAME_EDUCATION_TYPE
processing--> NAME_FAMILY_STATUS
processing--> NAME_HOUSING_TYPE
processing--> REGION_POPULATION_RELATIVE
processing--> DAYS_BIRTH
processing--> DAYS_EMPLOYED
processing--> DAYS_REGISTRATION
processing--> DAYS_ID_PUBLISH
processing--> OWN_CAR_AGE
processing--> FLAG_MOBIL
processing--> FLAG_EMP_PHONE
processing--> FLAG_WORK_PHONE
processing--> FLAG_CONT_MOBILE
processing--> FLAG_PHONE
processing--> FLAG_EMAIL
processing--> OCCUPATION_TYPE
processing--> CNT_FAM_MEMBERS
processing--> REGION_RATING_CLIENT
processing--> REGION_RATING_CLIENT_W_CITY
processing--> WEEKDAY

In [17]:
# Persist the cleaned DataFrame as CSV at <cwd>/data/cleaned_data.csv.
# os.path.join supplies the separator that plain concatenation dropped: the
# original expression produced '<cwd>data\cleaned_data.csv' (no separator after
# cwd, and a literal backslash from the invalid '\c' escape), so the output did
# not land inside the data directory.
# Spark's CSV writer creates a directory of part files at this path.
data.write.csv(os.path.join(cwd, 'data', 'cleaned_data.csv'))