# Pipeline for Home Credit Default Risk

Importing packages & loading data

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession
from pyspark.sql import *
import os

# Location of the training CSV. Override with the APPLICATION_TRAIN_CSV
# environment variable; the default is the path this notebook was written with.
TRAIN_CSV_PATH = os.environ.get(
    'APPLICATION_TRAIN_CSV',
    '/home/cnlindreshb/Downloads/all/application_train.csv',
)

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName('CheckPyspark').getOrCreate()
# The first row holds the column names; column types are inferred from the data.
data = spark.read.csv(TRAIN_CSV_PATH, header=True, inferSchema=True)

Data cleaning & Feature selection

In [3]:
from pyspark.ml.pipeline import  Transformer,Estimator
from pyspark.ml.param.shared import HasInputCol,HasOutputCol
from pyspark import keyword_only
from pyspark.sql.functions import udf
from pyspark.ml import Pipeline
class PipelineCC(Transformer,HasInputCol,HasOutputCol):
    '''
    Custom Transformer that cleans the input dataframe and keeps only the
    most important features.

    Work done by ``_transform`` (through ``pipeline``):
    1. drop the columns whose NULL count exceeds ``ratio`` of the rows
    2. impute the remaining NULLs (most frequent value for string columns,
       pyspark ``Imputer`` for all the other columns)
    3. drop SK_ID_CURR, rank the features with LightGBM and keep the
       top-ranked ones together with TARGET
    '''
    @keyword_only
    def __init__(self):
        super(PipelineCC,self).__init__()
        kwargs=self._input_kwargs
        self.setParams(**kwargs)
        # prints an empty line when the stage is constructed
        print()
    @keyword_only
    def setParams(self,inputCol=None,outputCol=None):
        '''
        Set the (otherwise unused) inputCol/outputCol params of this stage.
        '''
        kwargs=self._input_kwargs
        return self._set(**kwargs)

    def sort_features(self,feature_importance,column_name):
        '''
        Sort the feature importance values
        args:
        1. feature_importance: sequence of importance values
        2. column_name: sequence of column names, aligned with feature_importance
        returns: <LIST> of (column_name, importance) pairs, highest importance first
        '''
        feature_map=dict(zip(column_name,feature_importance))
        return sorted(feature_map.items(), key=lambda item: item[1],reverse=True)
    def feature_importance_lightGBM(self,data,categorical_columns,n_features=40,sample_size=50000):
        '''
        FINDS THE FEATURE IMPORTANCE
        args:
        1. data: <SPARK DATAFRAME> containing the TARGET column
        2. categorical_columns: <LIST> of categorical columns
        3. n_features: <int> how many top-ranked features to return
        4. sample_size: <int> number of rows used to train the ranking model,
           None to use every row
        RETURNS : <DICT> {column name: importance} of the top n_features
        '''
        import lightgbm as lgb
        # Only the sampled rows are moved to the driver instead of the whole frame.
        sample=data if sample_size is None else data.limit(sample_size)
        data_pandas=sample.toPandas()
        y=data_pandas['TARGET']
        data_pandas=data_pandas.drop(columns=['TARGET'])
        for category in categorical_columns:
            data_pandas[category]=data_pandas[category].astype('category')

        d_train = lgb.Dataset(data_pandas, label=y)

        param = {'learning_rate' : 0.1, 'n_estimators': 100}

        model2 = lgb.train(params=param,train_set=d_train,categorical_feature=list(categorical_columns))

        print('Plot feature importances…')

        feature_dict=self.sort_features(model2.feature_importance(),data_pandas.columns)
        return dict(feature_dict[:n_features])
    def fill_na_numerical(self,data,columns):
        '''
        FILL NULL VALUES FOR NON-CATEGORICAL DATA
        args:
        1. data: <SPARK DATAFRAME> actual spark dataframe
        2. columns: iterable of column names to impute
        return: <SPARK DATAFRAME> with one extra 'imputed_<column>' column per
                input column; ``data`` itself when ``columns`` is empty
        '''
        columns=list(columns)
        if not columns:
            # nothing to impute, so do not build an Imputer without input columns
            return data
        imputer=Imputer(inputCols=columns,outputCols=['imputed_'+str(name) for name in columns])
        return imputer.fit(data).transform(data)
    def get_max_value(self,count_dict):
        '''
        GET MAX VALUE FROM A DICTIONARY
        args:
        1. count_dict: <DICT> frequency of each category {key=category name, value=count}
        return: a tuple (_key, _max) with the key holding the largest count;
                (0, 0) when no count is greater than zero
        '''
        # Explicit loop on purpose: the wildcard import of pyspark.sql.functions
        # at the top of the notebook rebinds the name `max`.
        _max=0
        _key=0
        for k,v in count_dict.items():
            if v>_max:
                _key=k
                _max=v
        return (_key,_max)
    def fill_na_categorical(self,data,columns):
        '''
        FILL NULL VALUES FOR CATEGORICAL DATA
        args:
        1. data: <SPARK DATAFRAME> actual spark dataframe
        2. columns: iterable of categorical column names we want to impute

        return: <SPARK DATAFRAME> where the NULLs of each column are replaced
                by that column's most frequent non-NULL value
        '''
        for category in columns:
            null_count=data.where(data[category].isNull()).count()
            print('NULL Values before---->',null_count)
            # groupBy over the non-NULL rows only: cube() would add a grand-total
            # row whose key is NULL, which can end up being chosen as the "mode".
            frequencies=data.where(data[category].isNotNull()).groupBy(category).count().collect()
            count_dict={row[0]:row[1] for row in frequencies}
            if not count_dict:
                # the column holds only NULLs, there is no value to impute with
                print('No non-NULL value available for',category)
                continue
            key,_=self.get_max_value(count_dict)
            print('NULL value Imputed with:',key,' for',category)
            data=data.fillna(key,subset=[category])
            null_count=data.where(data[category].isNull()).count()
            print('NULL Values after---->',null_count)
        return data
    def find_null_values(self,data,column_names,ratio=0.5):
        '''
        FIND ALL NULL VALUES IN DATASET
        args:
        1. data: Actual Spark < DataFrame >
        2. column_names: iterable of columns to check for missing values
        3. ratio: < float > range(0-1) share of NULL values that is acceptable
        return: < dict > {key=column name having more NULLs than acceptable
                (and at least one), value=number of NULL values}
        '''
        from pyspark.sql import functions as F

        column_names=list(column_names)
        drop_column_list={}
        if column_names:
            total_count=data.count()
            # One aggregation for every column instead of one Spark job per column:
            # when() yields NULL for non-matching rows and count() skips NULLs.
            null_counts=data.select(
                [F.count(F.when(data[name].isNull(),1)).alias(name) for name in column_names]
            ).first()
            threshold=total_count*ratio
            for name,null_count in zip(column_names,null_counts):
                if null_count>threshold and null_count>0:
                    drop_column_list[name]=null_count
        print('!-----DONE------!')
        return drop_column_list
    def pipeline(self,data,ratio=0.3):
        '''
        RUN CLEANING, IMPUTATION AND FEATURE SELECTION
        args:
        1. data: <SPARK DATAFRAME> raw dataframe containing TARGET
        2. ratio: <float> columns with a larger share of NULLs are dropped
        return: tuple (feature_list, data) where feature_list is the <DICT> of
                selected features and data the cleaned <SPARK DATAFRAME>
        '''
        print('1.Data Cleaning and Preprocessing')
        column_names=data.columns

        drop_column_list=self.find_null_values(data,column_names,ratio)

        data=data.drop(*drop_column_list)

        # GETTING THE NUMERICAL AND CATEGORICAL COLUMN
        categorial_columns=[name for name,dtype in data.dtypes if dtype=='string']
        # ordered list (not a set) so the imputed columns keep the dataframe order
        numerical_columns=[name for name in data.columns if name not in categorial_columns]

        print('IMPUTING CATEGORICAL VALUES')
        # ratio=0 selects every column that has at least one NULL
        cate_fill=self.find_null_values(data,categorial_columns,ratio=0)
        print(cate_fill)
        data=self.fill_na_categorical(data,cate_fill)


        print('IMPUTING NUMERICAL VALUES')
        print(len(data.columns))
        impute_numerical=self.find_null_values(data,numerical_columns,ratio=0)
        data=self.fill_na_numerical(data,impute_numerical.keys())
        # the 'imputed_' copies replace the original columns
        data=data.drop(*list(impute_numerical.keys()))
        print('Final number of columns after pre-processing',len(data.columns))

        print('2.FEATURE SELECTION')
        data=data.drop('SK_ID_CURR')
        feature_list=self.feature_importance_lightGBM(data,categorial_columns)

        print('Final 40 columns selected after Feature Selection!')
        return feature_list,data
    def _transform(self,df):
        '''
        Clean ``df`` and return it restricted to the selected features plus TARGET.
        '''
        feature_list,data=self.pipeline(df)
        feature_list=list(feature_list.keys())
        feature_list.append('TARGET')
        data=data.select(feature_list)
        print(len(data.columns))

        return data

In [4]:
class LabelEncode(Transformer,HasInputCol,HasOutputCol):
    '''
    Custom Transformer that replaces every string column by a
    StringIndexer output column named 'index_<column>'.
    '''
    @keyword_only
    def __init__(self):
        super(LabelEncode,self).__init__()
        self.setParams(**self._input_kwargs)
        print()
    @keyword_only
    def setParams(self,inputCol=None,outputCol=None):
        '''
        Set the inputCol/outputCol params of this stage.
        '''
        return self._set(**self._input_kwargs)
    def _transform(self,df):
        '''
        Index each string column of ``df`` and drop the original column.
        '''
        from pyspark.ml.feature import StringIndexer
        string_columns=[name for name,dtype in df.dtypes if dtype=='string']
        for name in string_columns:
            indexer=StringIndexer(inputCol=name,outputCol='index_'+name)
            # the indexer is fitted on the same frame it transforms
            df=indexer.fit(df).transform(df).drop(name)
        return df

    

In [5]:
class OHEncode(Transformer,HasInputCol,HasOutputCol):
    '''
    Custom Transformer that one-hot encodes every column whose name starts
    with 'index_' into a column named 'ohe_<column>' and drops the indexed
    column.
    '''
    @keyword_only
    def __init__(self):
        super(OHEncode,self).__init__()
        kwargs=self._input_kwargs
        self.setParams(**kwargs)
        print()
    @keyword_only
    def setParams(self,inputCol=None,outputCol=None):
        '''
        Set the inputCol/outputCol params of this stage.
        '''
        kwargs=self._input_kwargs
        return self._set(**kwargs)
    def _transform(self,df):
        '''
        One-hot encode the 'index_' columns of ``df``.
        '''
        from pyspark.ml.feature import OneHotEncoder
        ohe_columns=[name for name in df.columns if name.startswith('index_')]
        for column in ohe_columns:
            encoder=OneHotEncoder(inputCol=column,outputCol='ohe_'+column)
            # Spark 3 turned OneHotEncoder into an Estimator that has to be
            # fitted first; on Spark 2 it is a plain Transformer without fit().
            if hasattr(encoder,'fit'):
                encoder=encoder.fit(df)
            df=encoder.transform(df).drop(column)
        return df


In [6]:
class VectorChange(Transformer,HasInputCol,HasOutputCol):
    '''
    Custom Transformer that assembles every column except TARGET into a
    single vector column named 'Feature'.
    '''
    @keyword_only
    def __init__(self):
        super(VectorChange,self).__init__()
        kwargs=self._input_kwargs
        self.setParams(**kwargs)
        print()
    @keyword_only
    def setParams(self,inputCol=None,outputCol=None):
        '''
        Set the inputCol/outputCol params of this stage.
        '''
        kwargs=self._input_kwargs
        return self._set(**kwargs)
    def _transform(self,df):
        '''
        Append the 'Feature' vector column to ``df``.
        '''
        from pyspark.ml.feature import VectorAssembler
        # Keep the dataframe's own column order: going through a set would make
        # the position of each feature inside the vector arbitrary.
        feature_columns=[name for name in df.columns if name!='TARGET']
        assem=VectorAssembler(inputCols=feature_columns,outputCol='Feature')
        return assem.transform(df)


In [7]:
class ModelFit(Transformer,HasInputCol,HasOutputCol):
    '''
    Custom Transformer that splits the incoming dataframe into train/test
    parts and fits a classifier on the train part.

    The ``inputCol`` param selects the classifier: 'LR' for logistic
    regression, 'RF' for random forest. The dataframe is expected to hold
    a 'Feature' vector column and a 'TARGET' label column.
    '''
    # model keys accepted through inputCol
    SUPPORTED_MODELS=('LR','RF')

    @keyword_only
    def __init__(self,inputCol=None):
        super(ModelFit,self).__init__()
        # all state is created up front so the getters work before _transform
        self.model=None
        self.df=None
        self.train_data=None
        self.test_data=None
        kwargs=self._input_kwargs
        self.setParams(**kwargs)
    @keyword_only
    def setParams(self,inputCol=None,outputCol=None):
        '''
        Set the inputCol (model key) / outputCol params of this stage.
        '''
        kwargs=self._input_kwargs
        return self._set(**kwargs)
    def logistic_regression(self,data):
        '''
        Fit a LogisticRegression on the stored train split.
        ``data`` is accepted for interface compatibility but not used.
        '''
        from pyspark.ml.classification import LogisticRegression
        lr=LogisticRegression(featuresCol='Feature',labelCol='TARGET')
        self.model=lr.fit(self.train_data)
    def random_forest(self,data):
        '''
        Fit a RandomForestClassifier on the stored train split.
        ``data`` is accepted for interface compatibility but not used.
        '''
        from pyspark.ml.classification import RandomForestClassifier
        rf=RandomForestClassifier(featuresCol='Feature',labelCol='TARGET')
        self.model=rf.fit(self.train_data)
    def split_train_test(self,data,seed=None):
        '''
        Split ``data`` 70/30 and store the parts as train_data / test_data.
        args:
        1. data: <SPARK DATAFRAME> to split
        2. seed: <int> optional seed for a repeatable split
        '''
        train_data,test_data=data.randomSplit([0.7,0.3],seed=seed)
        self.train_data=train_data
        self.test_data=test_data
    def _get_feature_list(self):
        '''Return the column names of the last transformed dataframe.'''
        return self.df.columns
    def _get_train_data(self):
        '''Return the train split (None before the first transform).'''
        return self.train_data
    def _get_test_data(self):
        '''Return the test split (None before the first transform).'''
        return self.test_data
    def _get_model(self):
        '''Return the fitted model (None before the first transform).'''
        return self.model
    def _evaluate_prediction(self):
        '''
        Print the BinaryClassificationEvaluator metric of the fitted model
        on the test split.
        '''
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        predicted=self._get_predicted()
        binary_eval=BinaryClassificationEvaluator(labelCol='TARGET')
        print(binary_eval.evaluate(predicted))
    def _get_predicted(self):
        '''Return the test split with the predictions of the fitted model.'''
        predicted=self.model.transform(self.test_data)
        return predicted
    def _transform(self,df):
        '''
        Split ``df``, fit the selected model and return ``df`` unchanged.
        raises: ValueError when inputCol is not one of SUPPORTED_MODELS
        '''
        model_key=self.getInputCol()
        if model_key not in self.SUPPORTED_MODELS:
            # fail loudly instead of returning with no model fitted
            raise ValueError('Unsupported model %r, expected one of %s'%(model_key,self.SUPPORTED_MODELS))
        self.df=df
        self.split_train_test(df)
        if model_key=='LR':
            self.logistic_regression(df)
        else:
            self.random_forest(df)
        return df

In [8]:
# Build one instance of each custom pipeline stage.
preprocessing=PipelineCC()
encode=LabelEncode()
ohe=OHEncode()
assembler=VectorChange()
# ModelFit reads the model choice from inputCol; 'LR' selects logistic regression.
model=ModelFit(inputCol='LR')







In [9]:
# Stage order: cleaning/feature selection -> string indexing -> one-hot encoding
# -> vector assembly -> model fitting.
pip=Pipeline(stages=[preprocessing,encode,ohe,assembler,model])
# NOTE: `data` is rebound to the transformed frame, so after this cell it no
# longer refers to the frame read from the CSV.
data=pip.fit(data).transform(data)

1.Data Cleaning and Preprocessing
!-----DONE------!
IMPUTING CATEGORICAL VALUES
!-----DONE------!
{'NAME_TYPE_SUITE': 1292}
NULL Values before----> 1292
NULL value Imputed with: Unaccompanied  for NAME_TYPE_SUITE
NULL Values before----> 0
IMPUTING NUMERICAL VALUES
72
!-----DONE------!
Final number of columns after pre-processing 72
2.FEATURE SELECTION


New categorical_feature is ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_CONTRACT_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'NAME_INCOME_TYPE', 'NAME_TYPE_SUITE', 'ORGANIZATION_TYPE', 'WEEKDAY_APPR_PROCESS_START']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Plot feature importances…
Final 40 columns selected after Feature Selection!
41


In [10]:
# Print the evaluation score of the fitted model on the held-out test split
model._evaluate_prediction()

0.7429870271904693


In [None]:
# Get the test split kept by the ModelFit stage
model._get_test_data() 

In [None]:
# Get the train split kept by the ModelFit stage
model._get_train_data()

In [None]:
# Get the fitted model; both lines read the same attribute, and only the
# last expression of a cell is displayed.
model.model
model._get_model()

In [None]:
# Get the test split with the model's predictions
model._get_predicted()

In [11]:
# Manual check: score the fitted model on the test split without going
# through ModelFit._evaluate_prediction
from pyspark.ml.evaluation import BinaryClassificationEvaluator
binary_eval=BinaryClassificationEvaluator(labelCol='TARGET')
predicted=model.model.transform(model._get_test_data())
print(binary_eval.evaluate(predicted))

0.7429870271904494
