In [1]:
import findspark
findspark.init('/home/cnlindreshb/spark')

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession
from pyspark.sql import *
spark = SparkSession.builder.appName('CheckPyspark').getOrCreate()
data=spark.read.csv('/home/cnlindreshb/Downloads/all/application_train.csv',header=True,inferSchema=True)

In [3]:
xcv=None
def sort_features(feature_importance,column_name):
    '''
    Sort the feature importance values 
    args:
    1.feature_importance: <NPARRAY> containg the feature importance 
    2. column names: <LIST> containing the column names from the dataframe
    returns: sorted DICT of {key: column_name ,value :importance}
    '''
    global xcv
    feature_map={}
    for feature in zip(feature_importance,column_name):
        feature_map[feature[1]]=feature[0]
    sorted_feature_map=sorted(feature_map.items(), key=lambda x: x[1],reverse=True)
    xcv=sorted_feature_map
    return sorted_feature_map

In [4]:
def feature_importance_lightGBM(data,categorical_columns):
    '''
    FINDS THE FEATURE IMPORTANCE
    1.data:<SPARK DATAFRAME> Actual Dataframe
    2.categorical_columns:<LIST> of categorical columns
    RETURNS : <DICT> of top N_features
    '''
    import lightgbm as lgb
    import matplotlib.pyplot as plt
    data_pandas=data.toPandas()
    y=data_pandas['TARGET']
    data_pandas.drop(['TARGET','SK_ID_CURR'],axis=1,inplace=True)
    for col in categorical_columns:
        data_pandas[col]=data_pandas[col].astype('category')
    
    
    d_train = lgb.Dataset(data_pandas.iloc[:50000], label=y[:50000])

    param = {'learning_rate' : 0.1, 'n_estimators': 100}

    model2 = lgb.train(params=param,train_set=d_train,categorical_feature=list(categorical_columns))

    print('Plot feature importances…')

    #ax = lgb.plot_importance(model2, max_num_features=40,figsize=(16,16))

    #plt.show()
    feature_dict=sort_features(model2.feature_importance(),data_pandas.columns)
    return dict(feature_dict[:40])

In [5]:
def fill_na_numerical(data,columns):
    columns=list(columns)
    imputer=Imputer(inputCols=columns,outputCols=['imputed_'+str(col) for col in columns])
    dataCopy=imputer.fit(data).transform(data)
    return dataCopy

In [6]:
def get_max_value(count_dict):
    '''
    GET MAX VALUE FROM A DICTIONARY
    args:
    1.count_dict: <DICT> frequency of each category {key=category name, value=count}
    return: a tuple containing _key with tha maximim value _max
    '''
    _max=0
    _key=0
    for k,v in count_dict.items():
        if v>_max:
            _key=k
            _max=v
    return (_key,_max)

In [7]:
def fill_na_categorical(data,columns):
    '''
    FILL NULL VALUES FOR CATEGORICAL DATA
    args:
    1.data: <SPARK DATAFRAME> actual spark dataframe
    2.columns: <LIST> of categorical columns we want to Impute

    return: <SPARK DATAFRAME>Imputed spark dataframe
    '''
    for category in columns:
        null_count=data.where(data[category].isNull()).count()
        print('NULL Values before---->',null_count)
        count_dict=data.cube(category).count().toPandas().set_index(category).to_dict()['count']
        #frequency_distribution_categorical(count_dict,category)
        key,_=get_max_value(count_dict)
        print('NULL value Imputed with:',key)
        data=data.fillna(key,subset=[category])
        null_count=data.where(data[category].isNull()).count()
        print('NULL Values before---->',null_count)
    return data

In [8]:
def find_null_values(data,column_names,ratio=0.5):
    '''
    FIND ALL NULL VALUES IN DATASET
    args:
    1. data: Actual Spark < DataFrame >
    2. column_names: < LIST > of columns to find missing values
    3. total_count: < int > row count
    4. ration: < float > range(0-1) how much null values is acceptable
    return: < dict > {key=column name over having more than acceptable null values, value= total number of null values}
    '''
    
    total_count=data.count()
    drop_column_list={}
    print('REMOVING columns with NULL value above:',total_count*ratio,ratio*100)
    #print('----------------------------------------------------------')
    for col in column_names:
        #print(col,' having NULL VALUES -->',data.where(data[col].isNull()).count())
        null_count=data.where(data[col].isNull()).count()
        
        if null_count>(total_count*ratio) and null_count>0:
            #print(col,' having NULL VALUES -->',null_count)
            drop_column_list[col]=null_count
    return drop_column_list

In [9]:
def pipeline(data,ratio=0.3):
    print('1.Data Cleaning and Preprocessing')
    column_names=data.columns
    drop_column_list=find_null_values(data,column_names,ratio)
    data=data.drop(*drop_column_list)
    #print('COLUMNS after removing NULL values\n',data.columns)
    
    # GETTING THE NUMERICAL AND CATEGORICAL COLUMN
    categorial_columns=[cat[0] for cat in data.dtypes if cat[1]=='string']
    numerical_columns=set(data.columns)-set(categorial_columns)
    print(categorial_columns)
    
    print('IMPUTING CATEGORICAL VALUES')
    cate_fill=find_null_values(data,categorial_columns,ratio=0)
    data=fill_na_categorical(data,cate_fill)
    
    
    print('IMPUTING NUMERICAL VALUES')
    print(len(data.columns))
    impute_numerical=find_null_values(data,numerical_columns,ratio=0)
    #print('FILLING NUMERICAL VALUE')
    data=fill_na_numerical(data,impute_numerical.keys())
    data=data.drop(*list(impute_numerical.keys()))
    print('Final number of columns after pre-processing',len(data.columns))
    
    print('2.FEATURE SELECTION')
    feature_list=feature_importance_lightGBM(data,categorial_columns)
    label_column=data.select('TARGET')
    data=data.drop(*['TARGET','SK_ID_CURR'])
    print('Final 40 columns selected after Feature Selection!')
    return feature_list,data

In [10]:
feature_list,data=pipeline(data)

1.Data Cleaning and Preprocessing
REMOVING columns with NULL value above: 92253.3 30.0
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE']
IMPUTING CATEGORICAL VALUES
REMOVING columns with NULL value above: 0 0
NULL Values before----> 1292
NULL value Imputed with: Unaccompanied
NULL Values before----> 0
IMPUTING NUMERICAL VALUES
72
REMOVING columns with NULL value above: 0 0
Final number of columns after pre-processing 72
2.FEATURE SELECTION


New categorical_feature is ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_CONTRACT_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'NAME_INCOME_TYPE', 'NAME_TYPE_SUITE', 'ORGANIZATION_TYPE', 'WEEKDAY_APPR_PROCESS_START']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Plot feature importances…
Final 40 columns selected after Feature Selection!
