# Feature Selection

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import DenseVector
from pyspark.sql import functions as F

In [None]:
import random
import numpy as np
from pyspark.sql import Row
from sklearn import neighbors
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.stat import Statistics
from pyspark.ml.feature import Imputer

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the notebook's Spark session (created on first use, reused after).
# The two config entries are passed through to Spark/Hadoop unchanged.
spark = (
    SparkSession.builder
    .appName("LC_Baseline_Model")
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-east-1")
    .config("spark.yarn.access.hadoopFileSystems", "s3a://demo-aws-2/")
    .getOrCreate()
)

In [None]:
# Read every column of the LC_Table table into a Spark DataFrame.
lc_table_query = "SELECT * FROM default.LC_Table"
df = spark.sql(lc_table_query)

In [None]:
# Aggregate in Spark first, then collect only the per-class row counts
# into a pandas DataFrame for plotting.
target_counts = df.groupby("is_default").count()
df_pd = target_counts.toPandas()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of rows for each value of the target column.
fig, ax = plt.subplots(figsize=(5, 4))
target_axes = sns.barplot(x=df_pd["is_default"], y=df_pd["count"], ax=ax)
target_axes.set(title="Target Attribute")
plt.show()

In [None]:
# Show how the rows are distributed across application_type values.
# This feature is dropped later because of its imbalance.
application_type_counts = df.groupby('application_type').count()
application_type_counts.show()

In [None]:
# Peek at the first five emp_length values. The feature is dropped for now,
# although it could be one-hot encoded or otherwise transformed later.
emp_length_column = df.select('emp_length')
emp_length_column.show(5)

In [None]:
# Show the row count for each verification_status value; this feature is kept.
verification_status_counts = df.groupby('verification_status').count()
verification_status_counts.show()

In [None]:
# Columns excluded from the baseline model: categorical features with too
# broad a set of values, highly imbalanced features, and fields that could
# cause data leakage. They can be revisited for feature extraction later, but
# they are not needed for a baseline.
#
# NOTE: this list used to end with the misspelled name 'verirication_status'.
# DataFrame.drop is a no-op for column names that are not in the schema, so
# that entry never removed anything. It has been deleted; 'verification_status'
# therefore stays in the DataFrame exactly as it did before.
remove = [
    'addr_state', 'earliest_cr_line', 'home_ownership', 'initial_list_status',
    'issue_d', 'emp_length', 'loan_status', 'purpose', 'sub_grade', 'term',
    'title', 'zip_code', 'application_type', 'desc', 'issue_month', 'id',
    'emp_title',
]
df = df.drop(*remove)

In [None]:
# Split the remaining column names by the prefix of their Spark dtype string:
# 'string...' goes to cat_cols, 'in...' or 'dou...' goes to num_cols.
# df.dtypes yields (column_name, dtype_string) pairs.
cat_cols = [name for name, dtype in df.dtypes if dtype.startswith('string')]
num_cols = [name for name, dtype in df.dtypes if dtype.startswith(('in', 'dou'))]

In [None]:
# Keep only the numeric columns identified in num_cols.
df_num = df.select(*num_cols)

In [None]:
# Convert each Row of the numeric DataFrame into a plain tuple of its values,
# giving an RDD that can be passed to Statistics.corr.
features = df_num.rdd.map(lambda record: tuple(record))

In [None]:
# Pearson correlation matrix over the numeric feature rows, wrapped in a
# pandas DataFrame (row/column labels are assigned separately).
corr_mat = Statistics.corr(features, method="pearson")
corr_df = pd.DataFrame(data=corr_mat)

In [None]:
# Label both axes of the correlation matrix with the numeric column names.
numeric_names = df_num.columns
corr_df.index = numeric_names
corr_df.columns = numeric_names

In [None]:
# Remove these numerically-typed features from both axes of the correlation
# matrix (the notebook treats them as code-like/categorical rather than numeric).
excluded_from_corr = ['dti', 'num_rev_accts', 'policy_code', 'revol_util']
corr_df = corr_df.drop(index=excluded_from_corr, columns=excluded_from_corr)

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    data=corr_df,
    cmap='bwr',
    cbar=True,
    ax=ax,
)
plt.show()