<a href="https://colab.research.google.com/github/malikbaqi12/Applied-data-science-using-pyspark-code-files/blob/main/Exercise_3_2_Model_based_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Default Parameters

In [31]:
filename = "bank-full.csv"
target_variable_name = "y"

# Load Dataset

In [32]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv(filename, header=True, inferSchema=True, sep=';')
df.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

# Identify variable types

In [34]:
def variable_type(df):
    
    vars_list = df.dtypes
    char_vars = []
    num_vars = []
    for i in vars_list:
        if i[1] in ('string'):
            char_vars.append(i[0])
        else:
            num_vars.append(i[0])
    
    return char_vars, num_vars

In [35]:
char_vars, num_vars = variable_type(df)

In [36]:
char_vars

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

In [37]:
num_vars

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [38]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

def category_to_index(df, char_vars):
    
    char_df = df.select(char_vars)
    indexers = [StringIndexer(inputCol=c, outputCol=c+"_index", handleInvalid="keep") for c in char_df.columns]
    pipeline = Pipeline(stages=indexers)
    char_labels = pipeline.fit(char_df)
    df = char_labels.transform(df)
    return df, char_labels

In [39]:
df, char_labels = category_to_index(df, char_vars)

In [40]:
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string'),
 ('job_index', 'double'),
 ('marital_index', 'double'),
 ('education_index', 'double'),
 ('default_index', 'double'),
 ('housing_index', 'double'),
 ('loan_index', 'double'),
 ('contact_index', 'double'),
 ('month_index', 'double'),
 ('poutcome_index', 'double'),
 ('y_index', 'double')]

In [41]:
df = df.select([c for c in df.columns if c not in char_vars])

In [42]:
from pyspark.sql.functions import col

def rename_columns(df, char_vars):
    mapping = dict(zip([i + '_index' for i in char_vars], char_vars))
    df = df.select([col(c).alias(mapping.get(c, c)) for c in df.columns])
    return df

In [43]:
df = rename_columns(df, char_vars)

In [44]:
df.dtypes

[('age', 'int'),
 ('balance', 'int'),
 ('day', 'int'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('job', 'double'),
 ('marital', 'double'),
 ('education', 'double'),
 ('default', 'double'),
 ('housing', 'double'),
 ('loan', 'double'),
 ('contact', 'double'),
 ('month', 'double'),
 ('poutcome', 'double'),
 ('y', 'double')]

In [45]:
df.groupBy('y').count().show() 

+---+-----+
|  y|count|
+---+-----+
|0.0|39922|
|1.0| 5289|
+---+-----+



# Assemble input vectors

In [46]:
from pyspark.ml.feature import VectorAssembler

#assemble individual columns to one column - 'features'
def assemble_vectors(df, features_list, target_variable_name):
    stages = []
    #assemble vectors
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    stages = [assembler]
    #select all the columns + target + newly created 'features' column
    selectedCols = [target_variable_name, 'features'] + features_list
    #use pipeline to process sequentially
    pipeline = Pipeline(stages=stages)
    #assembler model
    assembleModel = pipeline.fit(df)
    #apply assembler model on data
    df = assembleModel.transform(df).select(selectedCols)

    return df

In [47]:
#exclude target variable and select all other feature vectors
features_list = df.columns
features_list.remove(target_variable_name)

In [48]:
features_list

['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [49]:
# apply the function on our dataframe
df = assemble_vectors(df, features_list, target_variable_name)

In [50]:
df.show()

+---+--------------------+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+
|  y|            features|age|balance|day|duration|campaign|pdays|previous| job|marital|education|default|housing|loan|contact|month|poutcome|
+---+--------------------+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+
|0.0|(16,[0,1,2,3,4,5,...| 58|   2143|  5|     261|       1|   -1|       0| 1.0|    0.0|      1.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|
|0.0|(16,[0,1,2,3,4,5,...| 44|     29|  5|     151|       1|   -1|       0| 2.0|    1.0|      0.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|
|0.0|(16,[0,1,2,3,4,5,...| 33|      2|  5|      76|       1|   -1|       0| 7.0|    0.0|      0.0|    0.0|    0.0| 1.0|    1.0|  0.0|     0.0|
|0.0|(16,[0,1,2,3,4,5,...| 47|   1506|  5|      92|       1|   -1|       0| 0.0|    0.0|      3.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|

In [51]:
df.schema["features"].metadata["ml_attr"]["attrs"]

{'numeric': [{'idx': 0, 'name': 'age'},
  {'idx': 1, 'name': 'balance'},
  {'idx': 2, 'name': 'day'},
  {'idx': 3, 'name': 'duration'},
  {'idx': 4, 'name': 'campaign'},
  {'idx': 5, 'name': 'pdays'},
  {'idx': 6, 'name': 'previous'}],
 'nominal': [{'vals': ['blue-collar',
    'management',
    'technician',
    'admin.',
    'services',
    'retired',
    'self-employed',
    'entrepreneur',
    'unemployed',
    'housemaid',
    'student',
    'unknown',
    '__unknown'],
   'idx': 7,
   'name': 'job'},
  {'vals': ['married', 'single', 'divorced', '__unknown'],
   'idx': 8,
   'name': 'marital'},
  {'vals': ['secondary', 'tertiary', 'primary', 'unknown', '__unknown'],
   'idx': 9,
   'name': 'education'},
  {'vals': ['no', 'yes', '__unknown'], 'idx': 10, 'name': 'default'},
  {'vals': ['yes', 'no', '__unknown'], 'idx': 11, 'name': 'housing'},
  {'vals': ['no', 'yes', '__unknown'], 'idx': 12, 'name': 'loan'},
  {'vals': ['cellular', 'unknown', 'telephone', '__unknown'],
   'idx': 13,


In [52]:
import pandas as pd
for k, v in df.schema["features"].metadata["ml_attr"]["attrs"].items():
    features_df = pd.DataFrame(v)

In [53]:
features_df

Unnamed: 0,vals,idx,name
0,"[blue-collar, management, technician, admin., ...",7,job
1,"[married, single, divorced, __unknown]",8,marital
2,"[secondary, tertiary, primary, unknown, __unkn...",9,education
3,"[no, yes, __unknown]",10,default
4,"[yes, no, __unknown]",11,housing
5,"[no, yes, __unknown]",12,loan
6,"[cellular, unknown, telephone, __unknown]",13,contact
7,"[may, jul, aug, jun, nov, apr, feb, jan, oct, ...",14,month
8,"[unknown, failure, other, success, __unknown]",15,poutcome


# Model based feature selection

# Question 1: Implement decision tree feature importance. Compare and contrast with Random Forest output.

In [54]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol='features', labelCol=target_variable_name)
dt_model = dt.fit(df)
dt_model.featureImportances

SparseVector(16, {1: 0.003, 2: 0.0023, 3: 0.5507, 8: 0.0017, 9: 0.0009, 11: 0.0055, 13: 0.0049, 14: 0.118, 15: 0.3131})

In [55]:
#temporary output rf_output
dt_output = dt_model.featureImportances
features_df['Decision_Tree'] = features_df['idx'].apply(lambda x: dt_output[x] if x in dt_output.indices else 0)

In [56]:
#sort values based on descending importance feature
features_df.sort_values("Decision_Tree", ascending=False, inplace=True)

In [57]:
features_df

Unnamed: 0,vals,idx,name,Decision_Tree
8,"[unknown, failure, other, success, __unknown]",15,poutcome,0.313066
7,"[may, jul, aug, jun, nov, apr, feb, jan, oct, ...",14,month,0.118036
4,"[yes, no, __unknown]",11,housing,0.005458
6,"[cellular, unknown, telephone, __unknown]",13,contact,0.00491
1,"[married, single, divorced, __unknown]",8,marital,0.0017
2,"[secondary, tertiary, primary, unknown, __unkn...",9,education,0.000894
0,"[blue-collar, management, technician, admin., ...",7,job,0.0
3,"[no, yes, __unknown]",10,default,0.0
5,"[no, yes, __unknown]",12,loan,0.0


# Question 2: Implement gradient boosted tree feature importance. Compare and contrast with Random Forest output.

In [29]:
from pyspark.ml.classification import GBTClassifier
gb = GBTClassifier(featuresCol='features', labelCol=target_variable_name)
gb_model = gb.fit(df)
gb_model.featureImportances

SparseVector(16, {0: 0.0314, 1: 0.01, 2: 0.083, 3: 0.3048, 4: 0.0081, 5: 0.0397, 6: 0.0012, 7: 0.0271, 8: 0.0125, 9: 0.0054, 10: 0.0013, 11: 0.0543, 12: 0.0016, 13: 0.0612, 14: 0.2984, 15: 0.0599})

In [30]:
#temporary output rf_output
gb_output = gb_model.featureImportances
features_df['Gradient Boosting'] = features_df['idx'].apply(lambda x: gb_output[x] if x in gb_output.indices else 0)

In [60]:
#sort values based on descending importance feature
#features_df.sort_values("Gradient Boosting", ascending=True, inplace=True)
print(features_df.columns)


Index(['vals', 'idx', 'name', 'Decision_Tree'], dtype='object')


In [61]:
features_df

Unnamed: 0,vals,idx,name,Decision_Tree
8,"[unknown, failure, other, success, __unknown]",15,poutcome,0.313066
7,"[may, jul, aug, jun, nov, apr, feb, jan, oct, ...",14,month,0.118036
4,"[yes, no, __unknown]",11,housing,0.005458
6,"[cellular, unknown, telephone, __unknown]",13,contact,0.00491
1,"[married, single, divorced, __unknown]",8,marital,0.0017
2,"[secondary, tertiary, primary, unknown, __unkn...",9,education,0.000894
0,"[blue-collar, management, technician, admin., ...",7,job,0.0
3,"[no, yes, __unknown]",10,default,0.0
5,"[no, yes, __unknown]",12,loan,0.0


# Question 3: Implement logistic regression feature importance. Compare and contrast with Random Forest output. (Hint: Use the coefficient of logistic regression. Note: Logistic regression produces Dense Vector instead of sparse vector)

In [62]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features', labelCol=target_variable_name)
lr_model = lr.fit(df)

In [64]:
lr_model.coefficientMatrix


DenseMatrix(3, 16, [-0.0049, 0.0, -0.0003, 0.0016, -0.0542, -0.0007, -0.0239, -0.0073, ..., -0.3735, 0.0354, 0.0214, -1.6384, 1.2687, 0.3493, -0.2454, -0.3946], 1)

In [67]:
lr_output = lr_model.coefficientMatrix
# absolute value is used to convert the negative coefficients. This should be done only for feature importance.
features_df['Logistic Regression'] = features_df['idx'].apply(lambda x: abs(lr_output[1, x]))


In [68]:
features_df.sort_values("Logistic Regression", ascending=False, inplace=True)

In [69]:
features_df

Unnamed: 0,vals,idx,name,Decision_Tree,Logistic Regression
4,"[yes, no, __unknown]",11,housing,0.005458,1.221455
5,"[no, yes, __unknown]",12,loan,0.0,0.911572
8,"[unknown, failure, other, success, __unknown]",15,poutcome,0.313066,0.618026
6,"[cellular, unknown, telephone, __unknown]",13,contact,0.00491,0.349746
1,"[married, single, divorced, __unknown]",8,marital,0.0017,0.270534
7,"[may, jul, aug, jun, nov, apr, feb, jan, oct, ...",14,month,0.118036,0.199362
3,"[no, yes, __unknown]",10,default,0.0,0.102095
2,"[secondary, tertiary, primary, unknown, __unkn...",9,education,0.000894,0.01869
0,"[blue-collar, management, technician, admin., ...",7,job,0.0,0.01331


# Random forest addition for voting based selection

In [70]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol='features', labelCol=target_variable_name)
rf_model = rf.fit(df)
rf_model.featureImportances

SparseVector(16, {0: 0.0378, 1: 0.0029, 2: 0.0043, 3: 0.4265, 4: 0.0029, 5: 0.0386, 6: 0.029, 7: 0.0066, 8: 0.0025, 9: 0.0019, 11: 0.0115, 12: 0.0015, 13: 0.0254, 14: 0.1585, 15: 0.2501})

In [71]:
#temporary output rf_output
rf_output = rf_model.featureImportances
features_df['Random Forest'] = features_df['idx'].apply(lambda x: rf_output[x] if x in rf_output.indices else 0)

In [72]:
#sort values based on descending importance feature
features_df.sort_values("Random Forest", ascending=False, inplace=True)

In [73]:
features_df

Unnamed: 0,vals,idx,name,Decision_Tree,Logistic Regression,Random Forest
8,"[unknown, failure, other, success, __unknown]",15,poutcome,0.313066,0.618026,0.250116
7,"[may, jul, aug, jun, nov, apr, feb, jan, oct, ...",14,month,0.118036,0.199362,0.158498
6,"[cellular, unknown, telephone, __unknown]",13,contact,0.00491,0.349746,0.025356
4,"[yes, no, __unknown]",11,housing,0.005458,1.221455,0.011549
0,"[blue-collar, management, technician, admin., ...",7,job,0.0,0.01331,0.006636
1,"[married, single, divorced, __unknown]",8,marital,0.0017,0.270534,0.002516
2,"[secondary, tertiary, primary, unknown, __unkn...",9,education,0.000894,0.01869,0.001924
5,"[no, yes, __unknown]",12,loan,0.0,0.911572,0.001522
3,"[no, yes, __unknown]",10,default,0.0,0.102095,0.0


# Voting based selection

In [74]:
features_df.drop('idx', axis=1, inplace=True)


In [86]:
features_df.columns

Index(['vals', 'name', 'Decision_Tree', 'Logistic Regression',
       'Random Forest'],
      dtype='object')

In [88]:
num_top_features = 7
columns = ['Decision_Tree', 'Logistic Regression', 'Random Forest']
balance = pd.DataFrame({},[])
balance['name'] = features_df['name']
for i in columns:
    balance[i] = features_df['name'].isin(list(features_df.nlargest(num_top_features,i)['name'])).astype(int)

In [90]:
score_table['final_score'] = score_table.sum(axis=1)
score_table.sort_values('final_score',ascending=0)

Unnamed: 0,final_score
