# Random Forest with Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
import pyspark
import sys

In [2]:
import pyspark.sql.functions as fn

In [3]:
import pyspark.pandas as ps

In [4]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [None]:
# Local mode
spark = SparkSession\
        .builder\
        .appName("iris")\
        .getOrCreate()

In [None]:
# yarn mode
spark = SparkSession\
        .builder\
        .master("yarn")\
        .config('spark.executor.instances','99')\
        .config('spark.executor.memory','4G')\
        .appName("iris")\
        .getOrCreate()

In [5]:
# Check spark app name
spark.sparkContext.appName

'PySparkShell'

In [6]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [7]:
ps.set_option("compute.default_index_type", "distributed")

In [8]:
# print runtime versions
# Python version
sys.version

'3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]'

In [9]:
# Spark version
spark.version

'3.2.0'

### Exploring Data

In [10]:
# load iris.csv into Spark dataframe
#df = spark.read.csv('file:///vagrant/data/classification_data.csv', header=True, inferSchema=True)
psdf = ps.read_csv('data/classification_data.csv')

In [11]:
# check the shape of the data 
psdf.shape

(46751, 12)

In [12]:
psdf.dtypes

loan_id                                             object
loan_purpose                                        object
is_first_loan                                        int32
total_credit_card_limit                              int32
avg_percentage_credit_card_limit_used_last_year    float64
saving_amount                                        int32
checking_amount                                      int32
is_employed                                          int32
yearly_salary                                        int32
age                                                  int32
dependent_number                                     int32
label                                                int32
dtype: object

In [13]:
# First 5 rows of Iris dataset
psdf.head()

Unnamed: 0,loan_id,loan_purpose,is_first_loan,total_credit_card_limit,avg_percentage_credit_card_limit_used_last_year,saving_amount,checking_amount,is_employed,yearly_salary,age,dependent_number,label
0,A_1,personal,1,7900,0.8,1103,6393,1,16400,42,4,0
1,A_2,personal,0,3300,0.29,2588,832,1,75500,56,1,0
2,A_3,personal,0,7600,0.9,1651,8868,1,59000,46,1,0
3,A_4,personal,1,3400,0.38,1269,6863,1,26000,55,8,0
4,A_5,emergency,0,2600,0.89,1310,3423,1,9700,41,4,1


In [14]:
# Exploratory Data Analysis
psdf.describe()

                                                                                

Unnamed: 0,is_first_loan,total_credit_card_limit,avg_percentage_credit_card_limit_used_last_year,saving_amount,checking_amount,is_employed,yearly_salary,age,dependent_number,label
count,46751.0,46751.0,46751.0,46751.0,46751.0,46751.0,46751.0,46751.0,46751.0,46751.0
mean,0.541443,4615.304485,0.700091,2037.636585,3520.671429,0.917328,29527.6208,41.539796,3.74484,0.346538
std,0.498285,1890.194454,0.177729,1498.671091,2160.933242,0.275389,16149.757703,12.817646,2.619153,0.475872
min,0.0,500.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0
25%,0.0,3200.0,0.58,920.0,1884.0,1.0,19200.0,32.0,2.0,0.0
50%,1.0,4500.0,0.71,1572.0,3050.0,1.0,29600.0,41.0,3.0,0.0
75%,1.0,5900.0,0.83,2907.0,4876.0,1.0,40400.0,50.0,6.0,1.0
max,1.0,13500.0,1.09,10641.0,13165.0,1.0,97200.0,79.0,8.0,1.0


In [15]:
psdf['label'].value_counts()

0    30550
1    16201
Name: label, dtype: int64

In [16]:
#df.groupBy('loan_purpose').count().show()
psdf['loan_purpose'].value_counts()

property      11388
operations    10580
personal      10458
emergency      7562
others         6763
Name: loan_purpose, dtype: int64

### Feature Engineering

In [17]:
df = psdf.to_spark()

In [18]:
loan_purpose_indexer = StringIndexer(inputCol="loan_purpose", outputCol="loan_index").fit(df)
df = loan_purpose_indexer.transform(df)

In [19]:
df.select(['loan_purpose','loan_index']).show(5,False)

+------------+----------+
|loan_purpose|loan_index|
+------------+----------+
|personal    |2.0       |
|personal    |2.0       |
|personal    |2.0       |
|personal    |2.0       |
|emergency   |3.0       |
+------------+----------+
only showing top 5 rows



In [20]:
df.columns

['loan_id',
 'loan_purpose',
 'is_first_loan',
 'total_credit_card_limit',
 'avg_percentage_credit_card_limit_used_last_year',
 'saving_amount',
 'checking_amount',
 'is_employed',
 'yearly_salary',
 'age',
 'dependent_number',
 'label',
 'loan_index']

In [21]:
feature_cols = df.columns[2:-2]
feature_cols += ['loan_index']
feature_cols

['is_first_loan',
 'total_credit_card_limit',
 'avg_percentage_credit_card_limit_used_last_year',
 'saving_amount',
 'checking_amount',
 'is_employed',
 'yearly_salary',
 'age',
 'dependent_number',
 'loan_index']

In [22]:
df_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = df_assembler.transform(df)

In [23]:
df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- is_first_loan: integer (nullable = true)
 |-- total_credit_card_limit: integer (nullable = true)
 |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)
 |-- saving_amount: integer (nullable = true)
 |-- checking_amount: integer (nullable = true)
 |-- is_employed: integer (nullable = true)
 |-- yearly_salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- dependent_number: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- loan_index: double (nullable = false)
 |-- features: vector (nullable = true)



In [24]:
df.select(['features','label']).show(10,False)

+--------------------------------------------------------+-----+
|features                                                |label|
+--------------------------------------------------------+-----+
|[1.0,7900.0,0.8,1103.0,6393.0,1.0,16400.0,42.0,4.0,2.0] |0    |
|[0.0,3300.0,0.29,2588.0,832.0,1.0,75500.0,56.0,1.0,2.0] |0    |
|[0.0,7600.0,0.9,1651.0,8868.0,1.0,59000.0,46.0,1.0,2.0] |0    |
|[1.0,3400.0,0.38,1269.0,6863.0,1.0,26000.0,55.0,8.0,2.0]|0    |
|[0.0,2600.0,0.89,1310.0,3423.0,1.0,9700.0,41.0,4.0,3.0] |1    |
|[0.0,7600.0,0.51,1040.0,2406.0,1.0,22900.0,52.0,0.0,1.0]|0    |
|[1.0,6900.0,0.82,2408.0,5556.0,1.0,34800.0,48.0,4.0,1.0]|0    |
|[0.0,5700.0,0.56,1933.0,4139.0,1.0,32500.0,64.0,2.0,2.0]|0    |
|[1.0,3400.0,0.95,3866.0,4131.0,1.0,13300.0,23.0,3.0,2.0]|0    |
|[0.0,2900.0,0.91,88.0,2725.0,1.0,21100.0,52.0,1.0,2.0]  |1    |
+--------------------------------------------------------+-----+
only showing top 10 rows



In [25]:
# select data for building model
model_df=df.select(['features','label'])

### Split Data - Train & Test sets

In [26]:
# use Random Forest to train on the training set
train_df, test_df = model_df.randomSplit([0.70, 0.30], seed=42)

In [27]:
train_df.count(), len(train_df.columns)

                                                                                

(32776, 2)

In [28]:
test_df.count(), len(test_df.columns)

(13975, 2)

### Build Random Forest Model

In [29]:
rf = RandomForestClassifier()
rf_model = rf.fit(train_df)

                                                                                

In [30]:
# predict on the test set
model_predictions = rf_model.transform(test_df)

In [31]:
# print prediction
model_predictions.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(10,[1,2,3,4,7],[...|    0|[16.6838849289974...|[0.83419424644987...|       0.0|
|(10,[1,2,3,4,7],[...|    0|[19.2409223242261...|[0.96204611621130...|       0.0|
|(10,[1,2,3,4,7],[...|    1|[5.52256473103276...|[0.27612823655163...|       1.0|
|(10,[1,2,3,4,7],[...|    0|[18.8323430040495...|[0.94161715020247...|       0.0|
|[0.0,500.0,0.59,9...|    1|[3.64939484193013...|[0.18246974209650...|       1.0|
|[0.0,500.0,0.64,1...|    1|[2.12679355088605...|[0.10633967754430...|       1.0|
|[0.0,500.0,0.69,1...|    1|[3.23323701493719...|[0.16166185074685...|       1.0|
|[0.0,500.0,0.76,5...|    1|[2.12679355088605...|[0.10633967754430...|       1.0|
|[0.0,500.0,0.77,1...|    1|[3.31208121245175...|[0.16560406062258...|       1.0|
|[0.0,500.0,0.78

### Evaluate Model

In [32]:
# Select (prediction, true label) to compute AUC
evaluator = BinaryClassificationEvaluator(
    labelCol='label')
rf_auc = evaluator.evaluate(model_predictions)

In [33]:
rf_auc

0.962829631887776

In [34]:
rf_accuracy = MulticlassClassificationEvaluator(labelCol='label',
               metricName='accuracy').evaluate(model_predictions)

In [35]:
'The accuracy of RF on test data is {0:.0%}'.format(rf_accuracy)

'The accuracy of RF on test data is 90%'

### Hyperparameter Tuning

In [36]:
evaluator = BinaryClassificationEvaluator()
rf = RandomForestClassifier()

In [None]:
#paramGrid = (ParamGridBuilder()
#             .addGrid(rf.maxDepth, [5,10,20,25,30])
#             .addGrid(rf.maxBins, [20,30,40 ])
#             .addGrid(rf.numTrees, [5, 20,50])
#             .build())

In [37]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [10,20])
             .addGrid(rf.maxBins, [20,30])
             .addGrid(rf.numTrees, [5,20])
             .build())

In [38]:
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, numFolds=5)

In [39]:
#%%time
cv_model = cv.fit(train_df)

21/11/16 04:44:39 WARN DAGScheduler: Broadcasting large task binary with size 1079.1 KiB
21/11/16 04:44:40 WARN DAGScheduler: Broadcasting large task binary with size 1638.9 KiB
21/11/16 04:44:41 WARN DAGScheduler: Broadcasting large task binary with size 1108.9 KiB
21/11/16 04:44:44 WARN DAGScheduler: Broadcasting large task binary with size 1084.1 KiB
21/11/16 04:44:44 WARN DAGScheduler: Broadcasting large task binary with size 1625.6 KiB
21/11/16 04:44:45 WARN DAGScheduler: Broadcasting large task binary with size 1124.0 KiB
21/11/16 04:44:46 WARN DAGScheduler: Broadcasting large task binary with size 1096.1 KiB
21/11/16 04:44:47 WARN DAGScheduler: Broadcasting large task binary with size 1345.1 KiB
21/11/16 04:44:47 WARN DAGScheduler: Broadcasting large task binary with size 1581.3 KiB
21/11/16 04:44:47 WARN DAGScheduler: Broadcasting large task binary with size 1788.8 KiB
21/11/16 04:44:47 WARN DAGScheduler: Broadcasting large task binary with size 1970.8 KiB
21/11/16 04:44:47 WAR

In [40]:
best_rf_model = cv_model.bestModel

In [41]:
f'Best Param(maxDepth): {best_rf_model._java_obj.getMaxDepth()}'

'Best Param(maxDepth): 10'

In [42]:
f'Best Param(maxBins): {best_rf_model._java_obj.getMaxBins()}'

'Best Param(maxBins): 30'

In [43]:
f'Best Param(NumTrees): {best_rf_model._java_obj.getNumTrees()}'

'Best Param(NumTrees): 20'

### Evaluate Tuned Model

In [44]:
# Generate predictions for entire dataset
model_predictions = best_rf_model.transform(test_df)

In [45]:
best_rf_auc = evaluator.evaluate(model_predictions)

21/11/16 04:53:10 WARN DAGScheduler: Broadcasting large task binary with size 1170.7 KiB


In [46]:
best_rf_auc

0.9690665508778045

In [47]:
true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()

21/11/16 04:53:17 WARN DAGScheduler: Broadcasting large task binary with size 1177.9 KiB


In [48]:
actual_pos=model_predictions.filter(model_predictions['label']==1).count()

In [49]:
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()

21/11/16 04:53:20 WARN DAGScheduler: Broadcasting large task binary with size 1180.1 KiB


In [50]:
#Recall 
float(true_pos)/(actual_pos)

0.9105196982397318

In [51]:
#Precision on test Data 
float(true_pos)/(pred_pos)

0.849960876369327