# Random Forest

In [1]:
import os
# Submit arguments for the PySpark shell: run Spark locally (local[*]).
# This is set in the environment before findspark/pyspark are initialised below.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

import findspark
# Initialise findspark before pyspark is imported in this cell.
findspark.init()

import pyspark
# SparkSession is used in the next cell to create the session.
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('random_forest')
    .getOrCreate()
)

In [3]:
# Load the affairs dataset; the first row holds column names and the
# column types are inferred from the data.
df = spark.read.csv(
    'affairs.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Report the dataset shape as (rows, columns).
shape = (df.count(), len(df.columns))
print(shape)

(6366, 6)


As the schema below shows, there are no categorical columns that need to be
converted into numerical form.

In [5]:
# Show the column names and the data types Spark inferred from the CSV.
df.printSchema()

root
 |-- rate_marriage: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- yrs_married: double (nullable = true)
 |-- children: double (nullable = true)
 |-- religious: integer (nullable = true)
 |-- affairs: integer (nullable = true)



In [6]:
# Preview the first five rows of the dataset.
df.show(5)

+-------------+----+-----------+--------+---------+-------+
|rate_marriage| age|yrs_married|children|religious|affairs|
+-------------+----+-----------+--------+---------+-------+
|            5|32.0|        6.0|     1.0|        3|      0|
|            4|22.0|        2.5|     0.0|        2|      0|
|            3|32.0|        9.0|     3.0|        3|      1|
|            3|27.0|       13.0|     3.0|        1|      1|
|            4|22.0|        2.5|     0.0|        1|      1|
+-------------+----+-----------+--------+---------+-------+
only showing top 5 rows



We can observe that the average age of the respondents is close to 29 years,
and that they have been married for about 9 years on average.

In [7]:
# Exploratory Data Analysis: summary statistics for every input column
# (the `affairs` label is left out of the selection).
summary_cols = ['summary', 'rate_marriage', 'age', 'yrs_married', 'children', 'religious']
df.describe().select(*summary_cols).show()

+-------+------------------+------------------+-----------------+------------------+------------------+
|summary|     rate_marriage|               age|      yrs_married|          children|         religious|
+-------+------------------+------------------+-----------------+------------------+------------------+
|  count|              6366|              6366|             6366|              6366|              6366|
|   mean| 4.109644989004084|29.082862079798932| 9.00942507068803|1.3968740182218033|2.4261702796104303|
| stddev|0.9614295945655025| 6.847881883668817|7.280119972766412| 1.433470828560344|0.8783688402641785|
|    min|                 1|              17.5|              0.5|               0.0|                 1|
|    max|                 5|              42.0|             23.0|               5.5|                 4|
+-------+------------------+------------------+-----------------+------------------+------------------+



About 32% of the respondents (2,053 out of 6,366) are involved in some
sort of extramarital affair.

In [8]:
# Class balance of the target: number of rows for each value of `affairs`.
df.groupBy('affairs').count().show()

+-------+-----+
|affairs|count|
+-------+-----+
|      1| 2053|
|      0| 4313|
+-------+-----+



In [9]:
# Number of rows for each marriage rating.
df.groupBy('rate_marriage').count().show()

+-------------+-----+
|rate_marriage|count|
+-------------+-----+
|            1|   99|
|            3|  993|
|            5| 2684|
|            4| 2242|
|            2|  348|
+-------------+-----+



Let’s drill down a little bit further to
understand if the marriage rating is related to the affair variable or not. 

Clearly, the figures indicate a high percentage of people having affairs
when they rate their marriages low (about 74%, 63%, 55%, 32%, and 18% for
ratings 1 through 5, respectively). This might prove to be a useful feature for
the prediction.

In [10]:
# Cross-tabulate the marriage rating against the affairs label, sorted ascending.
(
    df.groupBy('rate_marriage', 'affairs')
    .count()
    .orderBy('rate_marriage', 'affairs', 'count', ascending=True)
    .show()
)

+-------------+-------+-----+
|rate_marriage|affairs|count|
+-------------+-------+-----+
|            1|      0|   25|
|            1|      1|   74|
|            2|      0|  127|
|            2|      1|  221|
|            3|      0|  446|
|            3|      1|  547|
|            4|      0| 1518|
|            4|      1|  724|
|            5|      0| 2197|
|            5|      1|  487|
+-------------+-------+-----+



The religious rating tells a similar story: people who rated themselves
lower on the religious scale show a higher percentage of affair
involvement.

In [11]:
# Cross-tabulate the religious rating against the affairs label, sorted ascending.
(
    df.groupBy('religious', 'affairs')
    .count()
    .orderBy('religious', 'affairs', 'count', ascending=True)
    .show()
)

+---------+-------+-----+
|religious|affairs|count|
+---------+-------+-----+
|        1|      0|  613|
|        1|      1|  408|
|        2|      0| 1448|
|        2|      1|  819|
|        3|      0| 1715|
|        3|      1|  707|
|        4|      0|  537|
|        4|      1|  119|
+---------+-------+-----+



The table below does not clearly indicate any trend in the relation
between the number of children and the chances of being
involved in an affair.

In [12]:
# Cross-tabulate the number of children against the affairs label, sorted ascending.
(
    df.groupBy('children', 'affairs')
    .count()
    .orderBy('children', 'affairs', 'count', ascending=True)
    .show()
)

+--------+-------+-----+
|children|affairs|count|
+--------+-------+-----+
|     0.0|      0| 1912|
|     0.0|      1|  502|
|     1.0|      0|  747|
|     1.0|      1|  412|
|     2.0|      0|  873|
|     2.0|      1|  608|
|     3.0|      0|  460|
|     3.0|      1|  321|
|     4.0|      0|  197|
|     4.0|      1|  131|
|     5.5|      0|  124|
|     5.5|      1|   79|
+--------+-------+-----+



In [13]:
# Mean of every numeric column, computed separately for each `affairs` class.
df.groupBy('affairs').mean().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------+
|affairs|avg(rate_marriage)|          avg(age)|  avg(yrs_married)|     avg(children)|    avg(religious)|avg(affairs)|
+-------+------------------+------------------+------------------+------------------+------------------+------------+
|      1|3.6473453482708234|30.537018996590355|11.152459814905017|1.7289332683877252| 2.261568436434486|         1.0|
|      0| 4.329700904242986| 28.39067934152562| 7.989334569904939|1.2388128912589844|2.5045212149316023|         0.0|
+-------+------------------+------------------+------------------+------------------+------------------+------------+



## Feature Engineering

We need to assemble all of the input columns into a single vector
that will act as the input feature for the model. So, we select the input
columns that we need, combine them into a single feature vector, and name
the output vector `features`.

In [14]:
from pyspark.ml.feature import VectorAssembler

In [15]:
# Input columns that are packed, in this order, into the single `features` vector.
feature_cols = ['rate_marriage', 'age', 'yrs_married', 'children', 'religious']
df_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Drop any `features` column left over from a previous run of this cell before
# assembling. DataFrame.drop is a no-op when the column is absent, so the first
# run behaves exactly as before, while a re-run no longer fails because the
# output column already exists.
df = df_assembler.transform(df.drop("features"))

In [16]:
# Print the schema again; it now includes the assembled `features` vector column.
df.printSchema()

root
 |-- rate_marriage: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- yrs_married: double (nullable = true)
 |-- children: double (nullable = true)
 |-- religious: integer (nullable = true)
 |-- affairs: integer (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
# Show the first ten feature vectors next to their labels, without truncating
# the vector text.
features_and_label = df.select(['features', 'affairs'])
features_and_label.show(10, truncate=False)

+-----------------------+-------+
|features               |affairs|
+-----------------------+-------+
|[5.0,32.0,6.0,1.0,3.0] |0      |
|[4.0,22.0,2.5,0.0,2.0] |0      |
|[3.0,32.0,9.0,3.0,3.0] |1      |
|[3.0,27.0,13.0,3.0,1.0]|1      |
|[4.0,22.0,2.5,0.0,1.0] |1      |
|[4.0,37.0,16.5,4.0,3.0]|1      |
|[5.0,27.0,9.0,1.0,1.0] |1      |
|[4.0,27.0,9.0,0.0,2.0] |1      |
|[5.0,37.0,23.0,5.5,2.0]|1      |
|[5.0,37.0,23.0,5.5,2.0]|1      |
+-----------------------+-------+
only showing top 10 rows



In [18]:
# Keep only what the model needs: the assembled feature vector and the label.
model_df = df.select('features', 'affairs')

In [19]:
# Split 75% / 25% into training and test sets. A fixed seed keeps the split the
# same across kernel restarts, so the counts and metrics shown further down can
# be reproduced instead of changing on every run.
train_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

In [20]:
# Number of rows that landed in the training split.
train_df.count()

4811

We can see that the training and test sets are proportionally equivalent
(share of positive labels, using the counts shown below):

    -> train: 1549/(1549+3262) = 0.322
    -> test:  504/(504+1051) = 0.324

In [21]:
# Label distribution inside the training split.
train_df.groupBy('affairs').count().show()

+-------+-----+
|affairs|count|
+-------+-----+
|      1| 1549|
|      0| 3262|
+-------+-----+



In [22]:
# Label distribution inside the test split.
test_df.groupBy('affairs').count().show()

+-------+-----+
|affairs|count|
+-------+-----+
|      1|  504|
|      0| 1051|
+-------+-----+



## Build and Train Random Forest Model

In [23]:
from pyspark.ml.classification import RandomForestClassifier

### Training

In [24]:
# Configure a 50-tree random forest that predicts the `affairs` label from the
# default `features` column. The seed fixes the forest's internal randomness so
# repeated runs train the same model.
rf_estimator = RandomForestClassifier(labelCol='affairs', numTrees=50, seed=42)

# NOTE: despite its name, `rf_classifier` holds the *fitted* model returned by
# fit(); the name is kept because later cells refer to it.
rf_classifier = rf_estimator.fit(train_df)

### Evaluation

In [25]:
# Score the held-out test set. The result keeps `features` and `affairs` and adds
# the rawPrediction, probability and prediction columns shown below.
rf_predictions=rf_classifier.transform(test_df)

The first column in the predictions table is that of input features of the
test data. 

The second column is the actual label or output of the test data.

The third column (rawPrediction) represents the measure of confidence
for both possible outputs. 

The fourth column is that of conditional
probability of each class label. 

The final column is the prediction by the
random forest classifier.

In [26]:
# Preview the scored test rows (long values are truncated in the display).
rf_predictions.show()

+--------------------+-------+--------------------+--------------------+----------+
|            features|affairs|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[1.0,22.0,2.5,1.0...|      1|[14.2188952902338...|[0.28437790580467...|       1.0|
|[1.0,22.0,2.5,1.0...|      0|[16.8945378580841...|[0.33789075716168...|       1.0|
|[1.0,22.0,2.5,1.0...|      0|[16.8945378580841...|[0.33789075716168...|       1.0|
|[1.0,27.0,2.5,0.0...|      1|[18.4397123455307...|[0.36879424691061...|       1.0|
|[1.0,27.0,2.5,0.0...|      1|[18.4397123455307...|[0.36879424691061...|       1.0|
|[1.0,27.0,6.0,1.0...|      1|[15.9443596769160...|[0.31888719353832...|       1.0|
|[1.0,27.0,6.0,1.0...|      0|[16.6120043807729...|[0.33224008761545...|       1.0|
|[1.0,27.0,6.0,2.0...|      1|[18.1836007615482...|[0.36367201523096...|       1.0|
|[1.0,27.0,6.0,3.0...|      0|[15.6418369099363...|[0.31283673819872...|    

We can apply a groupBy function on the prediction
column to find out the number of predictions made for the positive and
negative classes.

In [27]:
# How many test rows were predicted as each class.
rf_predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 1255|
|       1.0|  300|
+----------+-----+



In [28]:
# Compare class probabilities with the true label and the predicted label for
# the first ten test rows, without truncating the probability vectors.
probability_view = rf_predictions.select(['probability', 'affairs', 'prediction'])
probability_view.show(10, truncate=False)

+----------------------------------------+-------+----------+
|probability                             |affairs|prediction|
+----------------------------------------+-------+----------+
|[0.28437790580467753,0.7156220941953224]|1      |1.0       |
|[0.33789075716168365,0.6621092428383164]|0      |1.0       |
|[0.33789075716168365,0.6621092428383164]|0      |1.0       |
|[0.36879424691061563,0.6312057530893843]|1      |1.0       |
|[0.36879424691061563,0.6312057530893843]|1      |1.0       |
|[0.3188871935383216,0.6811128064616784] |1      |1.0       |
|[0.3322400876154589,0.667759912384541]  |0      |1.0       |
|[0.3636720152309641,0.636327984769036]  |1      |1.0       |
|[0.31283673819872604,0.687163261801274] |0      |1.0       |
|[0.34117727657597785,0.6588227234240221]|0      |1.0       |
+----------------------------------------+-------+----------+
only showing top 10 rows



### Accuracy

In [29]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [31]:
# Accuracy on the test predictions, comparing the `affairs` label with the
# evaluator's default prediction column.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol='affairs',
    metricName='accuracy',
)
rf_accuracy = accuracy_evaluator.evaluate(rf_predictions)

In [32]:
# Report the accuracy rounded to a whole-number percentage.
print('The accuracy of RF on test data is {0:.0%}'.format(rf_accuracy))

The accuracy of RF on test data is 71%


In [33]:
# Unrounded accuracy value.
print(rf_accuracy)

0.7144694533762058


### Precision

In [34]:
# Precision on the test predictions. Note that the metric requested is
# 'weightedPrecision', not the precision of the positive class alone.
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol='affairs',
    metricName='weightedPrecision',
)
rf_precision = precision_evaluator.evaluate(rf_predictions)

In [35]:
# Report the weighted precision rounded to a whole-number percentage.
print('The precision rate on test data is {0:.0%}'.format(rf_precision))

The precision rate on test data is 70%


In [36]:
# Unrounded weighted-precision value.
rf_precision

0.6958624665325835

### AUC

In [37]:
# No metricName is passed, so the evaluator's default metric is used
# (areaUnderROC according to the PySpark documentation).
auc_evaluator = BinaryClassificationEvaluator(labelCol='affairs')
rf_auc = auc_evaluator.evaluate(rf_predictions)

In [38]:
# Unrounded value returned by the binary evaluator.
print(rf_auc)

0.7446772159545713


In [39]:
# Feature importance

In [40]:
# Importance of each input feature, keyed by its index in the `features` vector.
rf_classifier.featureImportances

SparseVector(5, {0: 0.5589, 1: 0.0321, 2: 0.2528, 3: 0.0744, 4: 0.0819})

`rate_marriage` is the most important feature from a prediction
standpoint, followed by `yrs_married`. The least significant variable seems to
be `age`.

In [41]:
# Look up which column name corresponds to each index of the `features` vector,
# read from the metadata stored on that column's schema entry.
df.schema["features"].metadata["ml_attr"]["attrs"]

{'numeric': [{'idx': 0, 'name': 'rate_marriage'},
  {'idx': 1, 'name': 'age'},
  {'idx': 2, 'name': 'yrs_married'},
  {'idx': 3, 'name': 'children'},
  {'idx': 4, 'name': 'religious'}]}

In [42]:
# Save the model 

In [46]:
# Show the current working directory. A bare `pwd` is not defined anywhere in
# this notebook; it presumably relies on IPython automagic (%pwd).
# NOTE(review): the saved output exposes an absolute local path; consider
# clearing it before sharing the notebook.
pwd

'C:\\Users\\ctw00071\\Desktop\\Desktop\\PySpark\\chapter_6_Random_Forests'

## Saving the Model
Sometimes, after training the model, we just need to call the model for
predictions, and hence it makes a lot of sense to persist the model object
and reuse it for predictions. There are two parts to this.

1. Save the ML model
2. Load the ML model

In [54]:
# Persist the fitted model. Going through write().overwrite() replaces a model
# saved by an earlier run; a plain save() raises when the target path already
# exists, which made this cell fail when the notebook was re-run.
rf_classifier.write().overwrite().save("../chapter_6_Random_Forests/RF_model")

In [56]:
from pyspark.ml.classification import RandomForestClassificationModel

In [58]:
# Reload the persisted model from the same path it was saved to.
rf=RandomForestClassificationModel.load("../chapter_6_Random_Forests/RF_model")

In [59]:
# Score the test set with the reloaded model.
# NOTE(review): the variable name is misspelled ("preditions"); it is kept
# because the next cell refers to it.
model_preditions=rf.transform(test_df)

In [60]:
# Preview the reloaded model's predictions for comparison with `rf_predictions`.
model_preditions.show()

+--------------------+-------+--------------------+--------------------+----------+
|            features|affairs|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[1.0,22.0,2.5,1.0...|      1|[14.2188952902338...|[0.28437790580467...|       1.0|
|[1.0,22.0,2.5,1.0...|      0|[16.8945378580841...|[0.33789075716168...|       1.0|
|[1.0,22.0,2.5,1.0...|      0|[16.8945378580841...|[0.33789075716168...|       1.0|
|[1.0,27.0,2.5,0.0...|      1|[18.4397123455307...|[0.36879424691061...|       1.0|
|[1.0,27.0,2.5,0.0...|      1|[18.4397123455307...|[0.36879424691061...|       1.0|
|[1.0,27.0,6.0,1.0...|      1|[15.9443596769160...|[0.31888719353832...|       1.0|
|[1.0,27.0,6.0,1.0...|      0|[16.6120043807729...|[0.33224008761545...|       1.0|
|[1.0,27.0,6.0,2.0...|      1|[18.1836007615482...|[0.36367201523096...|       1.0|
|[1.0,27.0,6.0,3.0...|      0|[15.6418369099363...|[0.31283673819872...|    