## Importing Libraries

In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
#from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.master("local[*]").getOrCreate()

23/09/24 16:49:07 WARN Utils: Your hostname, MacBook-Air-de-Mauricio-5.local resolves to a loopback address: 127.0.0.1; using 192.168.100.74 instead (on interface en0)
23/09/24 16:49:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/24 16:49:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import numpy as np
import pandas as pd

## Loading data

In [3]:
data = spark.read.csv('train.csv', header = True, inferSchema = True)
data.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

## Preparing the data

> ### Splitting the data

We will use 80% of data as training data, and 20% as testing data.

In [64]:
splits = data.randomSplit([0.8,0.2])
train = splits[0]
test = splits[1]
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

Training Rows: 710  Testing Rows: 181


> ### Defining our pipeline for five relevant features (Age,Fare,Sex,Pclass,SibSp)

In [65]:
#Importing libraries
from pyspark.ml.feature import Imputer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes

# Create an Imputer transformer to fill null values with median
imputer = Imputer(inputCols=["Age", "Fare"], outputCols=["Age_imputed", "Fare_imputed"], strategy="median")

# Modify the VectorAssembler to include the imputed columns
numVect = VectorAssembler(inputCols=["Fare_imputed", "Age_imputed"], outputCol="numFeatures")

# Define the pipeline stages
strIdx = StringIndexer(inputCol="Sex", outputCol="SexIdx")
catVect = VectorAssembler(inputCols=["Pclass", "SexIdx", "SibSp"], outputCol="catFeatures")
catIdx = VectorIndexer(inputCol=catVect.getOutputCol(), outputCol="idxCatFeatures")
minMax = MinMaxScaler(inputCol=numVect.getOutputCol(), outputCol="normFeatures")
featVect = VectorAssembler(inputCols=["idxCatFeatures", "normFeatures"], outputCol="features")

#Logistic Regression
lr = LogisticRegression(labelCol="Survived", featuresCol="features", maxIter=10, regParam=0.3)
nb = NaiveBayes(labelCol = "Survived",featuresCol="features",smoothing=1.0)
rf = RandomForestClassifier(labelCol = "Survived",featuresCol="features",numTrees=10, maxDepth=5, seed=42)
# Define the pipeline
pipeline_lr = Pipeline(stages=[strIdx, catVect, catIdx, imputer, numVect, minMax, featVect, lr])
pipeline_nb = Pipeline(stages=[strIdx, catVect, catIdx, imputer, numVect, minMax, featVect, nb])
pipeline_rf = Pipeline(stages=[strIdx, catVect, catIdx, imputer, numVect, minMax, featVect, rf])

### Training the model

In [66]:
pipline_model_lr = pipeline_lr.fit(train)
pipline_model_nb = pipeline_nb.fit(train)
pipline_model_rf = pipeline_rf.fit(train)

### Generate label predictions with logistic regression

In [87]:
prediction_lr = pipline_model_lr.transform(test)
print(prediction_lr.columns)

predicted_lr = prediction_lr.select("features","prediction","Survived")
predicted_lr.show(10,truncate = False)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'SexIdx', 'catFeatures', 'idxCatFeatures', 'Age_imputed', 'Fare_imputed', 'numFeatures', 'normFeatures', 'features', 'rawPrediction', 'probability', 'prediction']
+------------------------------------------------------+----------+--------+
|features                                              |prediction|Survived|
+------------------------------------------------------+----------+--------+
|[0.0,1.0,1.0,0.13913573538264068,0.4722292033174164]  |1.0       |1       |
|[2.0,1.0,0.0,0.015468569817999833,0.32143754712239253]|0.0       |1       |
|[2.0,0.0,1.0,0.06104473451835265,0.4847951746670017]  |0.0       |0       |
|[1.0,0.0,0.0,0.025374310111545468,0.35913546117114853]|0.0       |1       |
|[2.0,1.0,1.0,0.03513366015444757,0.38426740387031916] |0.0       |0       |
|[0.0,1.0,1.0,0.2859895551532101,0.35913546117114853]  |1.0       |1       |
|[0.0,0.0,1.0,0.10149724044

### Acuraccy, Recall, F1-Score for logistic regression

In [104]:
# Count the number of correct predictions
correct_predictions_lr = predicted_lr.filter(predicted_lr.prediction == predicted_lr.Survived).count()

# Calculate the total number of predictions
total_predictions_lr = predicted_rf.count()

# Calculate the accuracy
accuracy_lr = correct_predictions_lr / total_predictions_lr

# Count the number of true positives
true_positives_lr = predicted_lr.filter((predicted_lr.prediction == 1) & (predicted_lr.Survived == 1)).count()

# Count the number of false negatives
false_negatives_lr = predicted_lr.filter((predicted_lr.prediction == 0) & (predicted_lr.Survived == 1)).count()

# Calculate the recall
recall_lr = true_positives_lr / (true_positives_lr + false_negatives_lr)

print("Accuracy of random forest:", accuracy_lr)
print("Recall of random forest:", recall_rf)
f1_score = 2 * tp / (2 * tp + fp + fn)
print("F1 score for logistic regression:", f1_score)

Accuracy of random forest: 0.7569060773480663
Recall of random forest: 0.6756756756756757
F1 score for logistic regression: 0.7101449275362319


### Generate label predictions with Naive bayes

In [86]:
prediction_nb = pipline_model_nb.transform(test)
print(prediction_nb.columns)

predicted_nb = prediction_nb.select("features","prediction","Survived")
predicted_nb.show(10,truncate = False)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'SexIdx', 'catFeatures', 'idxCatFeatures', 'Age_imputed', 'Fare_imputed', 'numFeatures', 'normFeatures', 'features', 'rawPrediction', 'probability', 'prediction']
+------------------------------------------------------+----------+--------+
|features                                              |prediction|Survived|
+------------------------------------------------------+----------+--------+
|[0.0,1.0,1.0,0.13913573538264068,0.4722292033174164]  |1.0       |1       |
|[2.0,1.0,0.0,0.015468569817999833,0.32143754712239253]|1.0       |1       |
|[2.0,0.0,1.0,0.06104473451835265,0.4847951746670017]  |0.0       |0       |
|[1.0,0.0,0.0,0.025374310111545468,0.35913546117114853]|0.0       |1       |
|[2.0,1.0,1.0,0.03513366015444757,0.38426740387031916] |1.0       |0       |
|[0.0,1.0,1.0,0.2859895551532101,0.35913546117114853]  |1.0       |1       |
|[0.0,0.0,1.0,0.10149724044

### Acuraccy, Recall and F1-Score for Naive Bayes

In [105]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Make predictions on the test data
prediction_nb = pipline_model_nb.transform(test)

# Create a MulticlassClassificationEvaluator object
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")

# Calculate the accuracy
accuracy = evaluator.evaluate(prediction_nb)

# Create a BinaryClassificationEvaluator object
evaluator = BinaryClassificationEvaluator(labelCol="Survived", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# Calculate the F1 score
tp = prediction_nb.filter((prediction_nb.Survived == 1) & (prediction_nb.prediction == 1)).count()
fp = prediction_nb.filter((prediction_nb.Survived == 0) & (prediction_nb.prediction == 1)).count()
fn = prediction_nb.filter((prediction_nb.Survived == 1) & (prediction_nb.prediction == 0)).count()
recall = tp / (tp + fn)
f1_score = 2 * tp / (2 * tp + fp + fn)

print("Accuracy for logistic regression:", accuracy)
print("Recall for logistic regression:", recall)
print("F1 score for logistic regression:", f1_score)


Accuracy for logistic regression: 0.7790055248618785
Recall for logistic regression: 0.6621621621621622
F1 score for logistic regression: 0.7101449275362319


### Generate predictions with Random Forest

In [88]:
prediction_rf = pipline_model_rf.transform(test)
print(prediction_rf.columns)

predicted_rf = prediction_rf.select("features","prediction","Survived")
predicted_rf.show(10,truncate = False)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'SexIdx', 'catFeatures', 'idxCatFeatures', 'Age_imputed', 'Fare_imputed', 'numFeatures', 'normFeatures', 'features', 'rawPrediction', 'probability', 'prediction']
+------------------------------------------------------+----------+--------+
|features                                              |prediction|Survived|
+------------------------------------------------------+----------+--------+
|[0.0,1.0,1.0,0.13913573538264068,0.4722292033174164]  |1.0       |1       |
|[2.0,1.0,0.0,0.015468569817999833,0.32143754712239253]|1.0       |1       |
|[2.0,0.0,1.0,0.06104473451835265,0.4847951746670017]  |0.0       |0       |
|[1.0,0.0,0.0,0.025374310111545468,0.35913546117114853]|0.0       |1       |
|[2.0,1.0,1.0,0.03513366015444757,0.38426740387031916] |1.0       |0       |
|[0.0,1.0,1.0,0.2859895551532101,0.35913546117114853]  |1.0       |1       |
|[0.0,0.0,1.0,0.10149724044

### Acuraccy, Recall and F1 Score for Random Forest

In [102]:
# Count the number of correct predictions
correct_predictions_rf = predicted_rf.filter(predicted_rf.prediction == predicted_rf.Survived).count()

# Calculate the total number of predictions
total_predictions_rf = predicted_rf.count()

# Calculate the accuracy
accuracy_rf = correct_predictions_rf / total_predictions_rf

# Count the number of true positives
true_positives_rf = predicted_rf.filter((predicted_rf.prediction == 1) & (predicted_rf.Survived == 1)).count()

# Count the number of false negatives
false_negatives_rf = predicted_rf.filter((predicted_rf.prediction == 0) & (predicted_rf.Survived == 1)).count()

# Calculate the recall
recall_rf = true_positives_rf / (true_positives_rf + false_negatives_rf)

print("Accuracy of random forest:", accuracy_rf)
print("Recall of random forest:", recall_rf)
f1_score = 2 * tp / (2 * tp + fp + fn)
print("F1 score for logistic regression:", f1_score)

Accuracy of random forest: 0.8066298342541437
Recall of random forest: 0.6756756756756757
F1 score for logistic regression: 0.7101449275362319
