In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=a1d96b3121c74946baaafbad4e5879742a05b973a1dc3a238362639b942017dc
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BigDataProject').getOrCreate()

In [3]:
# read data
df = pd.read_csv('./data_cleaned.csv')
df.head()

############# checking the accuracy of the model before oversampling ###########
X_train, X_test, y_train, y_test = train_test_split(df.drop('HeartDisease', axis=1), df['HeartDisease'], test_size=0.2, random_state=42)
# create a classifier
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
# fit the classifier to the training data
clf.fit(X_train, y_train)
# predict the test data
y_pred = clf.predict(X_test)
# show the accuracy score
print('Accuracy score before oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix before oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report before oversampling: \n', classification_report(y_test, y_pred))


############# checking the accuracy of the model after oversampling ############
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# show the shape of the resampled data
print(X_resampled.shape)
print(y_resampled.shape)
# use the resampled data to train the classifier
clf.fit(X_resampled, y_resampled)
# predict the test data
y_pred = clf.predict(X_test)
# show the accuracy score
print('Accuracy score after oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test, y_pred))


############# combining the features in one column ############
train_df = pd.concat([X_resampled, y_resampled], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
df_resampled = pd.concat([train_df, test_df], ignore_index=True)
df_sk = spark.createDataFrame(df_resampled)
# df_sk = spark.createDataFrame(df)
# Define a function to convert sparse vectors to dense vectors
def sparse_to_dense(vector):
    return Vectors.dense(vector)

# Register the function as a UDF
sparse_to_dense_udf = udf(sparse_to_dense, VectorUDT())

# Combine all feature columns into a single vector column
assembler = VectorAssembler(inputCols=df_sk.drop('HeartDisease').columns, outputCol='Fvec')
df_sk = assembler.transform(df_sk)
df_sk.show(5, truncate=False)
df_sk = df_sk.withColumn('DenseFvec', sparse_to_dense_udf(df_sk['Fvec']))
df_sk.show(5, truncate=False)
df_sk = df_sk.drop('Fvec')
df_sk = df_sk.select('HeartDisease', 'DenseFvec')
# Split the data into training and testing sets
train_data, test_data = df_sk.randomSplit([0.8, 0.2], seed=42)

# Extract the label column
label_col = 'HeartDisease'

# Extract the training and testing labels
train_labels = train_data.select(label_col)
test_labels = test_data.select(label_col)
train_labels.show(5)

train_data.show(5)

Accuracy score before oversampling:  0.9125689895089042
Confusion matrix before oversampling: 
 [[58367     0]
 [ 5592     0]]
Classification report before oversampling: 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     58367
           1       0.00      0.00      0.00      5592

    accuracy                           0.91     63959
   macro avg       0.46      0.50      0.48     63959
weighted avg       0.83      0.91      0.87     63959



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(468110, 17)
(468110,)
Accuracy score after oversampling:  0.7250113353867321
Confusion matrix after oversampling: 
 [[42175 16192]
 [ 1396  4196]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.97      0.72      0.83     58367
           1       0.21      0.75      0.32      5592

    accuracy                           0.73     63959
   macro avg       0.59      0.74      0.58     63959
weighted avg       0.90      0.73      0.78     63959

+-----+-------+---------------+------+--------------+------------+-----------+---+-----------+----+--------+----------------+---------+---------+------+-------------+----------+------------+----------------------------------------------------------------------------+
|BMI  |Smoking|AlcoholDrinking|Stroke|PhysicalHealth|MentalHealth|DiffWalking|Sex|AgeCategory|Race|Diabetic|PhysicalActivity|GenHealth|SleepTime|Asthma|KidneyDisease|SkinCancer|HeartDisease|Fvec                   

In [4]:
print(train_labels.count())

425414


# Models

In [10]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LinearSVC
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## -----------------Logistic Regression-----------------

In [11]:
lr = LogisticRegression(maxIter=10, regParam=0.01,featuresCol='DenseFvec', labelCol='HeartDisease')
model = lr.fit(train_data)
predictions = model.evaluate(test_data)

evaluator1 = MulticlassClassificationEvaluator(labelCol='HeartDisease', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator1.evaluate(predictions.predictions)
print(f'Test Accuracy = {accuracy}')
# generate the confusion matrix
confusion_matrix_lr = predictions.predictions.groupBy('HeartDisease').pivot('prediction').count().na.fill(0)
confusion_matrix_lr.show()


Test Accuracy = 0.7611645023674465
+------------+-----+-----+
|HeartDisease|  0.0|  1.0|
+------------+-----+-----+
|           0|46449|12186|
|           1|13287|34733|
+------------+-----+-----+



In [12]:
evaluator2 = MulticlassClassificationEvaluator(labelCol='HeartDisease', predictionCol='prediction', metricName='f1')

# calculate the f1 score for each class
f1 = evaluator2.evaluate(predictions.predictions, {evaluator.metricName: "f1ByLabel"})

# calculate the macro avg f1 score
macro_avg_f1 = evaluator2.evaluate(predictions.predictions, {evaluator.metricName: "weightedFMeasure", evaluator.weightCol: "label"})

# calculate the micro avg f1 score
micro_avg_f1 = evaluator2.evaluate(predictions.predictions, {evaluator.metricName: "f1"})

print("F1 score for each class:", f1)
print("Macro avg F1 score:", macro_avg_f1)
print("Micro avg F1 score:", micro_avg_f1)

F1 score for each class: 0.76089036036038
Macro avg F1 score: 0.76089036036038
Micro avg F1 score: 0.76089036036038


In [13]:
y_test = test_labels.toPandas().values
y_test_ = y_test.flatten().tolist()
y_pred = predictions.predictions.select('prediction').toPandas().values
y_pred_ = y_pred.flatten().tolist()
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test_, y_pred_))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test_, y_pred_))

Confusion matrix after oversampling: 
 [[46449 12186]
 [13287 34733]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.78      0.79      0.78     58635
           1       0.74      0.72      0.73     48020

    accuracy                           0.76    106655
   macro avg       0.76      0.76      0.76    106655
weighted avg       0.76      0.76      0.76    106655



## -----------------Naive Bayes-----------------


In [14]:
# Use the MLlib API to create and train a Naive Bayes model on the training data
nb = NaiveBayes(featuresCol='DenseFvec', labelCol='HeartDisease')
nbModel = nb.fit(train_data)

# Use the model to make predictions on the test data
predictions_nb = nbModel.transform(test_data)

# Evaluate the model's performance using various metrics
evaluator_acc = MulticlassClassificationEvaluator(labelCol="HeartDisease", predictionCol="prediction", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="HeartDisease", predictionCol="prediction", metricName="f1")

accuracy = evaluator_acc.evaluate(predictions_nb)
f1_score = evaluator_f1.evaluate(predictions_nb)

print("Accuracy: ", accuracy)
print("F1 score: ", f1_score)


Accuracy:  0.6541465472786085
F1 score:  0.6325471656455405


In [15]:
y_pred = predictions_nb.select('prediction').toPandas().values
y_pred_ = y_pred.flatten().tolist()
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test_, y_pred_))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test_, y_pred_))

<class 'numpy.ndarray'>
Confusion matrix after oversampling: 
 [[50550  8085]
 [28802 19218]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.64      0.86      0.73     58635
           1       0.70      0.40      0.51     48020

    accuracy                           0.65    106655
   macro avg       0.67      0.63      0.62    106655
weighted avg       0.67      0.65      0.63    106655



## -----------------SVM-----------------


In [16]:
# Use the MLlib API to create and train a Linear SVM model on the training data
svm = LinearSVC(featuresCol='DenseFvec', labelCol='HeartDisease', maxIter=10)
svmModel = svm.fit(train_data)
# Use the model to make predictions on the test data
predictions_svm = svmModel.transform(test_data)

# Evaluate the model's performance using various metrics
evaluator_acc = MulticlassClassificationEvaluator(labelCol="HeartDisease", predictionCol="prediction", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="HeartDisease", predictionCol="prediction", metricName="f1")

accuracy = evaluator_acc.evaluate(predictions_svm)
f1_score = evaluator_f1.evaluate(predictions_svm)

print("Accuracy: ", accuracy)
print("F1 score: ", f1_score)

Accuracy:  0.762608410294876
F1 score:  0.762786271251688


In [17]:
y_pred = predictions_svm.select('prediction').toPandas().values
y_pred_ = y_pred.flatten().tolist()
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test_, y_pred_))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test_, y_pred_))

<class 'numpy.ndarray'>
Confusion matrix after oversampling: 
 [[45542 13093]
 [12226 35794]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.79      0.78      0.78     58635
           1       0.73      0.75      0.74     48020

    accuracy                           0.76    106655
   macro avg       0.76      0.76      0.76    106655
weighted avg       0.76      0.76      0.76    106655

