In [2]:
!pip install pyspark
!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=39c085ccd6a2e8e2749cefc2bb8d8daca88ec5ee73477b9d121d81b0fe8c25c2
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 k

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext
spark = SparkSession.builder.appName('BigDataProject').getOrCreate()
sc = spark.sparkContext

In [4]:
# read data
df = pd.read_csv('./data_cleaned.csv')
df.head()

############# checking the accuracy of the model before oversampling ###########
X_train, X_test, y_train, y_test = train_test_split(df.drop('HeartDisease', axis=1), df['HeartDisease'], test_size=0.2, random_state=42)
# create a classifier
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
# fit the classifier to the training data
clf.fit(X_train, y_train)
# predict the test data
y_pred = clf.predict(X_test)
# show the accuracy score
print('Accuracy score before oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix before oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report before oversampling: \n', classification_report(y_test, y_pred))


############# checking the accuracy of the model after oversampling ############
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# show the shape of the resampled data
print(X_resampled.shape)
print(y_resampled.shape)
# use the resampled data to train the classifier
clf.fit(X_resampled, y_resampled)
# predict the test data
y_pred = clf.predict(X_test)
# show the accuracy score
print('Accuracy score after oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test, y_pred))


############# combining the features in one column ############
train_df = pd.concat([X_resampled, y_resampled], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
df_resampled = pd.concat([train_df, test_df], ignore_index=True)
df_sk = spark.createDataFrame(df_resampled)
df_rdd = df_sk.rdd
# df_sk = spark.createDataFrame(df)
# Define a function to convert sparse vectors to dense vectors
# def sparse_to_dense(vector):
#     return Vectors.dense(vector)

# # Register the function as a UDF
# sparse_to_dense_udf = udf(sparse_to_dense, VectorUDT())

# # Combine all feature columns into a single vector column
# assembler = VectorAssembler(inputCols=df_sk.drop('HeartDisease').columns, outputCol='Fvec')
# df_sk = assembler.transform(df_sk)
# df_sk.show(5, truncate=False)
# df_sk = df_sk.withColumn('DenseFvec', sparse_to_dense_udf(df_sk['Fvec']))
# df_sk.show(5, truncate=False)
# df_sk = df_sk.drop('Fvec')
# df_sk = df_sk.select('HeartDisease', 'DenseFvec')
# Split the data into training and testing sets

train_data, test_data = df_rdd.randomSplit([0.8, 0.2], seed=42)

# # Extract the label column
# label_col = 'HeartDisease'

# # Extract the training and testing labels
# train_labels = train_data.select(label_col)
# test_labels = test_data.select(label_col)
# train_labels.show(5)

# train_data.show(5)

Accuracy score before oversampling:  0.9125689895089042
Confusion matrix before oversampling: 
 [[58367     0]
 [ 5592     0]]
Classification report before oversampling: 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     58367
           1       0.00      0.00      0.00      5592

    accuracy                           0.91     63959
   macro avg       0.46      0.50      0.48     63959
weighted avg       0.83      0.91      0.87     63959



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(468110, 17)
(468110,)
Accuracy score after oversampling:  0.7250113353867321
Confusion matrix after oversampling: 
 [[42175 16192]
 [ 1396  4196]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.97      0.72      0.83     58367
           1       0.21      0.75      0.32      5592

    accuracy                           0.73     63959
   macro avg       0.59      0.74      0.58     63959
weighted avg       0.90      0.73      0.78     63959



In [5]:
print(train_data.count())
print(train_data.take(20))

425493
[Row(BMI=23.33, Smoking=1, AlcoholDrinking=0, Stroke=0, PhysicalHealth=0.0, MentalHealth=0.0, DiffWalking=0, Sex=1, AgeCategory=11, Race=0, Diabetic=0, PhysicalActivity=1, GenHealth=3, SleepTime=7.0, Asthma=0, KidneyDisease=0, SkinCancer=0, HeartDisease=0), Row(BMI=27.46, Smoking=1, AlcoholDrinking=0, Stroke=1, PhysicalHealth=30.0, MentalHealth=0.0, DiffWalking=0, Sex=1, AgeCategory=7, Race=0, Diabetic=0, PhysicalActivity=1, GenHealth=2, SleepTime=6.0, Asthma=0, KidneyDisease=0, SkinCancer=0, HeartDisease=1), Row(BMI=32.69, Smoking=0, AlcoholDrinking=0, Stroke=0, PhysicalHealth=2.0, MentalHealth=2.0, DiffWalking=0, Sex=1, AgeCategory=6, Race=5, Diabetic=0, PhysicalActivity=0, GenHealth=3, SleepTime=8.0, Asthma=0, KidneyDisease=0, SkinCancer=0, HeartDisease=0), Row(BMI=31.32, Smoking=0, AlcoholDrinking=0, Stroke=0, PhysicalHealth=0.0, MentalHealth=0.0, DiffWalking=0, Sex=0, AgeCategory=1, Race=0, Diabetic=0, PhysicalActivity=1, GenHealth=4, SleepTime=8.0, Asthma=0, KidneyDisease=

# Models

In [6]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LinearSVC
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## -----------------Logistic Regression-----------------

## -----------------Naive Bayes-----------------


In [7]:
def count_values(column_index, specified_value):    
    # Filter the RDD based on the specified value and calculate the count
    count = train_data.filter(lambda row: row[column_index] == specified_value).count()
    # Print the count
    print("Count of value '{}' : {}".format(specified_value, count))
  
count_values(0, 32.69)

Count of value '32.69' : 492


In [8]:
# Count the total number of records
total_count = train_data.count()

# Get the header names
headers = df_sk.columns

# MapReduce Phase 1: Calculate the prior probabilities
prior_probs = {}
for i,header in enumerate(headers):
    # counts = train_data.groupBy(header, "HeartDisease").count().rdd
    # print(f'count :\n{counts.take(5)}')
    # header_counts = counts.map(lambda x: ((header, x[0]), x[2]))  # ((header, attribute), count)
    header_counts = train_data.map(lambda x: ((header, x[i]), 1))
    # print(f'Header Counts :\n{header_counts.take(5)}')
    header_total_counts = header_counts.reduceByKey(lambda x, y: x + y)  # ((header, attribute), total count)
    # print(f'Header Total Counts :\n{header_total_counts.take(5)}')
    prior_probs[header] = header_total_counts.collectAsMap()
print(f'Prior Probabilites: {prior_probs}')

Prior Probabilites: {'BMI': {('BMI', 32.69): 492, ('BMI', 24.63): 404, ('BMI', 26.39): 1055, ('BMI', 24.86): 169, ('BMI', 28.73): 274, ('BMI', 22.85): 197, ('BMI', 26.78): 569, ('BMI', 26.54): 1365, ('BMI', 30.13): 1692, ('BMI', 22.43): 431, ('BMI', 26.63): 4710, ('BMI', 22.51): 16, ('BMI', 38.11): 60, ('BMI', 30.52): 805, ('BMI', 22.69): 51, ('BMI', 26.87): 253, ('BMI', 32.01): 457, ('BMI', 22.14): 858, ('BMI', 28.89): 2297, ('BMI', 24.37): 1600, ('BMI', 24.39): 1877, ('BMI', 26.61): 1988, ('BMI', 20.14): 21, ('BMI', 22.67): 776, ('BMI', 20.9): 276, ('BMI', 28.59): 768, ('BMI', 32.86): 65, ('BMI', 32.99): 42, ('BMI', 22.15): 956, ('BMI', 26.83): 292, ('BMI', 36.18): 204, ('BMI', 22.6): 1441, ('BMI', 24.03): 1753, ('BMI', 24.96): 1961, ('BMI', 34.17): 269, ('BMI', 18.87): 22, ('BMI', 26.5): 1376, ('BMI', 24.28): 921, ('BMI', 22.16): 79, ('BMI', 44.93): 125, ('BMI', 32.89): 767, ('BMI', 24.41): 3400, ('BMI', 22.96): 1591, ('BMI', 26.96): 793, ('BMI', 32.08): 1099, ('BMI', 36.58): 809, (

In [9]:
# MapReduce Phase 2: Calculate the conditional probabilities
cond_probs = {}
class_header = "HeartDisease"
for i,header in enumerate(headers):
    if header != "HeartDisease":
        header_counts = train_data.map(lambda x: ((x[i], x[17]), 1))  # ((header, attribute), count)
        # print(f'Header Counts :\n{header_counts.take(5)}')
        header_total_counts = header_counts.reduceByKey(lambda x, y: x + y)  # ((header, attribute), total count)
        # print(f'Header Total Counts :\n{header_total_counts.take(5)}')
        cond_probs[header] = header_total_counts.map(lambda x: ((x[0][0], x[0][1]), x[1] / prior_probs[class_header][(class_header,x[0][1])]))

In [10]:
for header, probabilities in cond_probs.items():
    print(f'Conditional Probabilities for : {header}')
    print(probabilities.collect())
    data_as_list = probabilities.collect()
    class_value = 1
    feature_value = 225
    target_pair = (feature_value, class_value)
    target_value = [value for pair, value in data_as_list if pair == target_pair]
    print(target_value[0] if len(target_value) else "0")  # Access the first element (assuming there's only one matching element)
    print()

# data_as_list = probabilities.collect()
# class_value = 1
# feature_value = 27.46
# target_pair = (feature_value, class_value)
# target_value = [value for pair, value in data_as_list if pair == target_pair]
# print(target_value[0])  # Access the first element (assuming there's only one matching element)

Conditional Probabilities for : BMI
[((27.46, 1), 0.006233080706659225), ((32.69, 0), 0.000979578567334263), ((26.39, 0), 0.0027376868257376783), ((24.86, 0), 0.00027804631823898296), ((28.73, 0), 0.0005475373651475356), ((26.78, 0), 0.0014415632191774961), ((26.54, 0), 0.003353666361528656), ((30.13, 0), 0.0032296149272374175), ((22.43, 0), 0.001013799652655984), ((26.63, 0), 0.011767775715006801), ((22.51, 0), 6.844217064344196e-05), ((38.11, 0), 0.00012832906995645366), ((30.52, 0), 0.0016597226381034674), ((22.69, 0), 0.00012405143429123855), ((26.87, 0), 0.0005689255434736112), ((32.01, 0), 0.0013089565135558274), ((22.14, 0), 0.0023526996158683175), ((28.89, 0), 0.0047652861310496465), ((25.75, 1), 0.0038389517992478577), ((24.37, 0), 0.003995311711310924), ((24.39, 0), 0.004675455782080128), ((26.61, 0), 0.005180216790575513), ((20.14, 0), 8.555271330430245e-05), ((22.67, 0), 0.002100319111620625), ((20.9, 0), 0.0007485862414126464), ((23.78, 1), 0.002070738946061684), ((28.59, 

In [21]:
# Function to calculate the posterior probability
def calculate_posterior(attributes, class_value):
    posterior_probs = []
    for i,header in enumerate(headers):
        if header != "HeartDisease":
            posterior_prob = cond_probs[header].collect()
            # print(f'posterior_prob : {posterior_prob}')
            feature_value = attributes[i]
            # print(f'feature_value : {feature_value}')
            target_pair = (feature_value, class_value)
            target_value = [value for pair, value in posterior_prob if pair == target_pair]
            # print(target_value)
            probability_value = target_value[0] if len(target_value) else 0
            posterior_probs.append((header, probability_value))
    return posterior_probs

rec = test_data.take(1)
print(rec[0])
print(f'Class 0 :\n{calculate_posterior(rec[0],0)}')
print()
print(f'Class 1 :\n{calculate_posterior(rec[0],1)}')

Row(BMI=32.73, Smoking=1, AlcoholDrinking=0, Stroke=0, PhysicalHealth=0.0, MentalHealth=5.0, DiffWalking=0, Sex=1, AgeCategory=2, Race=0, Diabetic=0, PhysicalActivity=0, GenHealth=2, SleepTime=8.0, Asthma=0, KidneyDisease=0, SkinCancer=0, HeartDisease=0)
Class 0 :
[('BMI', 8.555271330430245e-05), ('Smoking', 0.3960748415135986), ('AlcoholDrinking', 0.9295430629582417), ('Stroke', 0.9737738157365661), ('PhysicalHealth', 0.7258634407590236), ('MentalHealth', 0.04505633646171088), ('DiffWalking', 0.8825746233541797), ('Sex', 0.46419191184648423), ('AgeCategory', 0.06343305927947505), ('Race', 0.762338840076313), ('Diabetic', 0.8619307536338515), ('PhysicalActivity', 0.2118413510484485), ('GenHealth', 0.2856177333664137), ('SleepTime', 0.30511947436412945), ('Asthma', 0.8700283179481038), ('KidneyDisease', 0.9715622780976498), ('SkinCancer', 0.9153883665420449)]

Class 1 :
[('BMI', 0.00024515045457153435), ('Smoking', 0.5865094226445996), ('AlcoholDrinking', 0.9585852210787663), ('Stroke',

In [22]:
# Function to classify a record
def classify(record):
    actual_class = record[17]
    class0_probs = calculate_posterior(record,0)
    class1_probs = calculate_posterior(record,1)
    max_prob = 0
    predicted_class = None
    for heart_disease in prior_probs["HeartDisease"]:
        prob = prior_probs["HeartDisease"][heart_disease] / total_count
        posterior_probs = class0_probs if heart_disease[1]==0 else class1_probs
        for posterior_prob in posterior_probs:
            prob *= posterior_prob[1]
        if prob > max_prob:
            max_prob = prob
            predicted_class = heart_disease
    final_class = predicted_class[1] if predicted_class else 0
    return final_class, actual_class

# rec = test_data.take(1)
# print(classify(rec[0]))
test_examples_count = int(test_data.count())
rec = test_data.take(test_examples_count)
# print(rec[97])
# print(classify(rec[97]))
predictions = []
actual = []
for i in range(test_examples_count):
    print(i)
    predictions.append(classify(rec[i])[0])
    actual.append(classify(rec[i])[1])



# Print the predictions
print(predictions)


0
1
2
3
4
5
6
7
8
9


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: ignored

## -----------------SVM-----------------
