In [1]:
# Installing Pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=1a0b0a2c413a3110f5614e92031188e1f550cd602a5123e6ce9abc0bdf62bbdb
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC, GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("AsthmaClassification").getOrCreate()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load your CSV file into a DataFrame
file_path = r'/content/drive/MyDrive/Dataset/encoded_dataset(FINAL).csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display the schema to verify the data types
print(df.printSchema())
# Display column list of dataframe
print((df.columns))

root
 |-- SleepHours: double (nullable = true)
 |-- WeightInKilograms: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- HeightInCentimeters: double (nullable = true)
 |-- SexIndexed: integer (nullable = true)
 |-- PhysicalActivitiesIndexed: integer (nullable = true)
 |-- HadHeartAttackIndexed: integer (nullable = true)
 |-- HadAnginaIndexed: integer (nullable = true)
 |-- HadStrokeIndexed: integer (nullable = true)
 |-- HadAsthmaIndexed: integer (nullable = true)
 |-- HadSkinCancerIndexed: integer (nullable = true)
 |-- HadCOPDIndexed: integer (nullable = true)
 |-- HadDepressiveDisorderIndexed: integer (nullable = true)
 |-- HadKidneyDiseaseIndexed: integer (nullable = true)
 |-- HadArthritisIndexed: integer (nullable = true)
 |-- HadDiabetesIndexed: integer (nullable = true)
 |-- DeafOrHardOfHearingIndexed: integer (nullable = true)
 |-- BlindOrVisionDifficultyIndexed: integer (nullable = true)
 |-- DifficultyConcentratingIndexed: integer (nullable = true)
 |-- Diffic

In [5]:
# Print number of columns
len(df.columns)

32

In [6]:
# Define List of columns to remove
columns_to_exclude = ['HadDiabetesIndexed', 'HadHeartAttackIndexed',
                       'HadAsthmaIndexed', 'HadArthritisIndexed', 'StateFrequency']

# Determining the columns to keep
feature_columns = [col for col in df.columns if col not in columns_to_exclude]

# Display feature columns
print("List of feature column: ",feature_columns)
# Display number of feature columns
print("Length of Feature Column: ",len(feature_columns))

# Assemble the features into a single vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

List of feature column:  ['SleepHours', 'WeightInKilograms', 'BMI', 'HeightInCentimeters', 'SexIndexed', 'PhysicalActivitiesIndexed', 'HadAnginaIndexed', 'HadStrokeIndexed', 'HadSkinCancerIndexed', 'HadCOPDIndexed', 'HadDepressiveDisorderIndexed', 'HadKidneyDiseaseIndexed', 'DeafOrHardOfHearingIndexed', 'BlindOrVisionDifficultyIndexed', 'DifficultyConcentratingIndexed', 'DifficultyWalkingIndexed', 'DifficultyDressingBathingIndexed', 'DifficultyErrandsIndexed', 'ChestScanIndexed', 'AlcoholDrinkersIndexed', 'HIVTestingIndexed', 'FluVaxLast12Indexed', 'PneumoVaxEverIndexed', 'GeneralHealthIndex', 'LastCheckupTimeIndex', 'SmokerStatusIndex', 'AgeCategoryIndex']
Length of Feature Column:  27


In [7]:
# Display last column to verify if feature column was created
df.columns[-1]

'features'

In [8]:
# Stratified Sampling using Pyspark sampleBy()

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, LinearSVC

# Define the seed for reproducibility
seed1 = 32

# Define the fractions for stratified sampling (e.g., 80% for training and 20% for testing)
fractions = {"HadAsthmaIndexed": [0.8, 0.2]}

# Calculate the fractions for each class
class_counts = df.groupBy("HadAsthmaIndexed").count()
total_count = df.count()

# Define fractions for each class
fractions = {}
for row in class_counts.collect():
    class_value = row["HadAsthmaIndexed"]
    fractions[class_value] = 0.8  # 80% for training

# Perform stratified sampling for training data
train_data = df.sampleBy("HadAsthmaIndexed", fractions, seed=seed1)

# Define fractions for test data (remaining 20%)
test_fractions = {k: 1.0 - v for k, v in fractions.items()}

# Perform stratified sampling for test data
test_data = df.sampleBy("HadAsthmaIndexed", test_fractions, seed=seed1)

# Initialize the models
lr = LogisticRegression(featuresCol="features", labelCol="HadAsthmaIndexed")
rf = RandomForestClassifier(featuresCol="features", labelCol="HadAsthmaIndexed")
lsvc = LinearSVC(featuresCol="features", labelCol="HadAsthmaIndexed")
gbt = GBTClassifier(featuresCol="features", labelCol="HadAsthmaIndexed", maxIter=10)

# Now `train_data` and `test_data` are stratified splits


In [9]:
# Printing No of Rows in Training & Test Data
print("No. of rows in Training data: ",train_data.count())
print("No. of rows in Test data: ",test_data.count())

No. of rows in Training data:  190754
No. of rows in Test data:  47693


In [10]:
# Train the models
lr_model = lr.fit(train_data)
rf_model = rf.fit(train_data)
lsvc_model = lsvc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [11]:
# Make prediction
lr_predictions = lr_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
lsvc_predictions = lsvc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)


In [12]:
# Initialize evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="HadAsthmaIndexed", predictionCol="prediction", metricName="accuracy")

# Evaluate accuracy for each model
lr_accuracy = evaluator.evaluate(lr_predictions)
rf_accuracy = evaluator.evaluate(rf_predictions)
lsvc_accuracy = evaluator.evaluate(lsvc_predictions)
gbt_accuracy = evaluator.evaluate(gbt_predictions)

In [13]:
# Print accuracy for each model
print(f"Logistic Regression Accuracy: {lr_accuracy:.6f}")
print(f"Random Forest Accuracy: {rf_accuracy:.6f}")
print(f"Linear SVM Accuracy: {lsvc_accuracy:.6f}")
print(f"Gradient-Boosted Trees Accuracy: {gbt_accuracy:.6f}")


Logistic Regression Accuracy: 0.854654
Random Forest Accuracy: 0.850565
Linear SVM Accuracy: 0.850272
Gradient-Boosted Trees Accuracy: 0.855388


In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import format_number

# Initialize evaluators for precision, recall, and F1 score
evaluator_precision = MulticlassClassificationEvaluator(labelCol="HadAsthmaIndexed", predictionCol="prediction", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="HadAsthmaIndexed", predictionCol="prediction", metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="HadAsthmaIndexed", predictionCol="prediction", metricName="f1")

# Evaluate precision, recall, and F1 score for each model and collect the metrics in a list of tuples
metrics_list = [
    ("Logistic Regression",
     evaluator_precision.evaluate(lr_predictions),
     evaluator_recall.evaluate(lr_predictions),
     evaluator_f1.evaluate(lr_predictions)),

    ("Random Forest",
     evaluator_precision.evaluate(rf_predictions),
     evaluator_recall.evaluate(rf_predictions),
     evaluator_f1.evaluate(rf_predictions)),

    ("Linear SVM",
     evaluator_precision.evaluate(lsvc_predictions),
     evaluator_recall.evaluate(lsvc_predictions),
     evaluator_f1.evaluate(lsvc_predictions)),

    ("Gradient-Boosted Trees",
     evaluator_precision.evaluate(gbt_predictions),
     evaluator_recall.evaluate(gbt_predictions),
     evaluator_f1.evaluate(gbt_predictions))
]

# Convert the list of tuples to a DataFrame
metrics_df = spark.createDataFrame(metrics_list, ["Model", "Precision", "Recall", "F1 Score"])

# Format the numerical columns to 6 decimal places
formatted_df = metrics_df.withColumn("Precision", format_number("Precision", 6)) \
                         .withColumn("Recall", format_number("Recall", 6)) \
                         .withColumn("F1 Score", format_number("F1 Score", 6))

# Show the metrics DataFrame
formatted_df.show(truncate=False)

+----------------------+---------+--------+--------+
|Model                 |Precision|Recall  |F1 Score|
+----------------------+---------+--------+--------+
|Logistic Regression   |0.821971 |0.854654|0.804606|
|Random Forest         |0.850481 |0.850565|0.782296|
|Linear SVM            |0.722962 |0.850272|0.781465|
|Gradient-Boosted Trees|0.826004 |0.855388|0.804495|
+----------------------+---------+--------+--------+



In [18]:
# Save the trained models to the file system
lr_model.write().overwrite().save("lr_model_asthma")

In [19]:
%pwd

'/content'

In [20]:
  %ls

[0m[01;34mdrive[0m/  [01;34mlr_model_asthma[0m/  [01;34msample_data[0m/


In [22]:
%cp -r lr_model_asthma/ sample_data/ /content/drive/MyDrive/Colab_Saved_ML_Models/Asthma_Model