In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=cd8fe011f7040991b7699ccee212a23c9b50387e5c19f18680a9248cee39ee15
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [12]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Initialize Spark session
spark = SparkSession.builder.appName("RandomForestClassification").getOrCreate()

# Load your dataset into a DataFrame
df = spark.read.csv('/content/final_encoded_disease_csv.csv', header=True, inferSchema=True)

# Identify string columns and index them
string_columns = [col for col, dtype in df.dtypes if dtype == 'string' and col != 'HadAsthmaIndexed']
indexers = [StringIndexer(inputCol=col, outputCol=col + "Indexed") for col in string_columns]

# Apply StringIndexers to convert string columns to numerical indices
for indexer in indexers:
    df = indexer.fit(df).transform(df)

# Define features (X) and target (y)
# Include indexed string columns in the features list
assembler = VectorAssembler(
    inputCols=[col for col in df.columns if (col != 'HadAsthmaIndexed' and col not in string_columns) or col.endswith("Indexed")],
    outputCol="features"
)
df = assembler.transform(df)

# Split the data into training and testing sets
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(labelCol="HadAsthmaIndexed", featuresCol="features", numTrees=100, seed=42)

# Set up the cross-validation process
paramGrid = ParamGridBuilder().build()  # No hyperparameters to tune in this example
evaluator = MulticlassClassificationEvaluator(labelCol="HadAsthmaIndexed", metricName="accuracy")

crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

# Train the model
cvModel = crossval.fit(train_df)

# Make predictions on the test set
predictions = cvModel.transform(test_df)

# Evaluate the model
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="HadAsthmaIndexed", metricName="f1")
f1_score = evaluator_f1.evaluate(predictions)
print(f"F1 Score: {f1_score:.2f}")

# Print confusion matrix
predictions.groupBy("HadAsthmaIndexed", "prediction").count().show()

# Stop the Spark session
spark.stop()


Accuracy: 1.00
F1 Score: 1.00
+----------------+----------+-----+
|HadAsthmaIndexed|prediction|count|
+----------------+----------+-----+
|               0|       0.0|61089|
|               1|       1.0|10474|
+----------------+----------+-----+



In [7]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from xgboost import XGBClassifier
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize Spark session
spark = SparkSession.builder.appName("XGBoostClassification").getOrCreate()

# Load your dataset into a DataFrame
df = pd.read_csv('/content/final_encoded_disease_csv.csv')  # Pandas DataFrame
sdf = spark.createDataFrame(df)  # Convert Pandas DataFrame to Spark DataFrame

# Convert string columns to categorical using StringIndexer
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index").fit(sdf) for col in ['SmokerStatusVec', 'AgeCategoryVec']]
for indexer in indexers:
    sdf = indexer.transform(sdf).drop(indexer.getInputCol()).withColumnRenamed(indexer.getOutputCol(), indexer.getInputCol())

# Define features (X) and target (y)
assembler = VectorAssembler(
    inputCols=[col for col in sdf.columns if col != 'HadAsthmaIndexed'],
    outputCol="features"
)
sdf = assembler.transform(sdf).select("features", "HadAsthmaIndexed")

# Split the data into training and testing sets
train_df, test_df = sdf.randomSplit([0.7, 0.3], seed=42)

# Convert Spark DataFrame to Pandas DataFrame for XGBoost
train_pd = train_df.toPandas()
test_pd = test_df.toPandas()

# Extract features and target, and convert features to a NumPy array for XGBoost
X_train = train_pd.drop(columns=["HadAsthmaIndexed"])['features'].apply(lambda x: x.toArray()).tolist() # Convert features to list of arrays
y_train = train_pd["HadAsthmaIndexed"]

X_test = test_pd.drop(columns=["HadAsthmaIndexed"])['features'].apply(lambda x: x.toArray()).tolist() # Convert features to list of arrays
y_test = test_pd["HadAsthmaIndexed"]

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False,
                          eval_metric='logloss',
                          random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Stop the Spark session
spark.stop()

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.99      0.92     61141
           1       0.53      0.10      0.17     10431

    accuracy                           0.86     71572
   macro avg       0.70      0.54      0.55     71572
weighted avg       0.82      0.86      0.81     71572

Confusion Matrix:
[[60228   913]
 [ 9385  1046]]


In [13]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Initialize Spark session
spark = SparkSession.builder.appName("SVMClassification").getOrCreate()

# Load your dataset into a DataFrame
df = spark.read.csv('/content/final_encoded_disease_csv.csv', header=True, inferSchema=True)

# Identify string columns and index them
string_columns = [col for col, dtype in df.dtypes if dtype == 'string' and col != 'HadAsthmaIndexed']
indexers = [StringIndexer(inputCol=col, outputCol=col + "Indexed") for col in string_columns]

# Apply StringIndexers to convert string columns to numerical indices
for indexer in indexers:
    df = indexer.fit(df).transform(df)

# Define features (X) and target (y)
assembler = VectorAssembler(
    inputCols=[col for col in df.columns if (col != 'HadAsthmaIndexed' and col not in string_columns) or col.endswith("Indexed")],
    outputCol="features"
)
df = assembler.transform(df)

# Split the data into training and testing sets
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

# Initialize the LinearSVC (SVM) classifier
svm = LinearSVC(labelCol="HadAsthmaIndexed", featuresCol="features", maxIter=100, regParam=0.1)

# Set up the cross-validation process
paramGrid = ParamGridBuilder().build()  # No hyperparameters to tune in this example
evaluator = MulticlassClassificationEvaluator(labelCol="HadAsthmaIndexed", metricName="accuracy")

crossval = CrossValidator(estimator=svm,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

# Train the model
cvModel = crossval.fit(train_df)

# Make predictions on the test set
predictions = cvModel.transform(test_df)

# Evaluate the model
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="HadAsthmaIndexed", metricName="f1")
f1_score = evaluator_f1.evaluate(predictions)
print(f"F1 Score: {f1_score:.2f}")

# Print confusion matrix
predictions.groupBy("HadAsthmaIndexed", "prediction").count().show()

# Stop the Spark session
spark.stop()


Accuracy: 1.00
F1 Score: 1.00
+----------------+----------+-----+
|HadAsthmaIndexed|prediction|count|
+----------------+----------+-----+
|               0|       0.0|61089|
|               1|       1.0|10474|
+----------------+----------+-----+



In [11]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize Spark session
spark = SparkSession.builder.appName("LogisticRegressionClassification").getOrCreate()

# Load your dataset into a DataFrame
df = pd.read_csv('/content/final_encoded_disease_csv.csv')  # Pandas DataFrame
sdf = spark.createDataFrame(df)  # Convert Pandas DataFrame to Spark DataFrame

# Identify columns with non-numeric values and handle them
non_numeric_columns = [col for col, dtype in sdf.dtypes if dtype == 'string']

# Convert categorical features to numerical using StringIndexer
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in non_numeric_columns]
for indexer in indexers:
    sdf = indexer.fit(sdf).transform(sdf)
    sdf = sdf.drop(indexer.getInputCol()).withColumnRenamed(indexer.getOutputCol(), indexer.getInputCol())

# Fill any missing values with 0 (or use another appropriate strategy)
sdf = sdf.na.fill(0)

# Assemble feature columns into a single vector
assembler = VectorAssembler(
    inputCols=[col for col in sdf.columns if col != 'HadAsthmaIndexed'],
    outputCol="features"
)
sdf = assembler.transform(sdf).select("features", "HadAsthmaIndexed")

# Split the data into training and testing sets
train_df, test_df = sdf.randomSplit([0.7, 0.3], seed=42)

# Initialize the Logistic Regression model
log_reg = LogisticRegression(featuresCol='features', labelCol='HadAsthmaIndexed', maxIter=1000)

# Train the model
log_reg_model = log_reg.fit(train_df)

# Make predictions on the test set
predictions = log_reg_model.transform(test_df)

# Evaluate the model using accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="HadAsthmaIndexed", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")

# Evaluate the model using F1 score
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="HadAsthmaIndexed", predictionCol="prediction", metricName="f1")
f1_score = evaluator_f1.evaluate(predictions)
print(f"F1 Score: {f1_score:.2f}")

# Print confusion matrix
predictions.groupBy('HadAsthmaIndexed', 'prediction').count().show()

# Stop the Spark session
spark.stop()


Accuracy: 0.86
F1 Score: 0.81
+----------------+----------+-----+
|HadAsthmaIndexed|prediction|count|
+----------------+----------+-----+
|               1|       0.0| 9700|
|               0|       1.0|  558|
|               0|       0.0|60583|
|               1|       1.0|  731|
+----------------+----------+-----+

