In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from sklearn.datasets import load_breast_cancer
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LinearSVC, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import matplotlib.pyplot as plt

# Start (or reuse) the Spark session used throughout the notebook
spark = (
    SparkSession.builder
    .appName("BreastCancerClassification")
    .getOrCreate()
)

# Load the scikit-learn breast cancer dataset into pandas (features plus a
# trailing 'label' column) and hand the frame over to Spark
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(label=data.target)
spark_df = spark.createDataFrame(df)

# 80/20 train/test split with a fixed seed
train_data, test_data = spark_df.randomSplit([0.8, 0.2], seed=42)

# Pack all feature columns into a single "features" vector column and keep
# only the two columns the classifiers read
assembler = VectorAssembler(inputCols=data.feature_names, outputCol="features")
train_data, test_data = (
    assembler.transform(split).select(["features", "label"])
    for split in (train_data, test_data)
)

# Every candidate classifier is wired to the same feature/label columns;
# only the estimator class differs.
column_args = {"featuresCol": "features", "labelCol": "label"}
models = [
    (display_name, estimator_cls(**column_args))
    for display_name, estimator_cls in (
        ("Logistic Regression", LogisticRegression),
        ("Decision Tree", DecisionTreeClassifier),
        ("Random Forest", RandomForestClassifier),
        ("GBT", GBTClassifier),
        ("SVM", LinearSVC),
        ("Naive Bayes", NaiveBayes),
    )
]

# Accuracy evaluator shared by all candidates
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Running winner. A candidate must score strictly above 0.0 to be selected.
# NOTE(review): the winner is picked using accuracy on test_data, the same
# split that is reported, so the printed best accuracy is an optimistic estimate.
best_model_name, best_acc = "", 0.0

for name, model in models:
    # Fit on the training split, then score the held-out split
    model_fit = model.fit(train_data)
    predictions = model_fit.transform(test_data)
    accuracy = evaluator.evaluate(predictions)
    print(f"Accuracy for {name} = {accuracy}")

    # Strict comparison: on a tie the earlier model in `models` is kept
    if not accuracy > best_acc:
        continue
    best_model_name, best_acc = name, accuracy

print(f"\nThe best model is: {best_model_name} with Accuracy = {best_acc:.2f}")

In [None]:
# Preview the first rows of the Spark DataFrame
# (20 rows with long cell values truncated: show()'s defaults, spelled out)
spark_df.show(n=20, truncate=True)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Assume the best model is the Random Forest.
# NOTE(review): this is hard-coded; it does not read `best_model_name`, and the
# forest is re-fitted here on train_data.
rf_classifier = RandomForestClassifier(featuresCol='features', labelCol='label')
best_model = rf_classifier.fit(train_data)

# Score the test split and pull only the columns needed for the plots into pandas
scored_test = best_model.transform(test_data)
predictions = scored_test.select("label", "prediction", "probability").toPandas()

# 1. Confusion matrix
# Pin the label order so the matrix is always 2x2 and lines up with the tick
# labels below, even if one class is missing from the test split or from the
# predictions.
class_labels = [0, 1]
cm = confusion_matrix(predictions['label'], predictions['prediction'], labels=class_labels)

plt.figure(figsize=(6,6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
plt.title('Matriz de Confusión', size=15)
plt.colorbar()
tick_marks = np.arange(len(class_labels))
tick_text = [str(label) for label in class_labels]
plt.xticks(tick_marks, tick_text, rotation=45, size=10)
plt.yticks(tick_marks, tick_text, size=10)
plt.ylabel('Etiqueta Real', size=15)
plt.xlabel('Etiqueta Predicha', size=15)

# Write each count in the centre of its cell. imshow draws rows along the
# y axis and columns along the x axis, hence xy=(col, row).
n_rows, n_cols = cm.shape
for row in range(n_rows):
    for col in range(n_cols):
        plt.annotate(str(cm[row, col]), xy=(col, row),
                     horizontalalignment='center',
                     verticalalignment='center')

# Run the layout pass last so the axis labels set above are taken into account
plt.tight_layout()
plt.show()

# 2. ROC curve
# Element 1 of each row's probability vector is used as the score passed to roc_curve
prob = [vector[1] for vector in predictions["probability"]]
fpr, tpr, _ = roc_curve(predictions["label"], prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")
plt.show()

# Feature importances of the fitted random forest held in `best_model`
importances = best_model.featureImportances

# Densify with the vector's own toArray(): it works whether Spark returns a
# sparse or a dense vector, and fills missing sparse entries with 0.0, so there
# is no need to rebuild the array by hand from .indices / .values.
importances_array = importances.toArray()

# Order features from most to least important and look up their names
sorted_indices = np.argsort(importances_array)[::-1]
names = [data.feature_names[i] for i in sorted_indices]

# Bar chart of the importances, one bar per feature, in descending order
plt.figure(figsize=(15,5))
plt.title("Importancia de Características", size=15)
plt.bar(range(len(importances_array)), importances_array[sorted_indices])
plt.xticks(range(len(importances_array)), names, rotation=90)
plt.show()