# 🎓 **Maestría en Inteligencia Artificial Aplicada**

## 📈 **Curso: Análisis de grandes volúmenes de datos (Gpo 10)**

### 🏛️ Tecnológico de Monterrey

#### 👨‍🏫 **Profesor titular:** Dr. Iván Olmos Pineda
#### 👩‍🏫 **Profesor asistente:** Verónica Sandra Guzmán de Valle

### 📊 **Actividad 4 | Métricas de calidad de resultados**

#### 📅 **6 de junio de 2025**

🧑‍💻 **A01016093:** Oscar Enrique García García 

# 1. Importar librerías

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import matplotlib.pyplot as plt
import pandas as pd

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# Create (or reuse) the Spark session used by the rest of the notebook.
session_builder = SparkSession.builder.appName("Actividad4_MetricasCalidad")
spark = session_builder.getOrCreate()

# Load the source dataset from its Parquet location.
df = spark.read.parquet("/ruta/a/tu/dataset/parquet")

# Strata are the cross product of these weather conditions and severities.
condiciones_climaticas = ["Fair", "Mostly Cloudy", "Cloudy", "Partly Cloudy", "Clear", "Light Rain", "Overcast"]
severidades = [2, 3]

# One DataFrame per (weather condition, severity) pair, capped at 1000 rows.
particiones = [
    df.filter(
        (col("Weather_Condition") == clima) & (col("Severity") == severidad)
    ).limit(1000)
    for clima in condiciones_climaticas
    for severidad in severidades
]

# Fold every stratum into a single sample by successive unions.
muestra_M, *estratos_restantes = particiones
for estrato in estratos_restantes:
    muestra_M = muestra_M.union(estrato)

# Mark the sample for caching, then persist it as Parquet (replacing any
# previous output at that path).
muestra_M.cache()
escritor = muestra_M.write.mode("overwrite")
escritor.parquet("/ruta/salida/muestra_M")

# Bring only the two stratification columns to the driver for plotting.
muestra_pd = muestra_M.select("Severity", "Weather_Condition").toPandas()

# Row count per (weather condition, severity); severities become columns.
# fill_value=0 makes a missing combination an explicit 0 instead of NaN.
conteos = (
    muestra_pd.groupby(["Weather_Condition", "Severity"])
    .size()
    .unstack(fill_value=0)
)

# DataFrame.plot opens its own figure unless it is given an Axes, which
# previously left an empty 10x5 figure behind and drew the chart at the
# default size. Create the axes explicitly and hand them to pandas.
fig, ax = plt.subplots(figsize=(10, 5))
conteos.plot(kind='bar', stacked=True, ax=ax, rot=45)
ax.set_title("Distribución de la muestra por condición climática y severidad")
ax.set_ylabel("Número de registros")
fig.tight_layout()
plt.show()

# Stratified 70/30 split.
#
# Each stratum is taken from `muestra_M` (the cached sample that was written
# to disk) rather than from the lazy `particiones` DataFrames. Those are
# uncached `limit(1000)` queries whose lineage is re-evaluated for every split
# returned by randomSplit, so the train and test sides were not guaranteed to
# be disjoint nor to match the persisted sample.
train_test_particiones = []
for clima in condiciones_climaticas:
    for severidad in severidades:
        estrato = muestra_M.filter(
            (col("Weather_Condition") == clima) & (col("Severity") == severidad)
        )
        train, test = estrato.randomSplit([0.7, 0.3], seed=42)
        train_test_particiones.append((train, test))

# Concatenate the per-stratum pieces into the final train and test sets.
train_df, test_df = train_test_particiones[0]
for train, test in train_test_particiones[1:]:
    train_df = train_df.union(train)
    test_df = test_df.union(test)

# Columns assembled into the feature vector.
features_cols = ["Temperature", "Humidity", "Visibility", "Wind_Speed", "Precipitation"]

# NOTE(review): VectorAssembler is left at its default invalid-value handling;
# confirm the feature columns contain no nulls, or set handleInvalid.
assembler = VectorAssembler(inputCols=features_cols, outputCol="raw_features")
scaler = StandardScaler(inputCol="raw_features", outputCol="features")

# Map the Severity values to the index column the classifier uses as label.
label_indexer = StringIndexer(inputCol="Severity", outputCol="label")

# The train/test split is seeded with 42; pin the forest's seed too so the
# reported metrics are reproducible across runs instead of depending on the
# estimator's default seed.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=20,
    maxDepth=6,
    seed=42,
)

# Fit all stages on the training set, then score the held-out test set.
pipeline = Pipeline(stages=[assembler, scaler, label_indexer, rf])
model = pipeline.fit(train_df)
predictions = model.transform(test_df)

# All three evaluators read the same label/prediction columns and differ only
# in the metric they compute.
columnas_evaluacion = {"labelCol": "label", "predictionCol": "prediction"}
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy", **columnas_evaluacion)
precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision", **columnas_evaluacion)
recall_evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall", **columnas_evaluacion)

# Score the test-set predictions with each evaluator, in order.
accuracy, precision, recall = (
    evaluador.evaluate(predictions)
    for evaluador in (accuracy_evaluator, precision_evaluator, recall_evaluator)
)

# Report each metric with four decimals.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# Tabulate the three metrics, one row per metric.
metricas_df = pd.DataFrame(
    [('Accuracy', accuracy), ('Precision', precision), ('Recall', recall)],
    columns=['Métrica', 'Valor'],
)

# Bar chart of the metric values on a fixed 0-1 vertical scale.
figura, eje = plt.subplots(figsize=(6,4))
eje.bar(metricas_df['Métrica'], metricas_df['Valor'], color='skyblue')
eje.set_title('Métricas de Evaluación del Modelo')
eje.set_ylim(0, 1)
eje.set_ylabel('Puntaje')
figura.tight_layout()
plt.show()