In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=7e3c3bf85ff48a8215cd493a2a403fafea465a8282ceb5ac89184a0d8cfbb5a2
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Crear una sesión de Spark

In [3]:
# Build the notebook's Spark session; getOrCreate() hands back an already
# active session when there is one instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Sample PySpark Script")
    .getOrCreate()
)

# Generar Datos Masivamente

In [4]:
import pandas as pd
import numpy as np

# Number of rows to generate
num_rows = 100_000

# Fixed seed: every "Restart Kernel -> Run All" now produces the same synthetic
# data set, so the outputs shown further down are reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

# Generate random data. Dict values are evaluated top to bottom, so the order
# in which the generator is consumed (name, age, gender, salary) is fixed.
data = {
    "id": np.arange(1, num_rows + 1),  # sequential ids 1..num_rows
    "name": rng.choice(["Alice", "Bob", "Charlie", "David", "Eve"], size=num_rows),
    "age": rng.integers(18, 65, size=num_rows),  # upper bound is exclusive -> 18..64
    "gender": rng.choice(["Male", "Female"], size=num_rows),
    "salary": rng.normal(50000, 15000, size=num_rows),  # mean 50000, std dev 15000
}

# Create a pandas DataFrame from the generated columns
df = pd.DataFrame(data)

# Convertir los datos a un DataFrame de Spark

In [5]:
# Convert the pandas frame into a Spark DataFrame.
# This cell rebinds `df`, so on a second execution `df` is already a Spark
# DataFrame and createDataFrame() would reject it. The isinstance guard makes
# the cell safe to re-run: the conversion only happens while `df` is still the
# pandas frame.
if isinstance(df, pd.DataFrame):
    df = spark.createDataFrame(df)

# Print the column names and types of the Spark DataFrame.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)



In [6]:
# Preview the first 20 rows as a text table (these are show()'s defaults,
# written out explicitly).
df.show(n=20, truncate=True)

+---+-------+---+------+------------------+
| id|   name|age|gender|            salary|
+---+-------+---+------+------------------+
|  1|    Bob| 64|  Male| 30178.10805706068|
|  2|  Alice| 47|  Male| 48502.12824799271|
|  3|Charlie| 38|Female|42605.016657075095|
|  4|Charlie| 24|Female|105557.56287640336|
|  5|  Alice| 44|  Male| 64113.61095802544|
|  6|    Bob| 53|Female|26942.160836412342|
|  7|  Alice| 58|Female| 48940.57802635619|
|  8|  David| 43|Female|44175.398923480374|
|  9|Charlie| 43|Female|31980.190475333915|
| 10|    Bob| 19|  Male| 43238.67845574033|
| 11|  David| 19|  Male|52082.302125653776|
| 12|    Bob| 28|  Male|48806.972350299344|
| 13|    Bob| 29|  Male| 51923.95874002501|
| 14|Charlie| 34|Female| 52403.43136456106|
| 15|    Eve| 51|Female|51598.742398525465|
| 16|  David| 35|Female| 72623.84284936468|
| 17|    Bob| 55|  Male| 34082.21321172005|
| 18|  Alice| 18|Female|  16282.6680582509|
| 19|  Alice| 34|Female| 59544.48600455537|
| 20|    Eve| 64|  Male| 35720.0

# Aplicar una transformación y mostrar resultados

In [7]:
# Example: compute the average age for each gender.
# Rows are grouped by the "gender" column and the mean of "age" is exposed
# under the column name "avg_age".
average_age_per_gender = (
    df.groupBy("gender")
    .agg(avg(col("age")).alias("avg_age"))
)
average_age_per_gender.show()

+------+------------------+
|gender|           avg_age|
+------+------------------+
|Female|40.991816759597256|
|  Male|41.124925212396796|
+------+------------------+



# Detener sesión de Spark

In [8]:
# Stop the Spark session created earlier in this notebook.
spark.stop()