In [2]:
import os
import seaborn as sns
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName('pyspark_teoria')
    .getOrCreate()
)
spark

### Spark es perezoso: para que algo se ejecute hay que invocar alguna acción
* take(n)
* first()
* head(n)
* count()
* show(n)
* write.format().save()


In [15]:
# Build the Spark DataFrame from seaborn's bundled 'tips' dataset
df = spark.createDataFrame(
    sns.load_dataset('tips')
)
df.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [16]:
# Convert the first 10 rows into a pandas DataFrame
(
    df
    .limit(10)
    .toPandas()
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [17]:
# List the column names of the DataFrame
df.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [18]:
# Project only a subset of the columns and preview the first 5 rows
df.select(['total_bill', 'tip', 'sex']).show(n=5)

+----------+----+------+
|total_bill| tip|   sex|
+----------+----+------+
|     16.99|1.01|Female|
|     10.34|1.66|  Male|
|     21.01| 3.5|  Male|
|     23.68|3.31|  Male|
|     24.59|3.61|Female|
+----------+----+------+
only showing top 5 rows



In [19]:
# Summary statistics of the DataFrame (count, mean, stddev, min, max),
# converted to pandas for a nicer table rendering
(
    df
    .describe()
    .toPandas()
)

Unnamed: 0,summary,total_bill,tip,sex,smoker,day,time,size
0,count,244.0,244.0,244,244,244,244,244.0
1,mean,19.785942622950813,2.9982786885245907,,,,,2.569672131147541
2,stddev,8.902411954856856,1.383638189001182,,,,,0.9510998047322344
3,min,3.07,1.0,Female,No,Fri,Dinner,1.0
4,max,50.81,10.0,Male,Yes,Thur,Lunch,6.0


In [20]:
# Column names paired with their Spark data types
df.dtypes

[('total_bill', 'double'),
 ('tip', 'double'),
 ('sex', 'string'),
 ('smoker', 'string'),
 ('day', 'string'),
 ('time', 'string'),
 ('size', 'bigint')]

In [21]:
# Type conversion (the pandas counterpart is astype()).
# The original df is left untouched; withColumn returns a new DataFrame.
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType

# NOTE(review): casting 'tip' to an integer type discards its decimals —
# fine for demonstrating cast(), but confirm it is not used for analysis.
df_cast = (
    df
    .withColumn('total_bill', col('total_bill').cast(FloatType()))
    .withColumn('tip', col('tip').cast(IntegerType()))
)

df_cast.printSchema()

root
 |-- total_bill: float (nullable = true)
 |-- tip: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [22]:
# Aggregations (count, min, max, mean) restricted to the numeric columns
(
    df
    .select('total_bill', 'tip', 'size')
    .summary('count', 'min', 'max', 'mean')
    .show()
)

+-------+------------------+------------------+-----------------+
|summary|        total_bill|               tip|             size|
+-------+------------------+------------------+-----------------+
|  count|               244|               244|              244|
|    min|              3.07|               1.0|                1|
|    max|             50.81|              10.0|                6|
|   mean|19.785942622950813|2.9982786885245907|2.569672131147541|
+-------+------------------+------------------+-----------------+



In [23]:
# Closest counterpart to pandas' describe(): full summary including quartiles
(
    df
    .summary()
    .show()
)

+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950813|2.9982786885245907|  NULL|  NULL|NULL|  NULL| 2.569672131147541|
| stddev| 8.902411954856856| 1.383638189001182|  NULL|  NULL|NULL|  NULL|0.9510998047322344|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    25%|             13.28|               2.0|  NULL|  NULL|NULL|  NULL|                 2|
|    50%|             17.78|              2.88|  NULL|  NULL|NULL|  NULL|                 2|
|    75%|             24.08|              3.55|  NULL|  NULL|NULL|  NULL|                 3|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lun

In [24]:
# Keep only the rows whose total_bill is above 40
df.where(df['total_bill'] > 40).show()

+----------+----+------+------+----+------+----+
|total_bill| tip|   sex|smoker| day|  time|size|
+----------+----+------+------+----+------+----+
|     48.27|6.73|  Male|    No| Sat|Dinner|   4|
|     40.17|4.73|  Male|   Yes| Fri|Dinner|   4|
|      44.3| 2.5|Female|   Yes| Sat|Dinner|   3|
|     41.19| 5.0|  Male|    No|Thur| Lunch|   5|
|     48.17| 5.0|  Male|    No| Sun|Dinner|   6|
|     50.81|10.0|  Male|   Yes| Sat|Dinner|   3|
|     45.35| 3.5|  Male|   Yes| Sun|Dinner|   3|
|     40.55| 3.0|  Male|   Yes| Sun|Dinner|   2|
|     43.11| 5.0|Female|   Yes|Thur| Lunch|   4|
|     48.33| 9.0|  Male|    No| Sat|Dinner|   4|
+----------+----+------+------+----+------+----+



In [25]:
# Combine two conditions with & (each comparison needs its own parentheses)
df.where(
    (df['total_bill'] > 40)
    & (df['tip'] > 3)
).show()

+----------+----+------+------+----+------+----+
|total_bill| tip|   sex|smoker| day|  time|size|
+----------+----+------+------+----+------+----+
|     48.27|6.73|  Male|    No| Sat|Dinner|   4|
|     40.17|4.73|  Male|   Yes| Fri|Dinner|   4|
|     41.19| 5.0|  Male|    No|Thur| Lunch|   5|
|     48.17| 5.0|  Male|    No| Sun|Dinner|   6|
|     50.81|10.0|  Male|   Yes| Sat|Dinner|   3|
|     45.35| 3.5|  Male|   Yes| Sun|Dinner|   3|
|     43.11| 5.0|Female|   Yes|Thur| Lunch|   4|
|     48.33| 9.0|  Male|    No| Sat|Dinner|   4|
+----------+----+------+------+----+------+----+



In [26]:
# Drop every row that contains at least one null value
df_no_nulos = df.na.drop()

In [27]:
# Drop only the rows that have a null in the listed columns
df_no_nulos = df.na.drop(subset=['tip'])

In [None]:
# Fill nulls with a per-column replacement value
df_imputed = df.na.fill(
    {'total_bill': 0, 'smoker': 'desconocido'}
)

df_imputed.show(n=4)

In [29]:
# Load the CSV with pandas first, then hand the result over to PySpark
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/tips.csv'
df_pandas = pd.read_csv(filepath_or_buffer=url)

# Turn the pandas DataFrame into a Spark DataFrame
df_spark = spark.createDataFrame(data=df_pandas)
df_spark.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [31]:
# Load the CSV straight into PySpark (preferred over going through pandas)
import requests

url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/tips.csv'
csv_path = 'tips.csv'

# Download the file to local disk so Spark can read it from a path.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# prevents an HTTP error page from being saved as if it were the dataset.
response = requests.get(url, timeout=30)
response.raise_for_status()

with open(csv_path, 'wb') as file:  # 'wb' = write, binary mode
    file.write(response.content)

# Keep the DataFrame returned by the reader. Previously the result was
# discarded, so the cell displayed a stale df_spark left over from an
# earlier cell instead of the file that was just read.
df_spark = spark.read.csv(csv_path, header=True, inferSchema=True)
df_spark.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [None]:
# Load the CSV straight into PySpark with an explicit schema (preferred)

import requests
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType

url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/tips.csv'
csv_path = 'tips.csv'

# Fail loudly on a bad download instead of writing an error page to disk;
# the timeout keeps the cell from hanging indefinitely.
response = requests.get(url, timeout=30)
response.raise_for_status()

with open(csv_path, 'wb') as file:  # 'wb' = write, binary mode
    file.write(response.content)

# Columns of the dataset and the type each one is parsed as
schema = StructType([
    StructField('total_bill', FloatType(), True),
    StructField('tip', FloatType(), True),
    StructField('sex', StringType(), True),
    StructField('smoker', StringType(), True),
    StructField('day', StringType(), True),
    StructField('time', StringType(), True),
    StructField('size', IntegerType(), True)
])

# The schema is supplied explicitly, so there is nothing left to infer;
# inferSchema=False only makes that intent visible to the reader.
df_spark = spark.read.csv(csv_path, header=True, inferSchema=False, schema=schema)
df_spark.show(5)
df_spark.printSchema()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows

root
 |-- total_bill: float (nullable = true)
 |-- tip: float (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [None]:
# Save the data as CSV. Spark writes a directory with several part files
# (so the data can be spread across nodes) and later reads that directory
# back as if it were a single file.
(
    df.write
    .mode('overwrite')
    .option('header', True)
    .csv('tips_clean.csv')
)

In [39]:
# Save as a single part file by collapsing to one partition first
# (not recommended)
(
    df.coalesce(1)
    .write
    .mode('overwrite')
    .option('header', True)
    .csv('tips_clean.csv')
)

In [None]:
# Read the saved CSV output back, letting Spark infer the column types
df_tips_clean = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('tips_clean.csv')
)
df_tips_clean.show(n=3)

+----------+----+----+------+---+------+----+
|total_bill| tip| sex|smoker|day|  time|size|
+----------+----+----+------+---+------+----+
|     30.06| 2.0|Male|   Yes|Sat|Dinner|   3|
|     25.89|5.16|Male|   Yes|Sat|Dinner|   4|
|     48.33| 9.0|Male|    No|Sat|Dinner|   4|
+----------+----+----+------+---+------+----+
only showing top 3 rows

