### ANÁLISE EXPLORATÓRIA DE DADOS

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [0]:
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

# Get the existing SparkContext
sc = SparkContext.getOrCreate()

# Update the configuration if needed
conf = sc.getConf()
conf.set("spark.executor.memory", "3g")

<pyspark.conf.SparkConf at 0x7f75a4179a20>

In [0]:
# Carregar dados de um arquivo CSV
df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("dbfs:/mnt/temp/sandbox/luciana_santos/Obesity.csv")

In [0]:
%time
display(df)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.11 µs


Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II
Male,29.0,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight
Female,23.0,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal_Weight
Male,22.0,1.64,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
Male,24.0,1.78,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal_Weight
Male,22.0,1.72,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight


In [0]:
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- family_history: string (nullable = true)
 |-- FAVC: string (nullable = true)
 |-- FCVC: double (nullable = true)
 |-- NCP: double (nullable = true)
 |-- CAEC: string (nullable = true)
 |-- SMOKE: string (nullable = true)
 |-- CH2O: double (nullable = true)
 |-- SCC: string (nullable = true)
 |-- FAF: double (nullable = true)
 |-- TUE: double (nullable = true)
 |-- CALC: string (nullable = true)
 |-- MTRANS: string (nullable = true)
 |-- Obesity: string (nullable = true)



In [0]:

# Exibir o número de linhas e colunas do DataFrame
rows, columns = df.count(), len(df.columns)
print(f"Linhas: {rows}\nColunas: {columns}")

Linhas: 2111
Colunas: 17


In [0]:

print(df.rdd.getNumPartitions())

1


In [0]:

df.rdd.glom().map(lambda x: x[0]).collect()

[Row(Gender='Female', Age=21.0, Height=1.62, Weight=64.0, family_history='yes', FAVC='no', FCVC=2.0, NCP=3.0, CAEC='Sometimes', SMOKE='no', CH2O=2.0, SCC='no', FAF=0.0, TUE=1.0, CALC='no', MTRANS='Public_Transportation', Obesity='Normal_Weight')]

In [0]:
df_repartitioned = df.repartition(10)  # Divide em 10 partitions

In [0]:

print(df_repartitioned.rdd.getNumPartitions())

10


In [0]:
%time
df_repartitioned.show()

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.34 µs
+------+---------+--------+----------+--------------+----+--------+--------+----------+-----+--------+---+--------+--------+----------+--------------------+-------------------+
|Gender|      Age|  Height|    Weight|family_history|FAVC|    FCVC|     NCP|      CAEC|SMOKE|    CH2O|SCC|     FAF|     TUE|      CALC|              MTRANS|            Obesity|
+------+---------+--------+----------+--------------+----+--------+--------+----------+-----+--------+---+--------+--------+----------+--------------------+-------------------+
|Female|16.928791|1.710948| 45.248627|            no| yes|2.910733| 3.12544| Sometimes|   no|2.204263| no|2.407906|1.403037| Sometimes|Public_Transporta...|Insufficient_Weight|
|Female|17.451085|     1.6|      65.0|           yes| yes|     3.0|2.449723| Sometimes|   no|     2.0|yes|0.479592|1.720642| Sometimes|Public_Transporta...| Overweight_Level_I|
|  Male|     27.0|    1.64|      78.0|           ye

In [0]:
df_repartitioned.unpersist()

DataFrame[Gender: string, Age: double, Height: double, Weight: double, family_history: string, FAVC: string, FCVC: double, NCP: double, CAEC: string, SMOKE: string, CH2O: double, SCC: string, FAF: double, TUE: double, CALC: string, MTRANS: string, Obesity: string]

In [0]:
df_repartitionedFilter = df_repartitioned.filter(df_repartitioned.Gender == 'Female')

In [0]:
%time
display(df_repartitionedFilter)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 11.2 µs


Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
Female,25.974446,1.628855,108.090006,yes,yes,3.0,3.0,Sometimes,no,1.757105,no,0.085119,0.465444,Sometimes,Public_Transportation,Obesity_Type_III
Female,20.0,1.62,53.0,no,yes,3.0,1.0,Sometimes,no,3.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight
Female,22.998709,1.740108,53.65727,yes,yes,2.241606,3.0,Frequently,no,1.846626,no,2.460238,0.814518,no,Public_Transportation,Insufficient_Weight
Female,21.832995,1.580964,65.363941,no,yes,2.021446,3.0,Sometimes,no,1.077917,no,0.523847,0.808599,Sometimes,Public_Transportation,Overweight_Level_I
Female,23.501249,1.6,45.0,no,no,2.591439,3.0,Frequently,no,2.074048,no,1.679935,0.0,no,Public_Transportation,Insufficient_Weight
Female,36.0,1.58,60.0,yes,no,3.0,3.0,Sometimes,no,1.0,no,2.0,0.0,Sometimes,Automobile,Normal_Weight
Female,26.0,1.640606,111.036881,yes,yes,3.0,3.0,Sometimes,no,2.709428,no,0.0,0.228486,Sometimes,Public_Transportation,Obesity_Type_III
Female,34.044229,1.665807,77.098973,yes,yes,2.976975,1.346987,Sometimes,no,1.868349,no,0.702538,0.879333,no,Automobile,Overweight_Level_II
Female,18.233541,1.792378,137.859737,yes,yes,3.0,3.0,Sometimes,no,2.838893,no,1.990317,0.735868,Sometimes,Public_Transportation,Obesity_Type_III
Female,23.455303,1.677672,114.470482,yes,yes,3.0,3.0,Sometimes,no,2.426094,no,0.294763,0.737226,Sometimes,Public_Transportation,Obesity_Type_III


In [0]:
# Remover o DataFrame do cache
%time
df_repartitionedFilter.unpersist()

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 11.7 µs


DataFrame[Gender: string, Age: double, Height: double, Weight: double, family_history: string, FAVC: string, FCVC: double, NCP: double, CAEC: string, SMOKE: string, CH2O: double, SCC: string, FAF: double, TUE: double, CALC: string, MTRANS: string, Obesity: string]

In [0]:
%time
display(df_repartitionedFilter)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.82 µs


Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
Female,25.974446,1.628855,108.090006,yes,yes,3.0,3.0,Sometimes,no,1.757105,no,0.085119,0.465444,Sometimes,Public_Transportation,Obesity_Type_III
Female,20.0,1.62,53.0,no,yes,3.0,1.0,Sometimes,no,3.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight
Female,22.998709,1.740108,53.65727,yes,yes,2.241606,3.0,Frequently,no,1.846626,no,2.460238,0.814518,no,Public_Transportation,Insufficient_Weight
Female,21.832995,1.580964,65.363941,no,yes,2.021446,3.0,Sometimes,no,1.077917,no,0.523847,0.808599,Sometimes,Public_Transportation,Overweight_Level_I
Female,23.501249,1.6,45.0,no,no,2.591439,3.0,Frequently,no,2.074048,no,1.679935,0.0,no,Public_Transportation,Insufficient_Weight
Female,36.0,1.58,60.0,yes,no,3.0,3.0,Sometimes,no,1.0,no,2.0,0.0,Sometimes,Automobile,Normal_Weight
Female,26.0,1.640606,111.036881,yes,yes,3.0,3.0,Sometimes,no,2.709428,no,0.0,0.228486,Sometimes,Public_Transportation,Obesity_Type_III
Female,34.044229,1.665807,77.098973,yes,yes,2.976975,1.346987,Sometimes,no,1.868349,no,0.702538,0.879333,no,Automobile,Overweight_Level_II
Female,18.233541,1.792378,137.859737,yes,yes,3.0,3.0,Sometimes,no,2.838893,no,1.990317,0.735868,Sometimes,Public_Transportation,Obesity_Type_III
Female,23.455303,1.677672,114.470482,yes,yes,3.0,3.0,Sometimes,no,2.426094,no,0.294763,0.737226,Sometimes,Public_Transportation,Obesity_Type_III
