In [1]:
from pyspark import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Exploratory data analysis (EDA)

In [3]:
# Load the challenge dataset: every CSV file that was unpacked into the same
# DBFS folder is read into one DataFrame (first line of each file = header).
# Debug helper to list the folder contents:
#display(dbutils.fs.ls("dbfs:/FileStore/tables/"))
caminho_csv = "/FileStore/tables/*.csv"
leitor_csv = spark.read.format("csv").options(header='true')
df_desafio = leitor_csv.load(caminho_csv)

In [4]:
# Columns that are kept as-is (no cast) when the column types are adjusted.
colunas_sem_cast = ['ca', 'unit', 'scp', 'station', 'linename', 'division', 'desc']

# Full projection, in output order: timestamp first, untouched columns next,
# then the two counters cast to int.
expressoes_tipadas = (
    ['cast(time as timestamp) time']
    + colunas_sem_cast
    + ['cast(entries as int) entries', 'cast(exits as int) exits']
)

# Time-based features: (new column name, function applied to the "time" column).
features_temporais = [
    ("dt_year", year),
    ("dt_month", month),
    ("dt_day", dayofmonth),
    ("dt_dayofy", dayofyear),
    ("dt_hour", hour),
    ("dt_min", minute),
    ("dt_week_no", weekofyear),
    ("dt_int", unix_timestamp),
]

# Drop rows containing any null, remove duplicate rows, then fix column types.
df_desafio_v2 = (
    df_desafio
    .dropna(how='any')
    .dropDuplicates()
    .selectExpr(*expressoes_tipadas)
)

# Append one derived column per time feature, in the order listed above.
for nome_feature, extrai_feature in features_temporais:
    df_desafio_v2 = df_desafio_v2.withColumn(nome_feature, extrai_feature(col("time")))

In [5]:
# Sanity check of the cleaning steps. Counts from the recorded run:
# 79609191 raw rows / 79130015 after dropna / 79045675 after dropDuplicates.
df_desafio_sem_na = df_desafio.dropna(how='any')

count_desafio = df_desafio.count()
count_desafio_na = df_desafio_sem_na.count()
count_desafio_final = df_desafio_sem_na.dropDuplicates().count()

# Rows removed by each cleaning step.
removidos_na = count_desafio - count_desafio_na
removidos_duplicado = count_desafio_na - count_desafio_final

# Two-row summary: raw total "before", and the breakdown "after" cleaning.
linha_antes = ('antes', count_desafio, 0, 0, 0)
linha_depois = ('depois', 0, count_desafio_final, removidos_na, removidos_duplicado)

df_amostras = sc.parallelize([linha_antes, linha_depois]).toDF(
    ['AMOSTRAS', 'TOTAL', 'UNICO', 'NA', 'DUPLICADO']
)

display(df_amostras)

AMOSTRAS,TOTAL,UNICO,NA,DUPLICADO
antes,79609191,0,0,0
depois,0,79045675,479176,84340


In [6]:
# Compact one-row summary: only the rows removed as null and as duplicate.
qtd_na = count_desafio - count_desafio_na
qtd_duplicado = count_desafio_na - count_desafio_final

linha_resumo = ('', qtd_na, qtd_duplicado)
df_amostras_v2 = sc.parallelize([linha_resumo]).toDF(['AMOSTRAS', 'NA', 'DUPLICADO'])

display(df_amostras_v2)

AMOSTRAS,NA,DUPLICADO
,479176,84340


In [7]:
# Encode the categorical (string) columns: each column in lista_idx gets a
# companion "<column>_idx" column produced by a StringIndexer.
# Author's note from the original run: ~14.59 minutes.
lista_idx = ['ca', 'unit', 'scp', 'station', 'linename', 'division', 'desc']
colunas_idx = [column + "_idx" for column in lista_idx]

# Make the cell safe to re-run: this cell overwrites df_desafio_v2 with a
# frame that already contains the "*_idx" columns, and StringIndexer refuses
# to fit when its output column already exists. Dropping them first is a
# no-op on the first run (drop ignores column names that are not present).
df_desafio_base = df_desafio_v2.drop(*colunas_idx)

# One fitted indexer per categorical column; `indexers` keeps holding the
# fitted models, as before.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_idx").fit(df_desafio_base)
    for column in lista_idx
]
pipeline = Pipeline(stages=indexers)
df_desafio_v2 = pipeline.fit(df_desafio_base).transform(df_desafio_base)
#display(df_desafio_v2)

In [8]:
#display(df_desafio_v2)

In [9]:
#display(df_desafio_v2)