# Leitura de Arquivos de Imagens no Spark

## Exemplo leitura de imagens de espécies de flores

Este exemplo lê um diretório de imagens, cria um vetor de features e inclui um tag de identificação.

Cada diretório pode conter qualquer número de imagens; no entanto, é obrigatório que cada diretório possua apenas uma categoria de imagem.

In [1]:
# Init Spark environment: findspark.init() runs before any pyspark import
# so that the pyspark package can be found and imported below.
import findspark
findspark.init()

In [2]:
# Load libs
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Root directory of the image dataset. The trailing slash matters: the
# per-category paths are built by plain string concatenation
# (img_dir + "<category>").
img_dir = "/data/dataset/flower_photos/"

In [3]:
# Build the Spark session step by step: local master using every available
# core, then the application name, then create (or reuse) the session.
session_builder = SparkSession.builder
session_builder = session_builder.master("local[*]")
session_builder = session_builder.appName("FlowerIdentification")
spark = session_builder.getOrCreate()

In [4]:
# Read images using Spark's built-in image data source: read.format("image").
# Create per-category training & test DataFrames for transfer learning.
# Sub directories: daisy  dandelion  roses  sunflowers  tulips

# Fixed seed so the 80/20 split does not change on every run
# (assuming the same input files and partitioning).
SPLIT_SEED = 42


def load_labeled_images(category, label):
    """Load one category sub-directory and tag every row with its label.

    Args:
        category: Name of the sub-directory under ``img_dir``.
        label: Integer class id written to the added ``label`` column.

    Returns:
        DataFrame holding the ``image`` column produced by the image data
        source plus a constant integer ``label`` column.
    """
    images = spark.read.format("image").load(img_dir + category)
    return images.withColumn("label", lit(label))


def split_train_test(df):
    """Randomly split ``df`` into (train, test) with 0.8 / 0.2 weights."""
    return df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)


daisy_df = load_labeled_images("daisy", 0)
daisy_train, daisy_test = split_train_test(daisy_df)

dandelion_df = load_labeled_images("dandelion", 1)
dandelion_train, dandelion_test = split_train_test(dandelion_df)

roses_df = load_labeled_images("roses", 2)
roses_train, roses_test = split_train_test(roses_df)

sunflowers_df = load_labeled_images("sunflowers", 3)
sunflowers_train, sunflowers_test = split_train_test(sunflowers_df)

tulips_df = load_labeled_images("tulips", 4)
tulips_train, tulips_test = split_train_test(tulips_df)

# Every category shares the same schema; print one as a sample.
roses_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = false)



In [5]:
# Combine the per-category splits into one training set and one test set.

# DataFrame for training a classification model
train_df = daisy_train
for part in (dandelion_train, roses_train, sunflowers_train, tulips_train):
    train_df = train_df.union(part)
train_df.show()

# DataFrame for testing the classification model
test_df = daisy_test
for part in (dandelion_test, roses_test, sunflowers_test, tulips_test):
    test_df = test_df.union(part)
test_df.show()

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
+--------------------+-----+
only showing top 20 rows

+--------------------+-----+
|               image|label|
+--------------------+-----+
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/dat...|    0|
|[file:///data/da

In [6]:
# Stop the Spark session now that the notebook is done with it.
spark.stop()