<a href="https://colab.research.google.com/github/mvtap/BDCC/blob/main/BDCC_Project_Auto_ML_dataset_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up Spark

In [1]:
def setupSpark():
  # Spark needs to run with Java 8 ... 
  !pip install -q findspark
  !apt-get install openjdk-8-jdk-headless > /dev/null
  !echo 2 | update-alternatives --config java > /dev/null
  !java -version
  import os, findspark
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
  # !echo JAVA_HOME=$JAVA_HOME
  !pip install -q pyspark
  findspark.init(spark_home='/usr/local/lib/python3.7/dist-packages/pyspark')
  !pyspark --version

# Install and configure Spark first; the pyspark imports below need it.
setupSpark()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Get (or create) a session whose master is 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)
# Handle to the SparkContext behind the session.
sc = spark.sparkContext

openjdk version "1.8.0_282"
OpenJDK Runtime Environment (build 1.8.0_282-8u282-b08-0ubuntu1~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.282-b08, mixed mode)
[K     |████████████████████████████████| 212.3MB 77kB/s 
[K     |████████████████████████████████| 204kB 57.5MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.1.1
      /_/
                        
Using Scala version 2.12.10, OpenJDK 64-Bit Server VM, 1.8.0_282
Branch HEAD
Compiled by user ubuntu on 2021-02-22T01:33:19Z
Revision 1d550c4e90275ab418b9161925049239227f3dc9
Url https://github.com/apache/spark
Type --help for more information.


# Connect to Google Cloud 

__You need to set the `PROJECT_ID` variable.__

In [2]:
PROJECT_ID = 'bdcc-project1-309010' 
BUCKET_URI = 'gs://bdcc_open_images_dataset'
from google.colab import auth
auth.authenticate_user()
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


# Get necessary data

This will fetch files that contain the same data as in the BigQuery tables we use for the project.

In [3]:
!gsutil cp {BUCKET_URI}/data/classes.csv .
!gsutil cp {BUCKET_URI}/data/image-labels.csv .
!head classes.csv
!head image-labels.csv

Copying gs://bdcc_open_images_dataset/data/classes.csv...
/ [1 files][ 11.8 KiB/ 11.8 KiB]                                                
Operation completed over 1 objects/11.8 KiB.                                     
Copying gs://bdcc_open_images_dataset/data/image-labels.csv...
\ [1 files][ 10.9 MiB/ 10.9 MiB]                                                
Operation completed over 1 objects/10.9 MiB.                                     
Label,Description
/m/011k07,Tortoise
/m/011q46kg,Container
/m/012074,Magpie
/m/0120dh,Sea turtle
/m/01226z,Football
/m/012n7d,Ambulance
/m/012w5l,Ladder
/m/012xff,Toothbrush
/m/012ysf,Syringe
ImageId,Label
000026e7ee790996,/m/07j7r
000026e7ee790996,/m/05s2s
000062a39995e348,/m/015p6
000062a39995e348,/m/05s2s
0000c64e1253d68f,/m/0k4j
0000c64e1253d68f,/m/07yv9
000132c20b84269b,/m/03q69
000132c20b84269b,/m/0dzct
000132c20b84269b,/m/04hgtk


# Initialize data frames

In [4]:
# Label -> Description lookup, read from classes.csv (header row, inferred types).
classes = spark.read.csv(
    'classes.csv',
    header=True,
    inferSchema=True,
).cache()
classes.createOrReplaceTempView('classes')
classes.printSchema()
classes.show()

# ImageId -> Label pairs, read from image-labels.csv the same way.
image_labels = spark.read.csv(
    'image-labels.csv',
    header=True,
    inferSchema=True,
).cache()
image_labels.createOrReplaceTempView('image_labels')
image_labels.printSchema()
image_labels.show()

root
 |-- Label: string (nullable = true)
 |-- Description: string (nullable = true)

+-----------+--------------------+
|      Label|         Description|
+-----------+--------------------+
|  /m/011k07|            Tortoise|
|/m/011q46kg|           Container|
|  /m/012074|              Magpie|
|  /m/0120dh|          Sea turtle|
|  /m/01226z|            Football|
|  /m/012n7d|           Ambulance|
|  /m/012w5l|              Ladder|
|  /m/012xff|          Toothbrush|
|  /m/012ysf|             Syringe|
|  /m/0130jx|                Sink|
|  /m/0138tl|                 Toy|
|  /m/013y1f|Organ (Musical In...|
|  /m/01432t|       Cassette deck|
|  /m/014j1m|               Apple|
|  /m/014sv8|           Human eye|
|  /m/014trl|           Cosmetics|
|  /m/014y4n|              Paddle|
|  /m/0152hh|             Snowman|
|   /m/01599|                Beer|
|   /m/01_5g|          Chopsticks|
+-----------+--------------------+
only showing top 20 rows

root
 |-- ImageId: string (nullable = true)
 |--

# Define the classes for your model.

Change __`CLASSES`__ to the image classes you want. 

See the project description for instructions.

In [17]:
# One 1-tuple per class description, so each entry is a single-column row.
CLASSES = [
    (description,)
    for description in (
        'Aircraft',
        'Bicycle',
        'Boat',
        'Bus',
        'Car',
        'Train',
        'Helicopter',
        'Motorcycle',
        'Truck',
        'Skateboard',
    )
]

In [18]:
# Single-column ('Description') frame holding the classes chosen in CLASSES.
class_labels = spark.createDataFrame(CLASSES, ['Description']).cache()
class_labels.createOrReplaceTempView('class_labels')
class_labels.printSchema()
class_labels.show()

root
 |-- Description: string (nullable = true)

+-----------+
|Description|
+-----------+
|   Aircraft|
|    Bicycle|
|       Boat|
|        Bus|
|        Car|
|      Train|
| Helicopter|
| Motorcycle|
|      Truck|
| Skateboard|
+-----------+



# Define the data set you want using Spark 

Now it's up to you.

In [33]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

In [84]:
# (ImageId, Description) pairs for the selected classes:
# image_labels -> classes on 'Label', then -> class_labels on 'Description'.
# Parentheses replace the backslash continuations; the original ended with a
# dangling '\' that is a syntax error at the end of a cell and would swallow
# any line added after it.
getimages = (
    image_labels
    .join(classes, 'Label')
    .join(class_labels, 'Description')
    .select('ImageId', 'Description')
    .orderBy('Description')
)

In [85]:
# Window over getimages: one partition per Description, rows ordered by ImageId.
window = (
    Window
    .partitionBy(getimages['Description'])
    .orderBy(getimages['ImageId'])
)

In [86]:
# Keep at most 100 distinct images per class and collect them into pandas.
# - distinct() removes repeated (ImageId, Description) rows so the same image
#   is not emitted twice for one class.
# - row_number() numbers rows 1, 2, 3, ... within each class; rank() gives tied
#   rows the same number, which could let more than 100 rows through the filter.
# The helper column keeps the name 'rank' because a later cell drops it by name.
final = (
    getimages
    .distinct()
    .select('*', F.row_number().over(window).alias('rank'))
    .filter(F.col('rank') <= 100)
    .toPandas()
)

In [88]:
# Remove the helper 'rank' column. errors='ignore' makes the cell safe to
# re-run: without it a second run raises KeyError because the column is gone.
final = final.drop(columns='rank', errors='ignore')

In [89]:
# Add a 'train/test' column holding the empty string in every row.
final = final.assign(**{'train/test': ''})

In [96]:
# Plain list of class names, derived from CLASSES so the two cannot drift apart
# (the original repeated the same ten names by hand).
classes_list = [description for (description,) in CLASSES]

In [95]:
# Preview: first 10 rows whose Description is 'Aircraft'.
is_aircraft = final['Description'] == 'Aircraft'
final.loc[is_aircraft].head(10)


Unnamed: 0,ImageId,Description,train/test
0,00835f0fbe950715,Aircraft,
1,01635d30529455d0,Aircraft,
2,01d4c269fd96589d,Aircraft,
3,02dae28bbf4d02b0,Aircraft,
4,0300e59281a86403,Aircraft,
5,0317e581988fb533,Aircraft,
6,0432503d7d6f9b42,Aircraft,
7,04af13c8a282bd4f,Aircraft,
8,04d39811b8466ade,Aircraft,
9,050850ece13f02fd,Aircraft,


# Put the data in a convenient bucket

Now upload the CSV file describing the data set and __only__ the necessary images to the bucket you'll use with AutoML.

__Note__: the bucket must be created using a __Regional__ location setting. Choose __us-central1__ for example. 


In [None]:
# Placeholder: replace 'TODO' with the bucket you will use with AutoML.
MY_AUTOML_BUCKET='TODO'

In [None]:
# TODO upload CSV and image file