<a href="https://colab.research.google.com/github/mvtap/BDCC/blob/main/BDCC_Project_Auto_ML_dataset_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up Spark

In [1]:
def setupSpark():
  # Spark needs to run with Java 8 ... 
  !pip install -q findspark
  !apt-get install openjdk-8-jdk-headless > /dev/null
  !echo 2 | update-alternatives --config java > /dev/null
  !java -version
  import os, findspark
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
  # !echo JAVA_HOME=$JAVA_HOME
  !pip install -q pyspark
  findspark.init(spark_home='/usr/local/lib/python3.7/dist-packages/pyspark')
  !pyspark --version

setupSpark()

from pyspark import SparkContext
from pyspark.sql import SparkSession
    
# Local Spark session using every available core ('local[*]'); `sc` is its
# underlying SparkContext.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)
sc = spark.sparkContext

openjdk version "1.8.0_282"
OpenJDK Runtime Environment (build 1.8.0_282-8u282-b08-0ubuntu1~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.282-b08, mixed mode)
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.1.1
      /_/
                        
Using Scala version 2.12.10, OpenJDK 64-Bit Server VM, 1.8.0_282
Branch HEAD
Compiled by user ubuntu on 2021-02-22T01:33:19Z
Revision 1d550c4e90275ab418b9161925049239227f3dc9
Url https://github.com/apache/spark
Type --help for more information.


# Connect to Google Cloud 

__You need to set the `PROJECT_ID` variable.__

In [2]:
# Google Cloud project used by the gcloud/gsutil commands in this notebook.
PROJECT_ID = 'bdcc-project1-309010'
GOOGLE_CLOUD_PROJECT = PROJECT_ID
# Source bucket holding the data CSV files and the images.
BUCKET_URI = 'gs://bdcc_open_images_dataset'
# Authenticate the Colab user, then make PROJECT_ID the active gcloud project.
from google.colab import auth
auth.authenticate_user()
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


# Get necessary data

This will fetch files that contain the same data as in the BigQuery tables we use for the project.

In [3]:
# Download the two CSV files from the source bucket into the working directory.
!gsutil cp {BUCKET_URI}/data/classes.csv .
!gsutil cp {BUCKET_URI}/data/image-labels.csv .
# Preview the first lines of each file.
!head classes.csv
!head image-labels.csv

Copying gs://bdcc_open_images_dataset/data/classes.csv...
/ [1 files][ 11.8 KiB/ 11.8 KiB]                                                
Operation completed over 1 objects/11.8 KiB.                                     
Copying gs://bdcc_open_images_dataset/data/image-labels.csv...
- [1 files][ 10.9 MiB/ 10.9 MiB]                                                
Operation completed over 1 objects/10.9 MiB.                                     
Label,Description
/m/011k07,Tortoise
/m/011q46kg,Container
/m/012074,Magpie
/m/0120dh,Sea turtle
/m/01226z,Football
/m/012n7d,Ambulance
/m/012w5l,Ladder
/m/012xff,Toothbrush
/m/012ysf,Syringe
ImageId,Label
000026e7ee790996,/m/07j7r
000026e7ee790996,/m/05s2s
000062a39995e348,/m/015p6
000062a39995e348,/m/05s2s
0000c64e1253d68f,/m/0k4j
0000c64e1253d68f,/m/07yv9
000132c20b84269b,/m/03q69
000132c20b84269b,/m/0dzct
000132c20b84269b,/m/04hgtk


# Initialize data frames

In [4]:
# classes.csv (Label, Description): read with header and inferred schema,
# cache it, and expose it to Spark SQL as the 'classes' view.
classes = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('classes.csv')
    .cache()
)
classes.createOrReplaceTempView('classes')
classes.printSchema()
classes.show()

# image-labels.csv (ImageId, Label): same treatment, exposed as 'image_labels'.
image_labels = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('image-labels.csv')
    .cache()
)
image_labels.createOrReplaceTempView('image_labels')
image_labels.printSchema()
image_labels.show()

root
 |-- Label: string (nullable = true)
 |-- Description: string (nullable = true)

+-----------+--------------------+
|      Label|         Description|
+-----------+--------------------+
|  /m/011k07|            Tortoise|
|/m/011q46kg|           Container|
|  /m/012074|              Magpie|
|  /m/0120dh|          Sea turtle|
|  /m/01226z|            Football|
|  /m/012n7d|           Ambulance|
|  /m/012w5l|              Ladder|
|  /m/012xff|          Toothbrush|
|  /m/012ysf|             Syringe|
|  /m/0130jx|                Sink|
|  /m/0138tl|                 Toy|
|  /m/013y1f|Organ (Musical In...|
|  /m/01432t|       Cassette deck|
|  /m/014j1m|               Apple|
|  /m/014sv8|           Human eye|
|  /m/014trl|           Cosmetics|
|  /m/014y4n|              Paddle|
|  /m/0152hh|             Snowman|
|   /m/01599|                Beer|
|   /m/01_5g|          Chopsticks|
+-----------+--------------------+
only showing top 20 rows

root
 |-- ImageId: string (nullable = true)
 |--

# Define the classes for your model.

Change __`CLASSES`__ to the image classes you want. 

See the project description for instructions.

In [5]:
# Image classes to train on, stored as one-element tuples so the list can be
# passed directly as row data to spark.createDataFrame.
CLASSES = [
    (description,)
    for description in (
        'Aircraft',
        'Bicycle',
        'Boat',
        'Bus',
        'Car',
        'Train',
        'Helicopter',
        'Motorcycle',
        'Truck',
        'Skateboard',
    )
]

In [6]:
# Single-column ('Description') frame of the chosen classes, cached and
# registered as the 'class_labels' view.
class_labels = spark.createDataFrame(CLASSES, ['Description']).cache()
class_labels.createOrReplaceTempView('class_labels')
class_labels.printSchema()
class_labels.show()

root
 |-- Description: string (nullable = true)

+-----------+
|Description|
+-----------+
|   Aircraft|
|    Bicycle|
|       Boat|
|        Bus|
|        Car|
|      Train|
| Helicopter|
| Motorcycle|
|      Truck|
| Skateboard|
+-----------+



# Define the data set you want using Spark 

Now it's up to you.

In [7]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col
import pandas as pd

In [41]:
# (ImageId, Description) pairs restricted to the chosen classes.
# dropDuplicates keeps a single row per ImageId, so an image labelled with
# several chosen classes ends up under only one of them.
getimages = (
    image_labels
    .join(classes, on='Label')
    .join(class_labels, on='Description')
    .select('ImageId', 'Description')
    .dropDuplicates(['ImageId'])
    .orderBy('Description')
)

In [42]:
# Window over the rows of one Description at a time, ordered by ImageId.
window = Window.partitionBy('Description').orderBy('ImageId')

In [43]:
# Rank the images inside each class, keep ranks 1..100 and collect the
# result into a pandas DataFrame (columns: ImageId, Description, rank).
final = (
    getimages
    .withColumn('rank', rank().over(window))
    .where(col('rank') <= 100)
    .toPandas()
)

In [44]:
# The helper 'rank' column is no longer needed after the filter.
final = final.drop(columns=['rank'])

In [45]:
# Add an empty 'train/test' column; it is filled with the split names below.
final['train/test']=''

In [46]:
# Class names in alphabetical order, derived from CLASSES so the two cannot
# drift apart when CLASSES is edited (previously a hand-maintained copy).
classes_list = sorted(description for (description,) in CLASSES)

In [47]:
# Assign the split from each row's position inside its own class:
# positions 0-79 -> TRAIN, 80-89 -> VALIDATION, 90-99 -> TEST.
# A single .loc assignment replaces the former chained indexing
# (final.loc[...][mask] = value), which only worked when the intermediate
# slice was a view and silently did nothing otherwise. Using the per-class
# position also removes the assumption that class i occupies exactly rows
# i*100 .. i*100+99 of the frame.
position_in_class = final.groupby('Description').cumcount()
is_known_class = final['Description'].isin(classes_list)
for split_name, first, last in [('TRAIN', 0, 79),
                                ('VALIDATION', 80, 89),
                                ('TEST', 90, 99)]:
    in_range = position_in_class.between(first, last)  # inclusive on both ends
    final.loc[is_known_class & in_range, 'train/test'] = split_name

In [48]:
# Rotate the column order so the last column ('train/test') comes first.
cols = list(final.columns)
cols = [*cols[-1:], *cols[:-1]]
final = final[cols]

# Put the data in a convenient bucket

Now upload the CSV file describing the data set and __only__ the necessary images to the bucket you'll use with AutoML.

__Note__: the bucket must be created using a __Regional__ location setting. Choose __us-central1__ for example. 


In [49]:
# Destination bucket: images are copied under /img/ and the CSV is written under /csv/.
MY_AUTOML_BUCKET='gs://bddc_train_transport'

In [50]:
for j in range(final.shape[0]):
  id = final.loc[[j],['ImageId']].values
  idf = id[0][0]
  !gsutil cp {BUCKET_URI}/images/{idf}.jpg {MY_AUTOML_BUCKET}/img/
  print(j)

Copying gs://bdcc_open_images_dataset/images/00835f0fbe950715.jpg [Content-Type=image/jpeg]...
/ [1 files][378.4 KiB/378.4 KiB]                                                
Operation completed over 1 objects/378.4 KiB.                                    
0
Copying gs://bdcc_open_images_dataset/images/01635d30529455d0.jpg [Content-Type=image/jpeg]...
/ [1 files][399.7 KiB/399.7 KiB]                                                
Operation completed over 1 objects/399.7 KiB.                                    
1
Copying gs://bdcc_open_images_dataset/images/02dae28bbf4d02b0.jpg [Content-Type=image/jpeg]...
/ [1 files][ 65.4 KiB/ 65.4 KiB]                                                
Operation completed over 1 objects/65.4 KiB.                                     
2
Copying gs://bdcc_open_images_dataset/images/0300e59281a86403.jpg [Content-Type=image/jpeg]...
/ [1 files][376.8 KiB/376.8 KiB]                                                
Operation completed over 1 objects/376.8 KiB

In [51]:
# Build the AutoML frame on a copy, so `final` keeps the bare image ids and
# re-running this cell does not prefix the paths a second time
# (`finalML = final` was only an alias of the same DataFrame).
finalML = final.copy()
# Vectorized replacement of the per-row loop: id -> full gs:// object path.
finalML['ImageId'] = MY_AUTOML_BUCKET + '/img/' + finalML['ImageId'] + '.jpg'

In [52]:
# Preview the first rows to check the split labels and the rewritten image paths.
finalML.head()

Unnamed: 0,train/test,ImageId,Description
0,TRAIN,gs://bddc_train_transport/img/00835f0fbe950715...,Aircraft
1,TRAIN,gs://bddc_train_transport/img/01635d30529455d0...,Aircraft
2,TRAIN,gs://bddc_train_transport/img/02dae28bbf4d02b0...,Aircraft
3,TRAIN,gs://bddc_train_transport/img/0300e59281a86403...,Aircraft
4,TRAIN,gs://bddc_train_transport/img/0317e581988fb533...,Aircraft


In [204]:
!pip install fsspec
!pip install gcsfs

Collecting gcsfs
  Downloading https://files.pythonhosted.org/packages/ef/7a/c847e1a170be1836d8df2d3e0449b349b06c90a627914196e084b75b0284/gcsfs-0.7.2-py2.py3-none-any.whl
Collecting aiohttp
[?25l  Downloading https://files.pythonhosted.org/packages/88/c0/5890b4c8b04a79b7360e8fe4490feb0bb3ab179743f199f0e6220cebd568/aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 11.1MB/s 
Collecting multidict<7.0,>=4.5
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a6/4123b8165acbe773d1a8dc8e3f0d1edea16d29f7de018eda769abb56bd30/multidict-5.1.0-cp37-cp37m-manylinux2014_x86_64.whl (142kB)
[K     |████████████████████████████████| 143kB 58.9MB/s 
[?25hCollecting async-timeout<4.0,>=3.0
  Downloading https://files.pythonhosted.org/packages/e1/1e/5a4441be21b0726c4464f3f23c8b19628372f606755a9d2e46c187e65ec4/async_timeout-3.0.1-py3-none-any.whl
Collecting yarl<2.0,>=1.0
[?25l  Downloading https://files.pythonhosted.org/package

In [53]:
# Write the data set description to the bucket as csv/automl.csv,
# without a header row and without the pandas index column.
finalML.to_csv(MY_AUTOML_BUCKET+'/csv/automl.csv', header = False, index = False)