## Data Preprocessing: Step 3

#### In this notebook, we preprocess the third data source into a particular format and visualize the images. This dataset contains images of faces with and without masks.

### a) Data Preprocessing

In [24]:
# Importing Relevant Libraries
import pandas as pd
import os
import cv2
import glob
import json
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize

In [25]:
# Load the raw annotation CSV and preview its first rows.
# Each row describes one annotated region of one image.
df = pd.read_csv(
    '/data/logo_detection/Hackathon/raw_data/labels_3/Masks.csv'
)
df.head()

Unnamed: 0,filename,file_size,file_attributes,region_count,region_id,region_shape_attributes,region_attributes
0,6a9947bc-2cff-42d8-a32b-aac46276c512.jpg,151294,"{""caption"":"""",""public_domain"":""no"",""image_url""...",2,0,"{""name"":""rect"",""x"":142,""y"":54,""width"":37,""heig...","{""name"":""None"",""type"":""unknown"",""image_quality..."
1,6a9947bc-2cff-42d8-a32b-aac46276c512.jpg,151294,"{""caption"":"""",""public_domain"":""no"",""image_url""...",2,1,"{""name"":""rect"",""x"":214,""y"":25,""width"":36,""heig...","{""name"":""Good"",""type"":""unknown"",""image_quality..."
2,52e07677-32ec-4fb7-8641-ad87e5482ea3.jpg,297189,"{""caption"":"""",""public_domain"":""no"",""image_url""...",1,0,"{""name"":""rect"",""x"":1034,""y"":161,""width"":616,""h...","{""name"":""Bad"",""type"":""unknown"",""image_quality""..."
3,00140c9b-1600.jpg,844142,"{""caption"":"""",""public_domain"":""no"",""image_url""...",2,0,"{""name"":""rect"",""x"":617,""y"":274,""width"":93,""hei...","{""name"":""Bad"",""type"":""unknown"",""image_quality""..."
4,00140c9b-1600.jpg,844142,"{""caption"":"""",""public_domain"":""no"",""image_url""...",2,1,"{""name"":""rect"",""x"":844,""y"":270,""width"":79,""hei...","{""name"":""Good"",""type"":""unknown"",""image_quality..."


In [48]:
# Converting json attributes in different columns
def json_to_columns(annotation):
    """Expand the JSON-encoded annotation columns into plain columns.

    Parameters
    ----------
    annotation : pandas.DataFrame
        Must contain ``filename``, ``region_shape_attributes`` and
        ``region_attributes``; the latter two hold one JSON object per row.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame indexed like ``annotation`` with the columns
        ``filename, name_shape, x, y, width, height``.  Despite its name,
        ``name_shape`` carries the ``name`` key of ``region_attributes``
        (the label of the region); the ``name`` key of
        ``region_shape_attributes`` is discarded.

    Raises
    ------
    KeyError
        If a required column is missing from ``annotation`` or a required
        JSON key is absent from every row.
    """
    # ``pd.json_normalize`` is the public entry point; importing it from
    # ``pandas.io.json`` is deprecated.
    shape_cols = pd.json_normalize(
        annotation['region_shape_attributes'].apply(json.loads).tolist()
    )
    attr_cols = pd.json_normalize(
        annotation['region_attributes'].apply(json.loads).tolist()
    )

    # The normalised frames always carry a fresh 0..n-1 index, so combine the
    # pieces positionally and restore the caller's index afterwards.  Joining
    # on index labels would misalign rows for any non-default input index.
    result = pd.concat(
        [
            annotation[['filename']].reset_index(drop=True),
            attr_cols[['name']].rename(columns={'name': 'name_shape'}),
            shape_cols[['x', 'y', 'width', 'height']],
        ],
        axis=1,
    )
    result.index = annotation.index
    return result

In [27]:
# Getting the shape of the image
# Result - dict mapping filename -> tuple (height, width, channels)
def get_image_shape(df,image_path):
    """Read every annotated image once and record its array shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``filename`` column; repeated filenames are read once.
    image_path : str
        Directory that holds the image files.

    Returns
    -------
    dict
        ``{filename: im.shape}`` for every file that exists and could be
        decoded.  The shape is ordered (height, width, channels).  Files that
        are missing or unreadable are left out of the mapping.
    """
    image_shape = {}
    # dict.fromkeys de-duplicates while keeping first-seen order, so an image
    # with several annotated regions is decoded only once.
    for name in dict.fromkeys(df['filename']):
        full_path = os.path.join(image_path, name)
        if not os.path.exists(full_path):
            continue
        im = cv2.imread(full_path)
        # cv2.imread signals a decode failure by returning None, not by raising.
        if im is None:
            continue
        image_shape[name] = im.shape
    return image_shape
    

In [28]:
# Getting the center of the annotations from x and y coordinates
def convert_format_vggtoyolo(df):
    """Add box-centre columns to ``df`` and return the same frame.

    ``x``/``y`` are treated as the top-left corner of each box; the centre is
    that corner shifted by half the box ``width``/``height``.  The columns
    ``center-x`` and ``center-y`` are written into ``df`` in place.
    """
    half_width = df['width'] / 2
    half_height = df['height'] / 2
    df['center-x'] = df['x'] + half_width
    df['center-y'] = df['y'] + half_height
    return df

In [29]:
def normalise(df):
    """Turn pixel-space boxes into centre/size values relative to the image.

    The frame is first passed through ``convert_format_vggtoyolo`` to obtain
    ``center-x``/``center-y``.  Then the box size and centre are divided by
    the matching ``image_width``/``image_height`` column.  The columns
    ``normalised_height`` and ``normalised_width`` are added, and the two
    centre columns are overwritten with their scaled values.  The frame
    returned by the helper is returned after these assignments.
    """
    # Corner + size (x, y, w, h)  ->  centre + size (center-x, center-y, w, h)
    boxes = convert_format_vggtoyolo(df)

    img_w = boxes['image_width']
    img_h = boxes['image_height']

    # Box size as a fraction of the image size
    boxes['normalised_height'] = boxes['height'].div(img_h)
    boxes['normalised_width'] = boxes['width'].div(img_w)

    # Box centre as a fraction of the image size
    boxes['center-x'] = boxes['center-x'].div(img_w)
    boxes['center-y'] = boxes['center-y'].div(img_h)

    return boxes
    

In [44]:
def select_relevant_columns(df,image_path):
    """Return only the columns that make up the final annotation table.

    ``name_shape`` is copied into a new ``class_name`` column on ``df`` itself,
    then a six-column selection is returned.  ``image_path`` is accepted but
    not used by this function.
    """
    output_columns = [
        'filename',
        'class_name',
        'center-x',
        'center-y',
        'normalised_width',
        'normalised_height',
    ]
    df['class_name'] = df['name_shape']
    return df.loc[:, output_columns]

In [45]:
def data_preprocessing(annotation_path,image_path):
    """Run the full annotation pipeline for one annotation CSV.

    Steps: read the CSV, expand its JSON columns (``json_to_columns``), look
    up each image's shape (``get_image_shape``), attach the image dimensions
    to every row, normalise the boxes (``normalise``) and keep the final
    columns (``select_relevant_columns``).

    Parameters
    ----------
    annotation_path : str
        Path of the annotation CSV; it must provide the columns ``filename``,
        ``region_shape_attributes`` and ``region_attributes``.
    image_path : str
        Directory holding the images named in the ``filename`` column.

    Returns
    -------
    pandas.DataFrame
        Whatever ``select_relevant_columns`` returns for the processed frame.
        Rows whose image has no entry in the shape mapping are kept; their
        image dimensions are NaN.
    """
    raw = pd.read_csv(annotation_path)
    annotation = raw[['filename','region_shape_attributes','region_attributes']]

    # Work on an independent copy so the column assignments below never write
    # into a slice of another frame.
    df = json_to_columns(annotation).copy()

    image_shape = get_image_shape(df,image_path)
    df['image_shape'] = df['filename'].map(image_shape)

    # Build the dimension columns row by row with an explicit placeholder for
    # images without a recorded shape.  Feeding the raw mapped column to
    # pd.DataFrame breaks when the first row has no shape, because a leading
    # NaN stops the list from being treated as rows of tuples.
    missing = (float('nan'),) * 3
    shape_rows = [image_shape.get(name, missing) for name in df['filename']]
    dimension_columns = ['image_height','image_width','image_channels']
    df[dimension_columns] = pd.DataFrame(
        shape_rows, columns=dimension_columns, index=df.index
    )

    df = normalise(df)
    finaldf = select_relevant_columns(df,image_path)

    return finaldf
    

In [46]:
# Writing final annotation to the designated folder
def write_annotations(finaldf,image_path,output_path=None):
    """Write the final annotation table to a CSV file without the index.

    Parameters
    ----------
    finaldf : pandas.DataFrame
        Table to persist.
    image_path : str
        Kept for backward compatibility; it does not influence the output.
    output_path : str, optional
        Destination CSV.  When omitted, the fixed location
        ``/data/logo_detection/Hackathon/processed_files/annotation_3.csv``
        is used.
    """
    if output_path is None:
        output_path = '/data/logo_detection/Hackathon/processed_files/annotation_3.csv'
    finaldf.to_csv(output_path, index=False)
    

In [47]:
if __name__ == '__main__':
    # Annotation CSV and the directory holding the images it refers to
    annotation_path = '/data/logo_detection/Hackathon/raw_data/labels_3/Masks.csv'
    image_path = '/data/logo_detection/Hackathon/raw_data/images_3/'

    # Build the processed annotation table, then persist it
    finaldf = data_preprocessing(annotation_path, image_path)
    write_annotations(finaldf, image_path)
    

  
  This is separate from the ipykernel package so we can avoid doing imports until
