# Data Exploration and Transformation Notebook

In [1]:
#import
from datasets import load_dataset, Image

import cv2 
import os
import PIL

import pandas as pd


# Load Dataset

In [2]:
#load the kili-technology/plastic_in_river dataset from hugging face
#the result holds every split of the dataset, keyed by split name
dataset = load_dataset("kili-technology/plastic_in_river")

  table = cls._concat_blocks(blocks, axis=0)


In [3]:
#show the splits with their features and number of rows
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'litter'],
        num_rows: 3407
    })
    test: Dataset({
        features: ['image', 'litter'],
        num_rows: 427
    })
    validation: Dataset({
        features: ['image', 'litter'],
        num_rows: 425
    })
})

In [4]:
#pull the train, test and validation splits out of the dataset by name
trainData, testData, validationData = (
    dataset[splitName] for splitName in ('train', 'test', 'validation')
)

In [5]:
#show the feature definitions (image and litter annotations) of the train split
trainData.features

{'image': Image(decode=True, id=None),
 'litter': Sequence(feature={'label': ClassLabel(names=['PLASTIC_BAG', 'PLASTIC_BOTTLE', 'OTHER_PLASTIC_WASTE', 'NOT_PLASTIC_WASTE'], id=None), 'bbox': Sequence(feature=Value(dtype='float32', id=None), length=4, id=None)}, length=-1, id=None)}

In [6]:
#show the feature definitions (image and litter annotations) of the test split
testData.features

{'image': Image(decode=True, id=None),
 'litter': Sequence(feature={'label': ClassLabel(names=['PLASTIC_BAG', 'PLASTIC_BOTTLE', 'OTHER_PLASTIC_WASTE', 'NOT_PLASTIC_WASTE'], id=None), 'bbox': Sequence(feature=Value(dtype='float32', id=None), length=4, id=None)}, length=-1, id=None)}

In [7]:
#show the feature definitions (image and litter annotations) of the validation split
validationData.features

{'image': Image(decode=True, id=None),
 'litter': Sequence(feature={'label': ClassLabel(names=['PLASTIC_BAG', 'PLASTIC_BOTTLE', 'OTHER_PLASTIC_WASTE', 'NOT_PLASTIC_WASTE'], id=None), 'bbox': Sequence(feature=Value(dtype='float32', id=None), length=4, id=None)}, length=-1, id=None)}

In [8]:
# the litter column holds the labels (and their bounding boxes) for the images
# take it from each split in one go
testLabels, trainLabels, valLabels = (
    splitData['litter'] for splitData in (testData, trainData, validationData)
)

# Test Data Exploration

In [9]:
#turn the test labels into a pandas dataframe, one row per image
#label is an array and bbox is an array of arrays, so also record how long each one is
labelsPD = pd.DataFrame(testLabels).assign(
    labelsLen=lambda frame: frame['label'].apply(len),
    bboxLen=lambda frame: frame['bbox'].apply(len),
)

In [11]:
#Check for problems in dataset
#every count below is a number of rows (images), labelsPD comes from the previous cell

#rows whose label or bbox arrays contain a missing value
print("Number of NA's in labels: " + str(labelsPD['label'].apply(lambda x: any(pd.isna(x))).sum()))
print("Number of NA's in bbox: " + str(labelsPD['bbox'].apply(lambda x: pd.isna(x).any()).sum()))
#rows with at least one object whose label is an empty string
print("Number of missing labels: " + str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: '' in x).sum()))
#rows with no objects at all
print("Number of empty labels: " + str((labelsPD['labelsLen'] == 0).sum()))
print("Number of empty bbox: " + str((labelsPD['bboxLen'] == 0).sum()))

#every label needs exactly one bounding box
print("Number of unequal label bbox pairs: " + str(len(labelsPD[labelsPD['labelsLen'] != labelsPD['bboxLen']])))
#count the individual bbox values that fall outside of the 0 to 1 range
print("Number of out of bounds bbox values: " + str(labelsPD[labelsPD['bboxLen'] > 0]['bbox'].apply(
    lambda x: sum(not (0 <= jj <= 1) for ii in x for jj in ii)).sum()))

litterLabels = [0,1,2,3]
#every label of every image must be one of litterLabels
#all() is needed here, any() would pass a row that mixes valid and invalid ids
print("Data labels should be either empty or 0, 1, 2, or 3. Is this true: " +
      str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: all(ii in litterLabels for ii in x)).all()))

Number of NA's in labels: 0
Number of NA's in bbox: 0
Number of missing labels: 0
Number of empty labels: 52
Number of empty bbox: 52
Number of unequal label bbox pairs: 0
Number of out of bounds bbox values: 0
Data labels should be either empty or 0, 1, 2, or 3. Is this true: True


# Train Data Exploration

In [12]:
#turn the train labels into a pandas dataframe, one row per image
#label is an array and bbox is an array of arrays, so also record how long each one is
labelsPD = pd.DataFrame(trainLabels).assign(
    labelsLen=lambda frame: frame['label'].apply(len),
    bboxLen=lambda frame: frame['bbox'].apply(len),
)

In [13]:
#check for problems in dataset
#every count below is a number of rows (images), labelsPD comes from the previous cell

#rows whose label or bbox arrays contain a missing value
print("Number of NA's in labels: " + str(labelsPD['label'].apply(lambda x: any(pd.isna(x))).sum()))
print("Number of NA's in bbox: " + str(labelsPD['bbox'].apply(lambda x: pd.isna(x).any()).sum()))
#rows with at least one object whose label is an empty string
print("Number of missing labels: " + str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: '' in x).sum()))
#rows with no objects at all
print("Number of empty labels: " + str((labelsPD['labelsLen'] == 0).sum()))
print("Number of empty bbox: " + str((labelsPD['bboxLen'] == 0).sum()))

#every label needs exactly one bounding box
print("Number of unequal label bbox pairs: " + str(len(labelsPD[labelsPD['labelsLen'] != labelsPD['bboxLen']])))
#count the individual bbox values that fall outside of the 0 to 1 range
print("Number of out of bounds bbox values: " + str(labelsPD[labelsPD['bboxLen'] > 0]['bbox'].apply(
    lambda x: sum(not (0 <= jj <= 1) for ii in x for jj in ii)).sum()))

litterLabels = [0,1,2,3]
#every label of every image must be one of litterLabels
#all() is needed here, any() would pass a row that mixes valid and invalid ids
print("Data labels should be either empty or 0, 1, 2, or 3. Is this true: " +
      str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: all(ii in litterLabels for ii in x)).all()))

Number of NA's in labels: 0
Number of NA's in bbox: 0
Number of missing labels: 0
Number of empty labels: 238
Number of empty bbox: 238
Number of unequal label bbox pairs: 0
Number of out of bounds bbox values: 0
Data labels should be either empty or 0, 1, 2, or 3. Is this true: True


# Validation Data Exploration

In [14]:
#turn the validation labels into a pandas dataframe, one row per image
#label is an array and bbox is an array of arrays, so also record how long each one is
labelsPD = pd.DataFrame(valLabels).assign(
    labelsLen=lambda frame: frame['label'].apply(len),
    bboxLen=lambda frame: frame['bbox'].apply(len),
)

In [15]:
#check for problems in dataset
#every count below is a number of rows (images), labelsPD comes from the previous cell

#rows whose label or bbox arrays contain a missing value
print("Number of NA's in labels: " + str(labelsPD['label'].apply(lambda x: any(pd.isna(x))).sum()))
print("Number of NA's in bbox: " + str(labelsPD['bbox'].apply(lambda x: pd.isna(x).any()).sum()))
#rows with at least one object whose label is an empty string
print("Number of missing labels: " + str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: '' in x).sum()))
#rows with no objects at all
print("Number of empty labels: " + str((labelsPD['labelsLen'] == 0).sum()))
print("Number of empty bbox: " + str((labelsPD['bboxLen'] == 0).sum()))

#every label needs exactly one bounding box
print("Number of unequal label bbox pairs: " + str(len(labelsPD[labelsPD['labelsLen'] != labelsPD['bboxLen']])))
#count the individual bbox values that fall outside of the 0 to 1 range
print("Number of out of bounds bbox values: " + str(labelsPD[labelsPD['bboxLen'] > 0]['bbox'].apply(
    lambda x: sum(not (0 <= jj <= 1) for ii in x for jj in ii)).sum()))

litterLabels = [0,1,2,3]
#every label of every image must be one of litterLabels
#all() is needed here, any() would pass a row that mixes valid and invalid ids
print("Data labels should be either empty or 0, 1, 2, or 3. Is this true: " +
      str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: all(ii in litterLabels for ii in x)).all()))

Number of NA's in labels: 0
Number of NA's in bbox: 0
Number of missing labels: 0
Number of empty labels: 42
Number of empty bbox: 42
Number of unequal label bbox pairs: 0
Number of out of bounds bbox values: 0
Data labels should be either empty or 0, 1, 2, or 3. Is this true: True


# Save Data

### Please determine where to save the data and make an images and labels folder

### Inside the images and labels folders make a train, test, and validation folder (the names must match the dataset split names)

## Remove Non-Plastic Waste and Save Labels to File

In [17]:
#write one txt label file per image for every split, leaving out non-plastic waste
#each line of a file is: <label> <bbox value 1> <bbox value 2> <bbox value 3> <bbox value 4>

#location labels should be saved to (raw string so the backslashes are kept literally)
txtDirectory = r"D:\school\graduate school\capstone\data\labels2"
#label 3 is non-plastic waste, objects with this label are not written
nonPlasticLabel = 3

#split between train, test and validation
for split, data in dataset.items():
    #folder for this split, created when it does not exist yet
    #paths are joined instead of using os.chdir so the working directory is left alone
    splitDirectory = os.path.join(txtDirectory, split)
    os.makedirs(splitDirectory, exist_ok=True)
    #take the litter column once instead of evaluating data['litter'] again for every row and object
    #loop through each labels row, one file per row
    for ii, litter in enumerate(data['litter']):
        #make a txt file for the label with name determined by index, index 0 will become 0.txt
        #the with block closes the file even when writing fails
        with open(os.path.join(splitDirectory, str(ii) + ".txt"), 'w') as f:
            #one line in txt file per object in label array
            for label, bbox in zip(litter['label'], litter['bbox']):
                #do not add line if label is 3, this is non-plastic waste
                if label != nonPlasticLabel:
                    #row starts with the object identifier, a number between 0 and 2
                    #followed by the four parts of the bounding box
                    lineParts = [str(label)] + [str(bbox[ee]) for ee in range(4)]
                    #write the row to the file as a line
                    f.write(' '.join(lineParts) + '\n')

## Save images to png

In [None]:
#save every image of every split as a png named after its index in the split

#location images should be saved to (raw string so the backslashes are kept literally)
imageRoot = r"D:\school\graduate school\capstone\data\images"

#split between train, test and validation
for split, data in dataset.items():
    #folder for this split, created when it does not exist yet
    #paths are joined instead of using os.chdir so the working directory is left alone
    imageDirectory = os.path.join(imageRoot, split)
    os.makedirs(imageDirectory, exist_ok=True)
    #loop through the rows one at a time so the whole image column is not held in a list
    for ii, row in enumerate(data):
        #for each image save the image as a png, image index 0 will be 0.png
        row['image'].save(os.path.join(imageDirectory, f'{ii}.png'))

# Make Test Images Into Video

In [5]:
#change directory to where the video should be saved
%cd C:\Users\grpoo\capstone\PlasticDetection
#folder where test split of images data is saved
imageFolder = 'F:/capstoneData/images/test'
#name of video
videoName = 'testDataset.mp4'
#array of image names in image folder
images = [img for img in os.listdir(image_folder) if img.endswith(".png")]
#height of the video
height = 640
#width of the video
width = 640

#set up video at 30 frames per second
video = cv2.VideoWriter(video_name, 0, 30, (width,height))

#add each image in image file to video
for image in images:
    #adds an image to video
    #first opens the image with imread
    #second resizes image to width and height, otherwise will skip images with diffrent widths and heights
    #third adds the image to video
    video.write(cv2.resize(cv2.imread(os.path.join(image_folder, image)), (width, height)))

#save and close video
cv2.destroyAllWindows()
video.release()

C:\Users\grpoo\capstone\PlasticDetection
