# Detection of Plastic in Rivers Using Object Identification

## DS 785: Capstone Code

# Data Exploration and Transformation

In [None]:
#Imports
from datasets import load_dataset, Image

import cv2 
import os
import PIL

import pandas as pd


# Load Dataset

In [None]:
#load the plastic in river dataset from hugging face by its repository id
datasetName = "kili-technology/plastic_in_river"
dataset = load_dataset(datasetName)

In [None]:
#display the loaded dataset object as the cell output
dataset

In [None]:
#pull the train, test and validation splits out of the dataset by name
trainData, testData, validationData = (
    dataset[splitName] for splitName in ('train', 'test', 'validation')
)

In [None]:
#display the features (columns) of the train split
trainData.features

In [None]:
#display the features (columns) of the test split
testData.features

In [None]:
#display the features (columns) of the validation split
validationData.features

In [None]:
# the 'litter' column of each split holds the labels for its images
testLabels, trainLabels, valLabels = (
    splitData['litter'] for splitData in (testData, trainData, validationData)
)

# Test Data Exploration

In [None]:
#one dataframe row per image, built from the test labels
labelsPD = pd.DataFrame(testLabels)

#'label' is an array and 'bbox' is an array of arrays;
#store the length of each of them next to the data
labelsPD = labelsPD.assign(
    labelsLen = labelsPD['label'].apply(len),
    bboxLen = labelsPD['bbox'].apply(len),
)

In [None]:
#Check for problems in dataset
#NA values anywhere inside the label and bbox arrays
print("Number of NA's in labels: " + str(labelsPD['label'].apply(lambda x: any(pd.isna(x))).sum()))
print("Number of NA's in bbox: " + str(labelsPD['bbox'].apply(lambda x: pd.isna(x).any()).sum()))
#images that have labels, but at least one of them is an empty string
print("Number of missing labels: " + str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: '' in  x).sum()))
#images without any label / without any bounding box
print("Number of empty labels: " + str(labelsPD[labelsPD['labelsLen'] == 0]['label'].apply(lambda x: not x).sum()))
print("Number of empty bbox: " + str(labelsPD[labelsPD['bboxLen'] == 0]['bbox'].apply(lambda x: not x).sum()))

#images where the number of labels differs from the number of bounding boxes
print("Number of unequal label bbox pairs: " + str(len(labelsPD[labelsPD['labelsLen'] != labelsPD['bboxLen']])))
#count every individual bounding box value that lies outside the range 0 to 1
print("Number of out of bounds bbox values: " + str(labelsPD[labelsPD['bboxLen'] > 0]['bbox'].apply(
    lambda x: sum(not (0 <= jj <= 1) for ii in x for jj in ii)).sum()))

litterLabels = [0,1,2,3]
#every label of every image has to be one of litterLabels; checking only that some
#valid label occurs in the array would let invalid labels go unnoticed
print("Data labels should be either empty or 0, 1, 2, or 3. Is this true: " + 
      str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: all(ii in litterLabels for ii in x)).all()))

# Train Data Exploration

In [None]:
#one dataframe row per image, built from the train labels
labelsPD = pd.DataFrame(trainLabels)

#'label' is an array and 'bbox' is an array of arrays;
#store the length of each of them next to the data
labelsPD = labelsPD.assign(
    labelsLen = labelsPD['label'].apply(len),
    bboxLen = labelsPD['bbox'].apply(len),
)

In [None]:
#check for problems in dataset
#NA values anywhere inside the label and bbox arrays
print("Number of NA's in labels: " + str(labelsPD['label'].apply(lambda x: any(pd.isna(x))).sum()))
print("Number of NA's in bbox: " + str(labelsPD['bbox'].apply(lambda x: pd.isna(x).any()).sum()))
#images that have labels, but at least one of them is an empty string
print("Number of missing labels: " + str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: '' in  x).sum()))
#images without any label / without any bounding box
print("Number of empty labels: " + str(labelsPD[labelsPD['labelsLen'] == 0]['label'].apply(lambda x: not x).sum()))
print("Number of empty bbox: " + str(labelsPD[labelsPD['bboxLen'] == 0]['bbox'].apply(lambda x: not x).sum()))

#images where the number of labels differs from the number of bounding boxes
print("Number of unequal label bbox pairs: " + str(len(labelsPD[labelsPD['labelsLen'] != labelsPD['bboxLen']])))
#count every individual bounding box value that lies outside the range 0 to 1
print("Number of out of bounds bbox values: " + str(labelsPD[labelsPD['bboxLen'] > 0]['bbox'].apply(
    lambda x: sum(not (0 <= jj <= 1) for ii in x for jj in ii)).sum()))

litterLabels = [0,1,2,3]
#every label of every image has to be one of litterLabels; checking only that some
#valid label occurs in the array would let invalid labels go unnoticed
print("Data labels should be either empty or 0, 1, 2, or 3. Is this true: " + 
      str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: all(ii in litterLabels for ii in x)).all()))

# Validation Data Exploration

In [None]:
#one dataframe row per image, built from the validation labels
labelsPD = pd.DataFrame(valLabels)

#'label' is an array and 'bbox' is an array of arrays;
#store the length of each of them next to the data
labelsPD = labelsPD.assign(
    labelsLen = labelsPD['label'].apply(len),
    bboxLen = labelsPD['bbox'].apply(len),
)

In [None]:
#check for problems in dataset
#NA values anywhere inside the label and bbox arrays
print("Number of NA's in labels: " + str(labelsPD['label'].apply(lambda x: any(pd.isna(x))).sum()))
print("Number of NA's in bbox: " + str(labelsPD['bbox'].apply(lambda x: pd.isna(x).any()).sum()))
#images that have labels, but at least one of them is an empty string
print("Number of missing labels: " + str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: '' in  x).sum()))
#images without any label / without any bounding box
print("Number of empty labels: " + str(labelsPD[labelsPD['labelsLen'] == 0]['label'].apply(lambda x: not x).sum()))
print("Number of empty bbox: " + str(labelsPD[labelsPD['bboxLen'] == 0]['bbox'].apply(lambda x: not x).sum()))

#images where the number of labels differs from the number of bounding boxes
print("Number of unequal label bbox pairs: " + str(len(labelsPD[labelsPD['labelsLen'] != labelsPD['bboxLen']])))
#count every individual bounding box value that lies outside the range 0 to 1
print("Number of out of bounds bbox values: " + str(labelsPD[labelsPD['bboxLen'] > 0]['bbox'].apply(
    lambda x: sum(not (0 <= jj <= 1) for ii in x for jj in ii)).sum()))

litterLabels = [0,1,2,3]
#every label of every image has to be one of litterLabels; checking only that some
#valid label occurs in the array would let invalid labels go unnoticed
print("Data labels should be either empty or 0, 1, 2, or 3. Is this true: " + 
      str(labelsPD[labelsPD['labelsLen'] > 0]['label'].apply(lambda x: all(ii in litterLabels for ii in x)).all()))

# Save Data

### Please determine where to save the data and make an images and labels folder

### Inside the images and labels folders make a train, test, and validation folder (the code below uses the dataset split names as the folder names)

## Remove Non-Plastic Waste and Save Labels to File

In [None]:
#location labels should be saved to
#raw string so the backslashes of the Windows path are never read as escape sequences
txtDirectory = r"D:\school\graduate school\capstone\data\labels2"

#split between train, test and validation
for split, data in dataset.items():
    #change directory to the train, test, or validation folder inside the labels folder
    os.chdir(os.path.join(txtDirectory, split))
    #read the litter column once per split instead of indexing data['litter']
    #again for every single value that is written
    #loop through each labels row, one file per row
    for ii, litter in enumerate(data['litter']):
        #one line per object in the label array: the object identifier (a number
        #between 0 and 2) followed by the four parts of its bounding box
        #label 3 is non-plastic waste and gets no line
        labelLines = [
            ' '.join([str(label)] + [str(bbox[ee]) for ee in range(4)]) + '\n'
            for label, bbox in zip(litter['label'], litter['bbox'])
            if label != 3
        ]
        #file name is determined by index, index 0 will become 0.txt
        #the with block closes the file even if writing fails
        with open(str(ii) + ".txt", 'w') as f:
            f.writelines(labelLines)

## Save images to png

In [None]:
#folder that contains the train, test, and validation image folders
#raw string so the backslashes of the Windows path are never read as escape sequences
imagesRoot = r"D:\school\graduate school\capstone\data\images"

#split between train, test and validation
for split, data in dataset.items():
    #location images should be saved to
    imageDirectory = os.path.join(imagesRoot, split)

    #change directory to images file
    os.chdir(imageDirectory)
    #go through the split row by row so only one image has to be held at a time
    for ii, example in enumerate(data):
        #for each image save the image as a png, image index 0 will be 0.png
        example['image'].save(f'{ii}.png')

# Make Test Images Into Video

In [None]:
#change directory to where the video should be saved
%cd C:\Users\grpoo\capstone\PlasticDetection
#folder where test split of images data is saved
imageFolder = 'F:/capstoneData/images/test'
#name of video
videoName = 'testDataset.mp4'
#array of image names in image folder
images = [img for img in os.listdir(image_folder) if img.endswith(".png")]
#height of the video
height = 640
#width of the video
width = 640

#set up video at 30 frames per second
video = cv2.VideoWriter(video_name, 0, 30, (width,height))

#add each image in image file to video
for image in images:
    #adds an image to video
    #first opens the image with imread
    #second resizes image to width and height, otherwise will skip images with diffrent widths and heights
    #third adds the image to video
    video.write(cv2.resize(cv2.imread(os.path.join(image_folder, image)), (width, height)))

#save and close video
cv2.destroyAllWindows()
video.release()

# YOLO Model Training and Testing

In [None]:
#Imports
from ultralytics import YOLO
from memory_profiler import memory_usage

# YOLOv8

## YOLOv8 Small

### Train (Warning this may take a long time and use a lot of RAM)

In [None]:
# Start from the pretrained YOLOv8 small model
# (on the first run this will download the yolov8s.pt file)
model = YOLO('yolov8s.pt')

# Running train will create a new file called runs.
# When training completes the best model will be saved as best.pt.
# Training results including the best model are saved in the \runs\detect\ folder
# under the value inputted in name; the models are in that folder under \weights\
results = model.train(
    data='F:/capstoneData/data.yaml',  # data needs a yaml file as input
    epochs=100,
    workers=8,  # number of threads used
    batch=4,  # affects how much RAM training uses; decrease it if an out of memory error occurs
    iou=0.5,
    name='yolo8Small',
    seed=1,
)

### Test

In [None]:
#Load the best weights produced by training
bestWeights = 'C:/Users/grpoo/capstone/PlasticDetection/runs/detect/yolo8Small/weights/best.pt'
model = YOLO(bestWeights)

# Evaluate the model's performance on the test dataset
results = model.val(split='test', iou=0.5)

# Export the model three times: with the default settings, as onnx, and as torchscript
#export files will be in the same weights folder as the inputted best.pt
for exportOptions in ({}, {'format': 'onnx'}, {'format': 'torchscript'}):
    success = model.export(**exportOptions)

### Measure YOLO Model RAM Usage

In [None]:
#Measure YOLO Model RAM Usage

#This will be passed to memory_usage to run the model
def detYolo8Small():
    """Load the trained YOLOv8 small model and run prediction over the test video."""
    model = YOLO('C:/Users/grpoo/capstone/PlasticDetection/runs/detect/yolo8Small/weights/best.pt')
    #make predictions on video of test dataset
    #with stream = True predict hands back a generator; it has to be iterated
    #for the predictions to actually be made while memory is being measured
    for _ in model.predict(source='C:/Users/grpoo/capstone/PlasticDetection/testDataset.mp4', iou = 0.5,
                           verbose = False, stream = True):
        pass
#Measure RAM Usage
mem_usage = memory_usage(detYolo8Small)
#Report results, the max of mem_usage is the max RAM usage
print('Maximum memory usage: ' + str(round(max(mem_usage),2)) + ' MB, Average memory usage: '
      + str(round(sum(mem_usage)/len(mem_usage),2)) + ' MB')

## YOLOv8 Medium

### Train (Warning this may take a long time and use a lot of RAM)

In [None]:
# Start from the pretrained YOLOv8 medium model
# (on the first run this will download the yolov8m.pt file)
model = YOLO('yolov8m.pt')

# Running train will create a new file called runs.
# When training completes the best model will be saved as best.pt.
# Training results including the best model are saved in the \runs\detect\ folder
# under the value inputted in name; the models are in that folder under \weights\
results = model.train(
    data='F:/capstoneData/data.yaml',  # data needs a yaml file as input
    epochs=100,
    workers=8,  # number of threads used
    batch=4,  # affects how much RAM training uses; decrease it if an out of memory error occurs
    iou=0.5,
    name='yolo8Medium',
    seed=1,
)

### Test

In [None]:
#Load the best weights produced by training
bestWeights = 'C:/Users/grpoo/capstone/PlasticDetection/runs/detect/yolo8Medium/weights/best.pt'
model = YOLO(bestWeights)

# Evaluate the model's performance on the test dataset
results = model.val(split='test', iou=0.5)

# Export the model three times: with the default settings, as onnx, and as torchscript
#export files will be in the same weights folder as the inputted best.pt
for exportOptions in ({}, {'format': 'onnx'}, {'format': 'torchscript'}):
    success = model.export(**exportOptions)

### Measure YOLO Model RAM Usage

In [None]:
#Measure YOLO Model RAM Usage

#This will be passed to memory_usage to run the model
def detYolo8Medium():
    """Load the trained YOLOv8 medium model and run prediction over the test video."""
    model = YOLO('C:/Users/grpoo/capstone/PlasticDetection/runs/detect/yolo8Medium/weights/best.pt')
    #make predictions on video of test dataset
    #with stream = True predict hands back a generator; it has to be iterated
    #for the predictions to actually be made while memory is being measured
    for _ in model.predict(source='C:/Users/grpoo/capstone/PlasticDetection/testDataset.mp4', iou = 0.5,
                           verbose = False, stream = True):
        pass
#Measure RAM Usage
mem_usage = memory_usage(detYolo8Medium)
#Report results, the max of mem_usage is the max RAM usage
print('Maximum memory usage: ' + str(round(max(mem_usage),2)) + ' MB, Average memory usage: '
      + str(round(sum(mem_usage)/len(mem_usage),2)) + ' MB')

# YOLOv5

## YOLOv5 Small

### Train (Warning this may take a long time and use a lot of RAM)

In [None]:
# Start from the pretrained YOLOv5 small model
# (on the first run this will download the yolov5su.pt file)
model = YOLO('yolov5su.pt')

# Running train will create a new file called runs.
# When training completes the best model will be saved as best.pt.
# Training results including the best model are saved in the \runs\detect\ folder
# under the value inputted in name; the models are in that folder under \weights\
results = model.train(
    data='F:/capstoneData/data.yaml',  # data needs a yaml file as input
    epochs=100,
    workers=8,  # number of threads used
    batch=4,  # affects how much RAM training uses; decrease it if an out of memory error occurs
    iou=0.5,
    name='yolo5Small',
    seed=1,
)

### Test

In [None]:
#Load the best weights produced by training
bestWeights = 'C:/Users/grpoo/capstone/PlasticDetection/runs/detect/yolo5Small/weights/best.pt'
model = YOLO(bestWeights)

# Evaluate the model's performance on the test dataset
results = model.val(split='test', iou=0.5)

# Export the model three times: with the default settings, as onnx, and as torchscript
#export files will be in the same weights folder as the inputted best.pt
for exportOptions in ({}, {'format': 'onnx'}, {'format': 'torchscript'}):
    success = model.export(**exportOptions)

### Measure YOLO Model RAM Usage

In [None]:
#Measure YOLO Model RAM Usage

#This will be passed to memory_usage to run the model
def detYolo5Small():
    """Load the trained YOLOv5 small model and run prediction over the test video."""
    model = YOLO('C:/Users/grpoo/capstone/PlasticDetection/runs/detect/yolo5Small/weights/best.pt')
    #make predictions on video of test dataset
    #with stream = True predict hands back a generator; it has to be iterated
    #for the predictions to actually be made while memory is being measured
    for _ in model.predict(source='C:/Users/grpoo/capstone/PlasticDetection/testDataset.mp4', iou = 0.5,
                           verbose = False, stream = True):
        pass
#Measure RAM Usage
mem_usage = memory_usage(detYolo5Small)
#Report results, the max of mem_usage is the max RAM usage
print('Maximum memory usage: ' + str(round(max(mem_usage),2)) + ' MB, Average memory usage: '
      + str(round(sum(mem_usage)/len(mem_usage),2)) + ' MB')

## YOLOv5 Medium

### Train (Warning this may take a long time and use a lot of RAM)

In [None]:
# Start from the pretrained YOLOv5 medium model
# (on the first run this will download the yolov5mu.pt file)
model = YOLO('yolov5mu.pt')

# Running train will create a new file called runs.
# When training completes the best model will be saved as best.pt.
# Training results including the best model are saved in the \runs\detect\ folder
# under the value inputted in name; the models are in that folder under \weights\
results = model.train(
    data='F:/capstoneData/data.yaml',  # data needs a yaml file as input
    epochs=100,
    workers=8,  # number of threads used
    batch=4,  # affects how much RAM training uses; decrease it if an out of memory error occurs
    iou=0.5,
    name='yolo5Medium',
    seed=1,
)

### Test

In [None]:
#Load the best weights produced by training
bestWeights = 'C:/Users/grpoo/capstone/PlasticDetection/runs/detect/yolo5Medium/weights/best.pt'
model = YOLO(bestWeights)

# Evaluate the model's performance on the test dataset
results = model.val(split='test', iou=0.5)

# Export the model three times: with the default settings, as onnx, and as torchscript
#export files will be in the same weights folder as the inputted best.pt
for exportOptions in ({}, {'format': 'onnx'}, {'format': 'torchscript'}):
    success = model.export(**exportOptions)

### Measure YOLO Model RAM Usage

In [None]:
#Measure YOLO Model RAM Usage

#This will be passed to memory_usage to run the model
def detYolo5Medium():
    """Load the trained YOLOv5 medium model and run prediction over the test video."""
    model = YOLO('C:/Users/grpoo/capstone/PlasticDetection/runs/detect/yolo5Medium/weights/best.pt')
    #make predictions on video of test dataset
    #with stream = True predict hands back a generator; it has to be iterated
    #for the predictions to actually be made while memory is being measured
    for _ in model.predict(source='C:/Users/grpoo/capstone/PlasticDetection/testDataset.mp4', iou = 0.5,
                           verbose = False, stream = True):
        pass
#Measure RAM Usage
mem_usage = memory_usage(detYolo5Medium)
#Report results, the max of mem_usage is the max RAM usage
print('Maximum memory usage: ' + str(round(max(mem_usage),2)) + ' MB, Average memory usage: '
      + str(round(sum(mem_usage)/len(mem_usage),2)) + ' MB')

# YOLOv7

#### This is setup; once the repository has been downloaded and its requirements installed, this section can be skipped.

In [None]:
# Download YOLOv7 repository and install requirements
# clone the repository into the current directory, then move into the cloned folder
!git clone https://github.com/WongKinYiu/yolov7
%cd yolov7

#YOLOv7 relies on the libraries listed in requirements.txt in the yolov7 repository
#Many of the libraries require a certain version for yolov7 to work.
#I recommend this be done in a separate environment.

#This will install yolov7 requirements.
!pip install -r requirements.txt

### To download the small YOLOv7 model checkpoint visit: https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-tiny.pt

### To download the medium YOLOv7 model checkpoint visit: https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt

### Place the yolov7-tiny.pt and yolov7.pt files in the yolov7 cloned repository

## YOLOv7 Small

### Train (Warning this may take a long time and use a lot of RAM)

In [None]:
#Change this to your yolov7 directory
%cd C:\Users\grpoo\capstone\PlasticDetection\yolov7

#train.py is included with the yolov7 repository.

# weights, cfg, hyp, and data need a file as input.

#workers denote the number of threads used.
#device denotes what hardware training will occur on, 0 is the first installed GPU.
#batch-size will affect how much ram training uses. If an out of memory error occurs, try to decrease batch size.

#when training completes the best model will be saved as best.pt
#training results including the best model is saved in the yolov7 directory at runs\train\ folder under value inputted in --name
#the models are saved in the name folder under \weights\

!python train.py --epochs 100 --workers 8 --device 0 --batch-size 4 \
--data F:\capstoneData\dataYoloV7.yaml --img 640 640 --cfg cfg\training\yolov7-tiny-Plastic.yaml \
--weights 'yolov7-tiny.pt' --name yolov7TinyPlastic --hyp data\hyp.scratch.tiny.yaml

In [None]:
#Resume train if interrupted. If training is completed do not run this.

#Change this to your yolov7 directory
%cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    
#last.py is created and updated automatically during training as the weights of the most recent completed epoch.
#Training will continue from this epoch with the same parameters as initially used.
!python train.py --weights runs\train\yolov7TinyPlastic\weights\last.py --resume

### Test

In [None]:
#Change this to your yolov7 directory
%cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    
#test.py is included with the yolov7 repository. 
#run test.py with --task test on the best.pt weights of the yolov7TinyPlastic training run
!python test.py --task test --data F:\capstoneData\dataYoloV7.yaml --weights runs\train\yolov7TinyPlastic\weights\best.pt --iou 0.5

### Export

In [None]:
#Change this to your yolov7 directory
%cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    
#export.py is included with the yolov7 repository.
#export files will be in the same weights folder as the inputted best.pt
#run export.py on the best.pt weights of the yolov7TinyPlastic training run
!python export.py --weights runs\train\yolov7TinyPlastic\weights\best.pt --iou-thres 0.5 --img-size 640 640 --max-wh 640

### Measure YOLO Model RAM Usage

In [None]:
#This will be passed to memory_usgae to run the model
def detYolov7TinyPlastic():
    #Change this to your yolov7 directory
    %cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    #detect.py is included with the yolov7 repository.
    #make predictions on video of test dataset
    %run detect.py --weights runs\train\yolov7TinyPlastic\weights\best.pt --source C:\Users\grpoo\capstone\PlasticDetection\testDataset.mp4 --iou-thres 0.5
    return
#Measure RAM Usage
mem_usage = memory_usage(detYolov7TinyPlastic)
#Report results, the max of mem_usage is the max RAM usage
print('Maximum memory usage: ' + str(round(max(mem_usage),2)) + ' MB, Average memory usage: '
      + str(round(sum(mem_usage)/len(mem_usage),2)) + ' MB')

## YOLOv7 Medium

### Train (Warning this may take a long time and use a lot of RAM)

In [None]:
#Change this to your yolov7 directory
%cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    
#train.py is included with the yolov7 repository.

# weights, cfg, hyp, and data need a file as input.

#workers denote the number of threads used.
#device denotes what hardware training will occur on, 0 is the first installed GPU.
#batch-size will affect how much ram training uses. If an out of memory error occurs, try to decrease batch size.

#when training completes the best model will be saved as best.pt
#training results including the best model is saved in the yolov7 directory at runs\train\ folder under value inputted in --name
#the models are saved in the name folder under \weights\

!python train.py --epochs 100 --workers 8 --device 0 --batch-size 4 \
--data F:\capstoneData\dataYoloV7.yaml --img 640 640 --cfg cfg\training\yolov7-Plastic.yaml \
--weights 'yolov7.pt' --name yolov7Plastic --hyp data\hyp.scratch.p5.yaml

In [None]:
#Resume train if interrupted. If training is completed do not run this.

#Change this to your yolov7 directory
%cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    
#last.py is created and updated automatically during training as the weights of the most recent completed epoch.
#Training will continue from this epoch with the same parameters as initially used.
!python train.py --weights runs\train\yolov7Plastic\weights\last.py --resume

### Test

In [None]:
#Change this to your yolov7 directory
%cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    
#test.py is included with the yolov7 repository.
#run test.py with --task test on the best.pt weights of the yolov7Plastic training run
!python test.py --task test --data F:\capstoneData\dataYoloV7.yaml --weights runs\train\yolov7Plastic\weights\best.pt --iou 0.5

### Export

In [None]:
#Change this to your yolov7 directory
%cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    
#export.py is included with the yolov7 repository.
#export files will be in the same weights folder as the inputted best.pt
#run export.py on the best.pt weights of the yolov7Plastic training run
!python export.py --weights runs\train\yolov7Plastic\weights\best.pt --iou-thres 0.5 --img-size 640 640 --max-wh 640

### Measure YOLO Model RAM Usage

In [None]:
#Measure YOLO Model RAM Usage

#This will be passed to memory_usgae to run the model
def detYolov7Plastic():
    #Change this to your yolov7 directory
    %cd C:\Users\grpoo\capstone\PlasticDetection\yolov7
    #detect.py is included with the yolov7 repository.
    #make predictions on video of test dataset
    %run detect.py --weights runs\train\yolov7Plastic\weights\best.pt --source C:\Users\grpoo\capstone\PlasticDetection\testDataset.mp4 --iou-thres 0.5
    return
#Measure RAM Usage
mem_usage = memory_usage(detYolov7Plastic)
#Report results, the max of mem_usage is the max RAM usage
print('Maximum memory usage: ' + str(round(max(mem_usage),2)) + ' MB, Average memory usage: '
      + str(round(sum(mem_usage)/len(mem_usage),2)) + ' MB')