In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from  scipy import stats

In [None]:
# Load the dataset README once and keep its full text in `readme_content`.
# The encoding is pinned to UTF-8 so that any non-ASCII characters decode the
# same way on every platform instead of depending on the locale default.
with open('/content/README.md', 'r', encoding='utf-8') as f:
    readme_content = f.read()

print(readme_content)

# WaRP (Waste Recycling Plant Dataset)

The dataset WaRP (Waste Recycling Plant) includes labeled images of an industrial waste sorting plant. We have selected 28 recyclable waste
categories. Objects in the dataset are divided into the following groups: plastic bottles of 17 categories (class name with the bottle- prefix),
glass bottles of three types (the glass- prefix), card boards of two categories, detergents of four categories, canisters and cans. The -full postfix means that the bottle is filled with air, i.e. not flat.

Examples of instances of each category of the WaRP Dataset are presented in the figure below. 

![Dataset classes](/assets/WaRP-Categories.png)

 
A crucial difference from other datasets is that objects can
overlap, be heavily deformed, or be in poor lighting conditions.
The dataset has three parts: WaRP-D, WaRP-C, and WaRP-S

The first two parts are intended for training and objective quality assessment
of detection (WaRP-D) and classification (WaRP-C) tasks, a

In [None]:
import re
import pandas as pd


def _parse_bibtex_fields(bibtex_text):
    """Return ``(field_name, field_value)`` pairs for a BibTeX entry.

    Every ``name = {value}`` field in ``bibtex_text`` is collected, in order of
    appearance. Braces inside a value are balanced, so a value such as
    ``{The {WaRP} dataset}`` is returned whole rather than cut at the first
    closing brace. The leading ``@article{key,`` part is skipped because it has
    no ``=`` before its brace. Parsing stops at a field whose braces never
    close.
    """
    field_start = re.compile(r"([\w-]+)\s*=\s*\{")
    fields = []
    pos = 0
    while True:
        opener = field_start.search(bibtex_text, pos)
        if opener is None:
            break
        # Walk forward from just after the opening brace until it is closed.
        depth = 1
        cursor = opener.end()
        while cursor < len(bibtex_text) and depth:
            char = bibtex_text[cursor]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            cursor += 1
        if depth:
            # Unbalanced braces: nothing reliable can be read past this point.
            break
        value = bibtex_text[opener.end():cursor - 1]
        fields.append((opener.group(1).strip(), value.strip()))
        pos = cursor
    return fields


citation_content = sections_content.get('Citing WaRP dataset', [])
citation_text = "\n".join(citation_content)

# The previous pattern was anchored on "@article{", which occurs once per
# entry, so only the first field (title) was ever captured. Each field is now
# matched on its own.
matches = _parse_bibtex_fields(citation_text)

citation_data = [{"Field": name, "Value": value} for name, value in matches]

# Explicit columns keep the table schema stable even when no field is found.
citation_df = pd.DataFrame(citation_data, columns=["Field", "Value"])

print("Citation Information Table:")
display(citation_df)

Citation Information Table:


Unnamed: 0,Field,Value
0,title,Hierarchical waste detection with weakly super...


In [None]:
# Show the category-group summary table. `categories_df` is not created in this
# cell, so it has to be present in the kernel namespace already.
display(categories_df)

Unnamed: 0,Category Group,Count
0,plastic bottles,17
1,glass bottles,3
2,card boards,2
3,detergents,4
4,canisters and cans,2


In [None]:
import pandas as pd

# Raw text of the three dataset-part sections. These lists are only looked up
# here; the summary values below are written out by hand, not parsed from them.
warp_d_content, warp_c_content, warp_s_content = (
    sections_content.get(heading, [])
    for heading in ('Warp-D Detection', 'Warp-C Classification', 'Warp-S Segmentation')
)

# One record per dataset part. The parts do not share all keys, so the
# resulting table has empty cells where a key is missing.
dataset_parts_data = [
    # WaRP-D: detection
    {
        "Part": "WaRP-D",
        "Purpose": "Detection",
        "Training Images": "2452",
        "Validation Images": "522",
        "Image Resolution": "1920 x 1080 pixels",
        "Annotation": ".txt annotation with bboxes",
    },
    # WaRP-C: classification
    {
        "Part": "WaRP-C",
        "Purpose": "Classification",
        "Training Images": "8823",
        "Testing Images": "1583",
        "Image Size Range": "40 to 703 pixels wide and 35 to 668 pixels high",
        "Characteristics": "Unbalanced dataset, cut-out image areas from WaRP-D",
    },
    # WaRP-S: segmentation
    {
        "Part": "WaRP-S",
        "Purpose": "Segmentation",
        "Total Images": "112",
        "Image Size Range": "100 × 96 pixels to 412 × 510 pixels",
        "Characteristics": "Each category has 4 images with significantly deformed objects",
    },
]

# Keep the individual records reachable under their own names as well.
warp_d_info, warp_c_info, warp_s_info = dataset_parts_data

# Columns are the union of the record keys, in order of first appearance.
dataset_parts_df = pd.DataFrame(dataset_parts_data)

print("Dataset Parts Summary:")
display(dataset_parts_df)

Dataset Parts Summary:


Unnamed: 0,Part,Purpose,Training Images,Validation Images,Image Resolution,Annotation,Testing Images,Image Size Range,Characteristics,Total Images
0,WaRP-D,Detection,2452.0,522.0,1920 x 1080 pixels,.txt annotation with bboxes,,,,
1,WaRP-C,Classification,8823.0,,,,1583.0,40 to 703 pixels wide and 35 to 668 pixels high,"Unbalanced dataset, cut-out image areas from W...",
2,WaRP-S,Segmentation,,,,,,100 × 96 pixels to 412 × 510 pixels,Each category has 4 images with significantly ...,112.0


In [None]:
import re
import pandas as pd

text = """Examples of instances of each category of the WaRP Dataset are presented in the figure below.

![Dataset classes](/assets/WaRP-Categories.png)
![Dataset parts](/assets/WaRP-Dataset.png)
"""

# Markdown image syntax is ![alt text](image_path):
# group 1 captures the alt text, group 2 the image path.
pattern = r"!\[(.*?)\]\((.*?)\)"

# With two groups in the pattern, findall yields (alt_text, image_path) tuples.
matches = re.findall(pattern, text)

# One table row per image reference.
image_data = [
    {"Alt Text": alt_text, "Image Path": image_path}
    for alt_text, image_path in matches
]

images_df = pd.DataFrame(image_data)

print("Image Information Table:")
display(images_df)

Image Information Table:


Unnamed: 0,Alt Text,Image Path
0,Dataset classes,/assets/WaRP-Categories.png
1,Dataset parts,/assets/WaRP-Dataset.png


In [None]:
# Report the README size in characters (length of the decoded text, not bytes).
print(len(readme_content))

3911


In [None]:
# Alias the README text as `df`.
# NOTE(review): `df` is a plain string here, not a DataFrame, and the name is
# rebound to a DataFrame by the later `df=pd.read_csv("/content/sections.csv")`
# cell — consider a clearer name.
df=readme_content

In [None]:
import re


def _extract_markdown_headings(lines):
    """Return the titles of the ATX headings (``# Title``) found in ``lines``.

    A heading is one to six ``#`` characters followed by whitespace and a
    non-empty title. Only the marker is removed, so a title that itself starts
    with ``#`` (e.g. ``# #1 result``) is kept intact. Lines inside fenced code
    blocks (delimited by ``` or ~~~) are skipped, so shell or Python comments
    in code samples are not mistaken for headings.
    """
    heading_re = re.compile(r"^#{1,6}\s+(.+)$")
    titles = []
    in_fence = False
    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith(("```", "~~~")):
            # A fence line opens or closes a code block; it is never a heading.
            in_fence = not in_fence
            continue
        if in_fence:
            continue
        heading = heading_re.match(line)
        if heading:
            titles.append(heading.group(1))
    return titles


# The encoding is explicit so the README decodes identically on every platform.
with open('/content/README.md', 'r', encoding='utf-8') as f:
    sections = _extract_markdown_headings(f)

print("Extracted sections:")
for section in sections:
    print(section)

Extracted sections:
WaRP (Waste Recycling Plant Dataset)
Structure
Warp-D Detection
Warp-C Classification
Warp-S Segmentation
Citing WaRP dataset


In [None]:
import pandas as pd

# Wrap the extracted section titles in a one-column DataFrame and preview it.
# NOTE(review): the column label 'Titles)' has a stray ')' that looks like a
# typo; it is left unchanged because the later 'Title Length' cell indexes
# sections_df['Titles)'].
sections_df = pd.DataFrame(sections, columns=['Titles)'])
display(sections_df.head())

Unnamed: 0,Titles)
0,WaRP (Waste Recycling Plant Dataset)
1,Structure
2,Warp-D Detection
3,Warp-C Classification
4,Warp-S Segmentation


In [None]:
# Boolean mask: True for every row of sections_df that repeats an earlier row.
duplicate_sections = sections_df.duplicated()

print("Rows with duplicate section titles:")
display(sections_df.loc[duplicate_sections])

# Summing the mask counts its True entries, i.e. the repeated rows.
duplicate_count = duplicate_sections.sum()
print(f"\nNumber of duplicate section titles: {duplicate_count}")

Rows with duplicate section titles:


Unnamed: 0,Titles)



Number of duplicate section titles: 0


In [None]:
# Store the character count of each section title in a new 'Title Length' column.
title_lengths = sections_df['Titles)'].map(len)
sections_df['Title Length'] = title_lengths
display(sections_df)

Unnamed: 0,Titles),Title Length
0,WaRP (Waste Recycling Plant Dataset),36
1,Structure,9
2,Warp-D Detection,16
3,Warp-C Classification,21
4,Warp-S Segmentation,19
5,Citing WaRP dataset,19


In [None]:
# Load the sections CSV into `df`, replacing whatever `df` referred to before.
df=pd.read_csv("/content/sections.csv")

In [None]:
# Remove the 'Title Length' helper column. errors='ignore' turns the drop into
# a no-op when the column is already absent, so re-running this cell (or
# running it before the column was added) no longer raises KeyError.
sections_df = sections_df.drop(columns=['Title Length'], errors='ignore')

# Display the DataFrame after removing the column
display(sections_df)

Unnamed: 0,Titles)
0,WaRP (Waste Recycling Plant Dataset)
1,Structure
2,Warp-D Detection
3,Warp-C Classification
4,Warp-S Segmentation
5,Citing WaRP dataset


In [None]:
# Load data from the CSV file into sections_df.
# NOTE(review): this rebinds sections_df, so the frame built earlier from the
# README headings is no longer reachable under this name. read_csv treats the
# first line of the file as the column header by default — confirm the CSV
# really starts with a header row.
sections_df = pd.read_csv('/content/sections.csv')

# Display the head of the updated sections_df DataFrame
display(sections_df.head())

Unnamed: 0,WaRP(Waste Recycling Plant Dataset)
0,WaRP (Waste Recycling Plant Dataset)
1,Structure
2,Warp-D Detection
3,Warp-C Classification
4,Warp-S Segmentation
