In [1]:
import pandas as pd


In [2]:
# Load the section-type reference table from CSV.
# NOTE(review): `section_types_df` is not referenced by any other cell visible
# here; classify_sections below relies on its own hard-coded keyword table.
# Confirm whether this load is still needed.
section_types_df = pd.read_csv("requirements_data/section_types.csv")

# Function to classify sections
def classify_sections(section_outline: str):
    """Split a Markdown-style outline into headings and label each with a section type.

    Every non-blank line of ``section_outline`` is treated as one heading.
    Leading ``#`` markers (and surrounding whitespace, including tabs) are
    removed, as is an optional closing run of ``#`` that is separated from the
    title by whitespace (``## Title ##``). A ``#`` that is part of the title
    itself (``## C#``) is kept. Lines that contain nothing but markers are
    skipped instead of producing an empty heading.

    Args:
        section_outline: Outline text with one heading per line.

    Returns:
        A ``(sections, section_types)`` tuple of two equal-length lists: the
        cleaned heading texts in input order, and the section type assigned to
        the heading at the same position. Heading depth is not preserved.
    """
    # Keywords are matched case-insensitively as substrings of the heading.
    # Dict order matters: the first section type with a matching keyword wins.
    section_keywords = {
        "Lead section": ["Introduction", "Overview", "Summary"],
        "Body sections": ["Background", "Development", "Architecture", "Timeline", "Features", "Collaborations", "Model Structure", "Training Data", "Algorithms", "Techniques"],
        "Infobox": ["Infobox"],
        "References": ["References", "Citations"],
        "See also": ["See also", "Related Articles"],
        "Further reading": ["Further reading"],
        "External links": ["External links"],
        "Categories": ["Categories"],
        "Notes": ["Notes", "Footnotes"]
    }

    def get_heading_text(line):
        # Trim whitespace first so tab-indented headings lose their markers too.
        text = line.strip().lstrip("# ")
        # Only drop trailing '#' when it forms a closing sequence, i.e. it is
        # separated from the title by whitespace; "C#" must stay "C#".
        without_closing = text.rstrip("#")
        if without_closing != text and without_closing[-1:].isspace():
            text = without_closing.rstrip()
        return text

    def get_section_type(section):
        lowered = section.lower()
        for section_type, keywords in section_keywords.items():
            if any(keyword.lower() in lowered for keyword in keywords):
                return section_type
        return "Body sections"  # Default to body sections if no match is found

    # Marker-only lines clean down to "" and are filtered out here.
    sections = [
        text
        for text in (get_heading_text(line) for line in section_outline.splitlines())
        if text
    ]
    section_types = [get_section_type(section) for section in sections]

    return sections, section_types

# Example usage: a sample outline with three heading levels (#, ##, ###).
article_outline = """
# Introduction
## Overview
## Key Features
## Release Date
# Development
## Background
## Research Team
## Funding
## Development Timeline
## Collaborations
# Architecture
## Model Structure
## Training Data
### Data Sources
### Data Preprocessing
## Algorithms and Techniques
### Core Algorithms
### Innovations and Improvements
"""

# Heading depth is discarded: each non-blank line becomes one entry in
# `sections`, and `section_types` holds the assigned type at the same index.
sections, section_types = classify_sections(article_outline)
print("Sections:", sections)
print("Section Types:", section_types)


Sections: ['Introduction', 'Overview', 'Key Features', 'Release Date', 'Development', 'Background', 'Research Team', 'Funding', 'Development Timeline', 'Collaborations', 'Architecture', 'Model Structure', 'Training Data', 'Data Sources', 'Data Preprocessing', 'Algorithms and Techniques', 'Core Algorithms', 'Innovations and Improvements']
Section Types: ['Lead section', 'Lead section', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections', 'Body sections']


In [7]:

# Read the two requirement tables.
rbsdf = pd.read_csv("requirements_data/requirements_by_section_type.csv")
rdf = pd.read_csv("requirements_data/section_level_requirements.csv")

# Match rbsdf.requirement_name against rdf.name (inner join, the merge default),
# discard the duplicated key column, then transpose so the requirement names
# become the columns and the remaining source columns become the rows.
joined_df = (
    rbsdf.merge(rdf, left_on="requirement_name", right_on="name")
    .drop(columns=["name"])
    .set_index("requirement_name")
    .T
)

cols = joined_df.columns
# Positional lookup: takes whatever row ends up last after the transpose.
criteria = joined_df.iloc[-1]


In [8]:
# Display the transposed table: one column per requirement name, one row per
# remaining source column (see the output below).
joined_df

requirement_name,Title format,Consistency,References included,Lead summary,Infobox usage,External links relevance,Further reading selectivity,See also relevance,Category placement
Lead section,1,1,1,1,0,0,0,0,0
Body sections,1,1,1,0,0,0,0,0,0
Infobox,0,1,0,0,1,0,0,0,0
References,0,1,1,0,0,0,0,0,0
See also,0,1,0,0,0,0,0,1,0
Further reading,0,1,0,0,0,0,1,0,0
External links,0,1,0,0,0,1,0,0,0
Categories,0,1,0,0,0,0,0,0,1
Notes,0,1,0,0,0,0,0,0,0
criteria_details,Sentence case for titles and headings,Consistent style within an article,All statements must be supported by reliable s...,Lead must summarize the article concisely,Infobox must be right-aligned and summarize ke...,External links must be relevant and appropriate,Further reading section must be selective,See also links must be directly related to the...,Categories must be placed at the very end of t...


In [9]:
def get_requirements_list(requirement_name, requirements_df=None):
    """Return the column labels flagged with 1 in one row of the requirements table.

    Args:
        requirement_name: Row label to look up. Despite the parameter name, the
            rows of the notebook's ``joined_df`` are section types (for example
            ``"Body sections"``); the name is kept for backward compatibility.
        requirements_df: Table to read from. Defaults to the notebook-level
            ``joined_df`` so existing one-argument calls keep working.

    Returns:
        A list of column labels, in column order, whose value in that row
        equals 1. Cells holding text never compare equal to 1 and are skipped.

    Raises:
        KeyError: If ``requirement_name`` is not a row label of the table.
    """
    if requirements_df is None:
        # Looked up at call time, so the function always sees the current frame.
        requirements_df = joined_df
    # One row lookup instead of chained `.loc[row][col]` per column; iterating
    # the row's own labels avoids depending on a separately captured column list.
    row = requirements_df.loc[requirement_name]
    return [column for column, flag in row.items() if flag == 1]

In [12]:
# Columns of joined_df whose value is 1 in the "Body sections" row.
get_requirements_list("Body sections")

['Title format', 'Consistency', 'References included']