In [227]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from langchain.schema.runnable import RunnablePassthrough
import os
load_dotenv()

# LangSmith tracing configuration: every chain run in this notebook is traced
# to the "Books Classifications" project on the hosted LangSmith endpoint.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "Books Classifications",
    }
)

# Deterministic (temperature 0) OpenAI chat model.
# NOTE(review): the chain assembled later in this notebook instantiates
# ChatGoogleGenerativeAI directly, so `model` looks unused in the visible
# cells — confirm whether it is still needed.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

In [257]:
# The fifteen SWEBOK category names offered to the model as labels, rendered
# one per line (no trailing newline) for interpolation into the prompt.
SWEBOK_categories = "\n".join(
    (
        "Software Requirements",
        "Software Design",
        "Software Construction",
        "Software Testing",
        "Software Maintenance",
        "Software Configuration Management",
        "Software Engineering Management",
        "Software Engineering Process",
        "Software Engineering Models and Methods",
        "Software Quality",
        "Software Engineering Professional Practice",
        "Software Engineering Economics",
        "Computing Foundations",
        "Mathematical Foundations",
        "Engineering Foundations",
    )
)

# Single user-turn instruction. `{SWEBOK_categories}` is pre-filled below via
# `.partial(...)`; `{Titles_and_Sections}` is supplied per chunk at invoke time.
_USER_INSTRUCTIONS = """Your objective is to categorize chapters titles and sections into SWEBOK categories, identified by #### delimiters. Chapters titles and sections are demarcated by triple backticks.

Ensure to adhere to the following instructions:
- Thoroughly analyze each chapter titles and section in conjunction with SWEBOK categories before assigning a label, as accuracy is crucial for my career.
- Avoid generating any additional text, as it may have adverse effects.
- Provide your output in CSV format:
Example format:
There Will Be Code,Software Construction
Bad Code,Software Quality

> SWEBOK Categories: ####\n{SWEBOK_categories} ####

-----------

> Titles and Sections: ```\n{Titles_and_Sections} ```
"""

# Build the chat prompt and bind the category list once, so callers only
# have to provide the titles.
prompt = ChatPromptTemplate.from_messages(
    [("user", _USER_INSTRUCTIONS)]
).partial(SWEBOK_categories=SWEBOK_categories)

# prompt -> Gemini (temperature 0) -> plain string output.
_labelling_llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0)
chain = prompt | _labelling_llm | StrOutputParser()

In [265]:
import pandas as pd

def data(data_dir="data", column="chapter_section_titles"):
    """Collect chapter/section titles from every Excel workbook in a directory.

    Args:
        data_dir: Directory scanned for ``.xlsx`` files. Defaults to ``"data"``.
        column: Name of the worksheet column that holds the titles.
            Defaults to ``"chapter_section_titles"``.

    Returns:
        A flat list of title strings, gathered workbook by workbook in sorted
        filename order. Empty cells are dropped and non-string cells are
        converted with ``str`` so the result can be joined into prompt text.
        An empty list is returned (after printing a notice) when the
        directory is missing or contains no workbooks.
    """
    Titles_and_Sections = []

    try:
        # Sorted so the order of titles does not depend on filesystem order.
        entries = sorted(os.listdir(data_dir))
    except FileNotFoundError:
        entries = []

    # Only read real workbooks: other file types in the directory would make
    # pd.read_excel fail, and "~$..." files are the owner/lock files Excel
    # creates while a workbook is open.
    workbooks = [
        name
        for name in entries
        if name.endswith(".xlsx") and not name.startswith("~$")
    ]

    if not workbooks:
        print("No Excel file found in Data Directory!")
        return Titles_and_Sections

    for xlsx in workbooks:
        df = pd.read_excel(os.path.join(data_dir, xlsx))

        if column in df.columns:
            # Blank cells are read as NaN (a float); drop them and coerce the
            # rest to str so a later "\n".join(...) cannot raise TypeError.
            titles = df[column].dropna().astype(str)
            Titles_and_Sections.extend(titles.tolist())
        else:
            # Report the column that was actually looked for.
            print(f"No column found with name '{column}' in {xlsx} file.")

    return Titles_and_Sections
    
def chunking(items, chunk_size=30):
    """Group titles into prompt-sized batches.

    Args:
        items: Sequence of title strings.
        chunk_size: Maximum number of titles per batch. Defaults to 30, the
            value previously hard-coded here.

    Returns:
        A list of dicts, one per batch, each of the form
        ``{"Titles_and_Sections": "<title>\\n<title>..."}`` — the input
        mapping expected by the prompt template. An empty ``items`` gives an
        empty list.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    return [
        {"Titles_and_Sections": "\n".join(items[start:start + chunk_size])}
        for start in range(0, len(items), chunk_size)
    ]

# Pipeline: raw list of titles -> list of per-chunk prompt inputs -> the
# labelling chain applied to every chunk (`chain.map()` runs it over a list).
title_and_sections = RunnablePassthrough() | chunking | chain.map()

# Run the pipeline on the titles loaded from disk, then stitch the per-chunk
# replies into one newline-separated block of text.
_chunk_replies = title_and_sections.invoke(data())
output = "\n".join(_chunk_replies)

No column found with name 'Titles_and_Sections' in swebok_area.xlsx file.


In [269]:
# Show the newline-joined text returned by the labelling chain for all chunks.
print(output)

There Will Be Code,Software Construction
Bad Code,Software Quality
The Total Cost of Owning a Mess,Software Maintenance
The Grand Redesign in the Sky,Software Maintenance
Attitude,Software Engineering Professional Practice
The Primal Conundrum,Software Engineering Professional Practice
The Art of Clean Code?,Software Engineering Professional Practice
What Is Clean Code?,Software Engineering Professional Practice
Schools of Thought,Software Engineering Professional Practice
We Are Authors,Software Engineering Professional Practice
The Boy Scout Rule,Software Engineering Professional Practice
Prequel and Principles,Software Engineering Professional Practice
Conclusion,Software Engineering Professional Practice
Introduction,Software Design
Use Intention-Revealing Names,Software Design
Avoid Disinformation,Software Design
Make Meaningful Distinctions,Software Design
Use Pronounceable Names,Software Design
Use Searchable Names,Software Design
Avoid Encodings,Software Design
Hungarian Notati

In [273]:
import pandas as pd
import io
import csv

# Labelled rows to export: one "title,category" pair per line, no header line.
# NOTE(review): this literal replaces whatever `output` held from the pipeline
# cell — confirm that exporting only this excerpt is intentional.
output = """40. Circles and Arrows,Software Engineering Process
8 Pragmatic Projects,Software Engineering Management
41. Pragmatic Teams,Software Engineering Management
42. Ubiquitous Automation,Software Engineering Process
43. Ruthless Testing,Software Testing
44. It's All Writing,Software Engineering Professional Practice
45. Great Expectations,Software Engineering Management
46. Pride and Prejudice,Software Engineering Professional Practice"""

# Parse into (title, category) pairs. The parsed rows are kept in `rows`
# rather than `data`, so the `data()` loader defined earlier in the notebook
# is not overwritten and the pipeline cell can still be re-run.
rows = []
for fields in csv.reader(io.StringIO(output)):
    if len(fields) < 2:
        # Blank or malformed line: nothing to label.
        continue
    # The category is always the last field; a title that itself contains
    # commas is re-joined instead of spilling into extra columns.
    *title_parts, category = fields
    rows.append((",".join(title_parts).strip(), category.strip()))

# Every parsed line is a data row (there is no header to skip), so the first
# row must not be dropped.
df = pd.DataFrame(rows, columns=["Titles_and_Sections", "SWEBOK_categories"])

xlsx_filename = "label_data.xlsx"

with pd.ExcelWriter(xlsx_filename, engine='xlsxwriter') as writer:
    df.to_excel(writer, index=False, sheet_name='Sheet1')

    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']

    # Size each column to its longest cell (or its header, whichever is
    # longer) plus a little padding. Starting from the header length keeps
    # this well-defined even when `df` has no rows.
    for i, col in enumerate(df.columns):
        max_len = max([len(col), *map(len, df[col].astype(str))])
        worksheet.set_column(i, i, max_len + 2)

print(f"Excel file '{xlsx_filename}' created successfully.")


Excel file 'label_data.xlsx' created successfully.
