In [17]:
from docx import Document

def clean_report(input_path):
    # Load the document
    doc = Document(input_path)
    
    # Define key sections to keep
    key_sections = [
        "Introduction", "Techniques and Strategies", "Data Preprocessing", "Methods for Data Exploration and Visualization",
        "Techniques for Balancing the Dataset", "Models", "List of Models Attempted", "Model Selection and Evaluation",
        "Hyperparameter Tuning", "Libraries", "Comprehensive List of Libraries Employed", "Description of Library Utilization",
        "Combinations and Configurations", "Specific Combinations of Models and Preprocessing Techniques",
        "Different Configurations and Their Impacts on Model Performance", "Experiment Tracking",
        "Experiment Tracking with MLflow", "Challenges and Solutions", "Challenges Faced and Solutions Implemented",
        "Insights and Lessons Learned", "Recommendations", "Practical Insights and Recommendations",
        "References and Resources", "Conclusion"
    ]
    
    # Define sections to merge
    merge_sections = {
        "Data Cleaning and Handling Missing Values": "Data Preprocessing",
        "Feature Engineering": "Data Preprocessing",
        "Scaling": "Data Preprocessing",
        "Basic Analysis": "Methods for Data Exploration and Visualization",
        "Correlation Analysis": "Methods for Data Exploration and Visualization",
        "Skewness and Distribution Analysis": "Methods for Data Exploration and Visualization",
        "Binning": "Methods for Data Exploration and Visualization"
    }
    
    # Track the current section
    current_section = None
    section_text = []
    cleaned_text = ""

    for i, para in enumerate(doc.paragraphs):
        para_text = para.text.strip()
        print(f"Processing paragraph {i}: {para_text[:50]}")  # Debug print
        
        # Check if the paragraph is a heading
        if para.style.name.startswith('Heading') or para_text in key_sections:
            print(f"Found key section: {para_text} at paragraph {i}")  # Debug print
            # If we're in a new section, save the previous section
            if current_section:
                cleaned_text += f"\n\n{current_section}\n" + '\n'.join(section_text)
            # Start the new section
            current_section = para_text
            section_text = []
        elif para.style.name.startswith('Heading') or para_text in merge_sections:
            print(f"Found merge section: {para_text} at paragraph {i}")  # Debug print
            # Merge sections
            if current_section == merge_sections[para_text]:
                section_text.append(f'\n{para_text}:\n')
            else:
                if current_section:
                    cleaned_text += f"\n\n{current_section}\n" + '\n'.join(section_text)
                current_section = merge_sections[para_text]
                section_text = [f'\n{para_text}:\n']
        else:
            section_text.append(para_text)
    
    # Add the last section
    if current_section:
        cleaned_text += f"\n\n{current_section}\n" + '\n'.join(section_text)
    else:
        cleaned_text += '\n'.join(section_text)  # Handle the case where the last section has no heading
    
    return cleaned_text

# Define file paths
input_path = "preliminary report for compilation.docx"

# Clean the report and print the cleaned text
cleaned_text = clean_report(input_path)
print(cleaned_text[:5000])  # Display the first 5000 characters to inspect the content


Processing paragraph 0: # Comprehensive Report on Binary Classification Mo
Processing paragraph 1: 
Processing paragraph 2: ## Introduction
Processing paragraph 3: 
Processing paragraph 4: This report provides a comprehensive overview of t
Processing paragraph 5: 
Processing paragraph 6: ## Techniques and Strategies
Processing paragraph 7: 
Processing paragraph 8: ### Data Preprocessing
Processing paragraph 9: 
Processing paragraph 10: **Data Cleaning and Handling Missing Values:**
Processing paragraph 11: - No missing values were detected in the dataset, 
Processing paragraph 12: 
Processing paragraph 13: **Feature Engineering:**
Processing paragraph 14: - **Interaction Features**: Created new features b
Processing paragraph 15: - **Polynomial Features**: Generated polynomial fe
Processing paragraph 16: - **Binning**: Applied binning to continuous featu
Processing paragraph 17: 
Processing paragraph 18: **Scaling:**
Processing paragraph 19: - Standardized numerical features to have a 

In [18]:
# Add this snippet at the end of your script to save the cleaned report

# Define the output path for the cleaned report
output_path = "cleaned_report.docx"

# Create a new document for the cleaned report
cleaned_doc = Document()
cleaned_doc.add_heading("Comprehensive Report on Binary Classification Model for Kaggle Competition", level=1)

# Add the cleaned text to the new document
for section in cleaned_text.split("\n\n"):
    if section.strip():
        heading, *content = section.split("\n", 1)
        cleaned_doc.add_heading(heading.strip(), level=2)
        cleaned_doc.add_paragraph(content[0].strip() if content else "")

# Save the cleaned document
cleaned_doc.save(output_path)

print(f"Cleaned report saved to: {output_path}")


Cleaned report saved to: cleaned_report.docx


In [20]:
from docx import Document

def consolidate_headings(input_path, output_path):
    # Load the cleaned document
    doc = Document(input_path)
    
    # Dictionary to hold consolidated sections
    consolidated_sections = {}

    current_heading = None
    for para in doc.paragraphs:
        para_text = para.text.strip()
        if para.style.name.startswith('Heading'):
            current_heading = para_text
            if current_heading not in consolidated_sections:
                consolidated_sections[current_heading] = []
        else:
            if current_heading:
                consolidated_sections[current_heading].append(para_text)

    # Create a new document for the consolidated report
    consolidated_doc = Document()
    consolidated_doc.add_heading("Comprehensive Report on Binary Classification Model for Kaggle Competition", level=1)

    # Add the consolidated sections to the new document
    for heading, content in consolidated_sections.items():
        consolidated_doc.add_heading(heading, level=2)
        consolidated_doc.add_paragraph('\n'.join([text for text in content if text]))

    # Save the consolidated document
    consolidated_doc.save(output_path)

# Define file paths
cleaned_input_path = "cleaned_report.docx"
consolidated_output_path = "consolidated_report.docx"

# Consolidate the headings in the cleaned report
consolidate_headings(cleaned_input_path, consolidated_output_path)

consolidated_output_path  # Return the output path to download the consolidated report


'consolidated_report.docx'

In [21]:
from docx import Document

def compile_consolidated_report(input_path, output_path):
    # Load the consolidated document
    doc = Document(input_path)
    
    # Dictionary to hold consolidated sections
    consolidated_sections = {}

    current_heading = None
    for para in doc.paragraphs:
        para_text = para.text.strip()
        if para.style.name.startswith('Heading'):
            current_heading = para_text
            if current_heading not in consolidated_sections:
                consolidated_sections[current_heading] = []
        else:
            if current_heading:
                consolidated_sections[current_heading].append(para_text)
    
    # Create a new document for the final compiled report
    compiled_doc = Document()
    compiled_doc.add_heading("Comprehensive Report on Binary Classification Model for Kaggle Competition", level=1)

    # Add the consolidated sections to the new document
    for heading, content in consolidated_sections.items():
        compiled_doc.add_heading(heading, level=2)
        cleaned_content = [text for text in content if text.strip()]
        compiled_doc.add_paragraph('\n'.join(cleaned_content))

    # Save the compiled document
    compiled_doc.save(output_path)

# Define file paths
consolidated_input_path = "consolidated_report.docx"
compiled_output_path = "compiled_report.docx"

# Compile the consolidated report
compile_consolidated_report(consolidated_input_path, compiled_output_path)

compiled_output_path  # Return the output path to download the compiled report


'compiled_report.docx'

In [26]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paulo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [33]:
import nltk
from docx import Document
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import re

# Download NLTK punkt data
nltk.download('punkt')

def clean_duplicate_5grams_and_remove_code(docx_filename):
    doc = Document(docx_filename)
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]  # Get non-empty paragraphs

    # Function to tokenize text into words
    def tokenize_text(text):
        # Remove non-alphanumeric characters and lowercase
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        return word_tokenize(text.lower())

    # Create a dictionary to store paragraphs by their 5-gram sets
    paragraph_dict = defaultdict(list)
    for paragraph in paragraphs:
        words = tokenize_text(paragraph)
        five_grams = list(ngrams(words, 5))  # Generate 5-grams
        key = tuple(five_grams)  # Use tuple of 5-grams as dictionary key
        paragraph_dict[key].append(paragraph)

    # Filter out paragraphs with duplicate 5-grams, keep the first occurrence
    unique_paragraphs = []
    seen_keys = set()
    for key, paragraphs in zip(paragraph_dict.keys(), paragraph_dict.values()):
        if key not in seen_keys:
            seen_keys.add(key)
            unique_paragraphs.append(paragraphs[0])  # Keep the first occurrence

    # Create a new document with cleaned paragraphs
    cleaned_doc = Document()
    for paragraph in unique_paragraphs:
        cleaned_doc.add_paragraph(paragraph)

    # Save the cleaned document
    cleaned_filename = f"cleaned_{docx_filename}"
    cleaned_doc.save(cleaned_filename)
    print(f"Cleaned file saved as: {cleaned_filename}")

    # Remove Python code blocks (between ''' or ```)
    clean_doc_with_code_removed = Document()
    for paragraph in cleaned_doc.paragraphs:
        clean_text = re.sub(r"(['`]{3})(.*?)\1", "", paragraph.text, flags=re.DOTALL)  # Remove triple backticks or single quotes
        clean_text = re.sub(r'```[\s\S]*?```', '', clean_text)  # Remove triple backticks block
        clean_text = re.sub(r"'''[\s\S]*?'''", '', clean_text)  # Remove single quotes block
        clean_doc_with_code_removed.add_paragraph(clean_text.strip())

    # Save the final cleaned document without code blocks
    final_cleaned_filename = f"final_cleaned_{docx_filename}"
    clean_doc_with_code_removed.save(final_cleaned_filename)
    print(f"Final cleaned file saved as: {final_cleaned_filename}")

# Example usage:
if __name__ == "__main__":
    docx_file = "compiled_report.docx"  # Replace with your DOCX file path
    clean_duplicate_5grams_and_remove_code(docx_file)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paulo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cleaned file saved as: cleaned_compiled_report.docx
Final cleaned file saved as: final_cleaned_compiled_report.docx
