In [4]:
import os
import pandas as pd

# Export every row's HTML into its own file: <output_dir>/page_<n>.html,
# where <n> is the 1-based position of the row in the CSV.

# Configuration
csv_file = 'pages.csv'  # Replace with your CSV file name
html_column = 'page_content'  # Replace with the name of the column containing HTML code
output_dir = 'html_pages'  # Directory to save the HTML files

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load CSV into a DataFrame
df = pd.read_csv(csv_file)

# Walk the HTML column directly instead of df.iterrows(): iterrows() builds a
# Series per row (slow, and it can upcast values to a common dtype), while only
# this one column is needed. enumerate(start=1) gives the page number by
# position, independent of whatever index the DataFrame carries.
for page_number, html_content in enumerate(df[html_column], start=1):
    # Missing cells become an empty file; anything else is coerced to text
    html_content = "" if pd.isna(html_content) else str(html_content)

    # Create the output file name
    output_file = os.path.join(output_dir, f'page_{page_number}.html')

    # newline='' turns off newline translation so the markup is written exactly
    # as stored; without it, Windows text mode rewrites '\n' as '\r\n' (and an
    # existing '\r\n' as '\r\r\n').
    with open(output_file, 'w', encoding='utf-8', newline='') as file:
        file.write(html_content)

print(f"HTML files have been saved to the '{output_dir}' directory.")


HTML files have been saved to the 'html_pages' directory.


In [1]:
import pandas as pd
from bs4 import BeautifulSoup

# Input file that is read into `df` (and later written back to)
csv_file = "pages.csv"  # Point this at your actual file path
df = pd.read_csv(filepath_or_buffer=csv_file)

# Function to extract text from HTML
def extract_text_from_html(html_content):
    """Return the text content of an HTML document as one plain string.

    Parameters
    ----------
    html_content : str, bytes, or missing value
        Raw HTML markup. Missing values (None / NaN / pd.NA) are accepted;
        any other non-string value is converted with ``str()`` before parsing.

    Returns
    -------
    str
        The document's text fragments, each stripped of surrounding whitespace
        and joined with single spaces. An empty string for missing input.
    """
    if pd.isnull(html_content):  # Missing cell: nothing to parse
        return ""
    if not isinstance(html_content, (str, bytes)):
        # e.g. a purely numeric cell parsed by pandas as int/float
        html_content = str(html_content)
    soup = BeautifulSoup(html_content, "html.parser")
    # separator=" " keeps text from adjacent elements apart; with the default
    # empty separator "<p>Hello</p><p>World</p>" collapses into "HelloWorld".
    return soup.get_text(separator=" ", strip=True)

# Run the extractor over every 'page_content' cell; the result lands in
# 'cleaned_text' (the column is created, or replaced if it already exists)
raw_html = df['page_content']
df['cleaned_text'] = raw_html.apply(extract_text_from_html)

# Write the augmented frame over the input file, without the index column
df.to_csv(path_or_buf=csv_file, index=False)

# Confirmation for the notebook output
print("Cleaned text has been saved back to the original CSV file.")


Cleaned text has been saved back to the original CSV file.


In [3]:
import pandas as pd
from bs4 import BeautifulSoup

# Load the original CSV file
# File locations: the CSV to read and the new CSV to write
# (replace either with your own paths)
input_csv, output_csv = "pages.csv", "cleaned_html.csv"

# Read the input file into a DataFrame
df = pd.read_csv(filepath_or_buffer=input_csv)

# Function to extract reviews/blogs from HTML content
def extract_reviews_from_html(html_content, min_length=20):
    """Extract the longer text blocks of an HTML page as one string.

    Headings and page chrome (h1-h3, nav, footer, header, aside) are removed,
    then the text of every innermost ``p`` / ``article`` / ``div`` element is
    collected if it is longer than ``min_length`` characters.

    Parameters
    ----------
    html_content : str, bytes, or missing value
        Raw HTML markup. Missing values (None / NaN / pd.NA) are accepted;
        any other non-string value is converted with ``str()`` before parsing.
    min_length : int, optional
        A block is kept only when its text is strictly longer than this many
        characters. Defaults to 20.

    Returns
    -------
    str
        The kept blocks joined with single spaces, in document order.
        An empty string for missing input or when no block qualifies.
    """
    if pd.isnull(html_content):  # Missing cell: nothing to parse
        return ""
    if not isinstance(html_content, (str, bytes)):
        # e.g. a purely numeric cell parsed by pandas as int/float
        html_content = str(html_content)

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove unwanted tags like headings, navigation, and other noise
    for tag in soup(["h1", "h2", "h3", "nav", "footer", "header", "aside"]):
        tag.decompose()  # Drops the tag together with everything inside it

    content_tags = ["p", "article", "div"]
    reviews = []
    for tag in soup.find_all(content_tags):
        # find_all() yields containers AND the blocks nested inside them, so
        # taking get_text() of each would repeat the same paragraph once per
        # enclosing div/article. Only innermost blocks are collected.
        # NOTE(review): loose text placed directly in a container next to
        # nested blocks is not collected - confirm that is acceptable.
        if tag.find(content_tags) is not None:
            continue
        # separator=" " keeps words from adjacent inline elements apart
        text = tag.get_text(separator=" ", strip=True)
        if len(text) > min_length:  # Skip short or irrelevant text
            reviews.append(text)

    return " ".join(reviews)

# Build the 'cleaned_text' column by passing each 'page_content' cell
# through the extractor
df['cleaned_text'] = df['page_content'].apply(extract_reviews_from_html)

# Keep only the columns that belong in the output file
columns_to_save = ['Title', 'Link', 'Description', 'cleaned_text']
df_cleaned = df.loc[:, columns_to_save]

# Write the selection to the new CSV, without the index column
df_cleaned.to_csv(path_or_buf=output_csv, index=False)

# Confirmation for the notebook output
print(f"Cleaned data has been saved to '{output_csv}' with title, link, description, and cleaned text.")

Cleaned data has been saved to 'cleaned_html.csv' with title, link, description, and cleaned text.
