# Data Normalization

# Processing the data and preparing it for use in modelling

In [10]:
import json
import re
import unicodedata
import pandas as pd

def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at *file_path* and return the result.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever object ``json.load`` produces for the file's content.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def normalize_text(text):
    """Return *text* in a canonical, lower-cased form.

    The string is NFKD-normalized, every run of whitespace is collapsed to a
    single space, surrounding whitespace is trimmed, and the result is
    lower-cased.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    single_spaced = re.sub(r'\s+', ' ', decomposed)
    return single_spaced.strip().lower()

def process_paragraphs(paragraphs):
    """Normalize each paragraph and drop the ones that carry no text.

    Args:
        paragraphs: Iterable of paragraph strings; falsy entries (``''``,
            ``None``) are skipped without being normalized.

    Returns:
        A list of normalized paragraphs, in input order. Entries that
        normalize to an empty string (for example whitespace-only lines) are
        dropped as well, so callers never emit rows with empty content.
    """
    normalized = (
        normalize_text(paragraph) for paragraph in paragraphs if paragraph
    )
    # Filter a second time: a whitespace-only paragraph is truthy before
    # normalization but collapses to '' afterwards.
    return [text for text in normalized if text]

def process_images(images):
    """Reduce each image record to its source and normalized alt text.

    Args:
        images: Iterable of dict-like image records; the ``'src'`` and
            ``'alt'`` keys are read and both are optional.

    Returns:
        A list of ``{'src': ..., 'alt': ...}`` dicts, one per input record.
        ``'src'`` is passed through unchanged (``''`` when the key is
        missing); ``'alt'`` is normalized with ``normalize_text``.
    """
    return [
        {
            'src': image.get('src', ''),
            # A present-but-null alt (JSON ``null``) comes back as None, which
            # normalize_text cannot handle; treat it like a missing alt.
            'alt': normalize_text(image.get('alt') or ''),
        }
        for image in images
    ]

def process_sections(sections):
    """Flatten section data into one row dict per link and per paragraph.

    For every truthy entry in a section's ``'uls'`` list, each item of its
    ``'links'`` becomes a ``'link'`` row (the link is stored as both ``'url'``
    and ``'content'``), and each processed line of its ``'text'`` becomes a
    ``'paragraph'`` row with an empty ``'url'``. The section's ``'h2'`` value
    is stored under the ``'h1'`` key of every row.

    Args:
        sections: Iterable of dict-like section records.

    Returns:
        A list of row dicts with the keys ``'url'``, ``'h1'``,
        ``'content_type'`` and ``'content'``.
    """
    rows = []
    for section in sections:
        heading = section.get('h2', '')
        # filter(None, ...) skips falsy list entries such as None or {}.
        for ul in filter(None, section.get('uls', [])):
            rows.extend(
                {
                    'url': href,
                    'h1': heading,
                    'content_type': 'link',
                    'content': href
                }
                for href in ul.get('links', [])
            )
            text_lines = ul.get('text', '').split('\n')
            rows.extend(
                {
                    'url': '',
                    'h1': heading,
                    'content_type': 'paragraph',
                    'content': line
                }
                for line in process_paragraphs(text_lines)
            )
    return rows


def process_additional_data(additional_data):
    """Flatten page records into one row dict per paragraph and per image.

    Args:
        additional_data: Iterable of dict-like records; the ``'url'``,
            ``'h1'``, ``'paragraphs'`` and ``'images'`` keys are read and all
            are optional.

    Returns:
        A list of row dicts. For each record, its paragraph rows come first,
        followed by its image rows. Image rows store the image source under
        ``'content'`` and carry an extra ``'alt_text'`` key.
    """
    rows = []
    for entry in additional_data:
        page_url = entry.get('url', '')
        title = entry.get('h1', '')
        texts = process_paragraphs(entry.get('paragraphs', []))
        pictures = process_images(entry.get('images', []))

        rows.extend(
            {
                'url': page_url,
                'h1': title,
                'content_type': 'paragraph',
                'content': text
            }
            for text in texts
        )
        rows.extend(
            {
                'url': page_url,
                'h1': title,
                'content_type': 'image',
                'content': picture['src'],
                'alt_text': picture['alt']
            }
            for picture in pictures
        )
    return rows

def process_data(data):
    """Build the full list of normalized rows from the loaded JSON object.

    Args:
        data: Dict-like object; its ``'sections'`` and ``'additional_data'``
            entries are read and each defaults to an empty list.

    Returns:
        A new list holding the section rows followed by the additional-data
        rows.
    """
    section_rows = process_sections(data.get('sections', []))
    extra_rows = process_additional_data(data.get('additional_data', []))
    return [*section_rows, *extra_rows]

def save_to_csv(data, output_file):
    """Write *data* to *output_file* as UTF-8 CSV without the index column.

    Args:
        data: Row data accepted by ``pd.DataFrame``. When it is falsy,
            nothing is written and a notice is printed instead.
        output_file: Destination passed to ``DataFrame.to_csv``.
    """
    if not data:
        print("No data to save.")
        return

    frame = pd.DataFrame(data)
    frame.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Saved {len(data)} rows to {output_file}")

def main():
    """Run the pipeline: load the scraped JSON, normalize it, write the CSV.

    Reads ``scraped_data.json`` and writes ``normalized_data.csv``, printing
    a progress line after each stage.
    """
    source_path = 'scraped_data.json'
    target_path = 'normalized_data.csv'  # Output file path

    raw = load_json(source_path)
    # Sanity check on the load step: len() of the loaded top-level object.
    print(f"Loaded {len(raw)} items from JSON.")

    rows = process_data(raw)
    # Sanity check on the processing step: number of rows produced.
    print(f"Processed data contains {len(rows)} items.")

    save_to_csv(rows, target_path)

    print("Data processing and normalization completed. Normalized data saved to", target_path)


if __name__ == "__main__":
    main()


Loaded 5 items from JSON.
Processed data contains 4099 items.
Saved 4099 rows to normalized_data.csv
Data processing and normalization completed. Normalized data saved to normalized_data.csv


In [1]:
import pandas as pd
import re

def load_csv(file_path):
    """Read the CSV at *file_path* with pandas' default settings.

    Args:
        file_path: Path handed to ``pd.read_csv``.

    Returns:
        The resulting ``DataFrame``.
    """
    return pd.read_csv(file_path)

def convert_h1_to_question(h1_text):
    """Turn a heading into a question string.

    The heading text is used as-is, with no fixed prefix: every run of
    non-word characters and underscores becomes a single space, the result is
    trimmed, and a ``'?'`` is appended.

    Args:
        h1_text: Heading string.

    Returns:
        The cleaned heading followed by ``'?'``.
    """
    cleaned = re.sub(r'[\W_]+', ' ', h1_text).strip()
    return f"{cleaned}?"

def process_data(csv_file_path):
    """Build a query/context/response training frame from the normalized CSV.

    Rows whose ``'h1'`` or ``'content'`` value is missing are skipped. For
    every remaining row the heading is converted to a question and paired
    with the row's content.

    Args:
        csv_file_path: Path of the CSV to load; it must provide ``'h1'`` and
            ``'content'`` columns.

    Returns:
        A ``DataFrame`` with the columns ``'query'``, ``'context'`` and
        ``'response'``.
    """
    frame = load_csv(csv_file_path)

    pairs = []
    for _, record in frame.iterrows():
        heading, body = record['h1'], record['content']
        if pd.isna(heading) or pd.isna(body):
            continue
        pairs.append((convert_h1_to_question(heading), body))

    queries = [query for query, _ in pairs]
    passages = [passage for _, passage in pairs]

    # Create the DataFrame for training
    return pd.DataFrame({
        'query': queries,
        'context': passages,
        'response': passages  # Using context as response for demonstration
    })

# Path to the normalized CSV produced by the previous cell
csv_file_path = 'normalized_data.csv'
processed_df = process_data(csv_file_path)

# Display the first rows of the processed DataFrame
print(processed_df.head())

# Save processed data to a new CSV file. The same variable feeds both the
# write and the message so the reported location is the one actually written.
output_csv_path = 'processed_data.csv'
processed_df.to_csv(output_csv_path, index=False, encoding='utf-8')
print(f"Processed data saved to '{output_csv_path}'")


              query                                            context  \
0  Getting Started?  https://support.payever.org/hc/en-us/articles/...   
1  Getting Started?  https://support.payever.org/hc/en-us/articles/...   
2  Getting Started?  https://support.payever.org/hc/en-us/articles/...   
3  Getting Started?  https://support.payever.org/hc/en-us/articles/...   
4  Getting Started?  https://support.payever.org/hc/en-us/articles/...   

                                            response  
0  https://support.payever.org/hc/en-us/articles/...  
1  https://support.payever.org/hc/en-us/articles/...  
2  https://support.payever.org/hc/en-us/articles/...  
3  https://support.payever.org/hc/en-us/articles/...  
4  https://support.payever.org/hc/en-us/articles/...  
Processed data saved to '/mnt/data/processed_data.csv'
