# V. K. Kapahi 

In [None]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re

def preprocess_image(image_path):
    """Load an image and prepare it for OCR.

    The file at ``image_path`` is opened, converted to grayscale (mode
    ``"L"``), passed through the ``SHARPEN`` filter, and finally has its
    contrast scaled by a factor of 2.

    Returns the processed PIL image.
    """
    grayscale = Image.open(image_path).convert("L")
    sharpened = grayscale.filter(ImageFilter.SHARPEN)
    # A factor of 2 doubles the contrast relative to the sharpened image.
    return ImageEnhance.Contrast(sharpened).enhance(2)

def ocr_tables_to_dataframe(image_paths):
    """OCR each image and collect the table rows that match the row regex.

    Every image is preprocessed, run through Tesseract with ``--psm 6``,
    and its text is scanned line by line. A line that matches yields one
    record with the keys ``Source``, ``ID``, ``Redshift`` and
    ``Angular_Size``; non-matching lines are ignored. The raw OCR text and
    every processed line are printed for debugging.

    Returns a ``pandas.DataFrame`` with one row per parsed line (empty when
    nothing matched or ``image_paths`` is empty).
    """
    # Row layout: source, two numeric columns (ignored), G/Q, redshift
    # (optionally parenthesised), angular size (optionally preceded by "<").
    row_pattern = re.compile(
        r"([\w+\-]+)\s+[\d\.]+\s+[\d\.]+\s+(G|Q)\s+([\d\.\(\)]+)\s+<?\s?([\d\.]+)"
    )
    strip_parens = str.maketrans("", "", "()")
    strip_angles = str.maketrans("", "", "<>")

    rows = []
    for image_path in image_paths:
        prepared = preprocess_image(image_path)
        ocr_text = pytesseract.image_to_string(prepared, config="--psm 6")

        # Debug output: the raw OCR result for this image.
        print(f"OCR Text from {image_path}:")
        print(ocr_text)

        for line in ocr_text.splitlines():
            print(f"Processing line: {line}")  # Debug: line currently examined
            match = row_pattern.match(line)
            if match is None:
                continue

            source, id_type, redshift_text, size_text = match.groups()
            try:
                record = {
                    "Source": source,
                    "ID": id_type,  # G for Galaxy, Q for Quasar
                    "Redshift": float(redshift_text.translate(strip_parens)),
                    "Angular_Size": float(size_text.translate(strip_angles)),
                }
            except (ValueError, AttributeError):
                # e.g. a numeric field such as "1.2.3" that float() rejects
                print(f"Skipping line due to error: {line}")
                continue
            rows.append(record)

    return pd.DataFrame(rows)


# Paths to the image files that will be OCR'd.
# Only uncommented entries are processed; here that is image2.png alone.
image_paths = [
        # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image1.png",
        "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image2.png",
        # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image3.png",
        # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image4.png",
        # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image5.png",
        # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image6.png",
        # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image7.png",
        # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image8.png",
]

# Create the DataFrame by OCR-ing and parsing the listed images
dataframe = ocr_tables_to_dataframe(image_paths)

# Print the whole DataFrame (no head() call, so every parsed row is shown)
print(dataframe)


In [None]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd


def preprocess_image(image_path):
    """Load an image and prepare it for OCR.

    The file at ``image_path`` is opened, converted to grayscale (mode
    ``"L"``), passed through the ``SHARPEN`` filter, and finally has its
    contrast scaled by a factor of 2.

    Returns the processed PIL image.
    """
    grayscale = Image.open(image_path).convert("L")
    sharpened = grayscale.filter(ImageFilter.SHARPEN)
    # A factor of 2 doubles the contrast relative to the sharpened image.
    return ImageEnhance.Contrast(sharpened).enhance(2)

def parse_fixed_length_columns(line):
    """Parse one OCR text line using fixed character positions.

    Column layout (0-based, end-exclusive):

    * ``line[0:10]``  -- source name
    * ``line[30:31]`` -- one-character ID (G/Q)
    * ``line[40:47]`` -- redshift; stray semicolons are removed
    * ``line[48:]``   -- angular size; ``<`` and ``>`` markers are removed

    Args:
        line: A single line of OCR output.

    Returns:
        A dict with the keys ``Source``, ``ID``, ``Redshift`` and
        ``Angular_Size`` (the last two as floats), or ``None`` when either
        numeric field cannot be converted to a float.
    """
    # Slicing a str never raises, so the text fields are taken outside the
    # try block; short lines simply give empty strings.
    source = line[0:10].strip()
    id_type = line[30:31].strip()  # Assuming ID is always one character (G/Q)

    # Pre-bind the numeric fields so the failure message below can always be
    # printed. Previously they were unbound whenever float() failed, and the
    # except branch itself crashed with UnboundLocalError.
    redshift = None
    angular_size = None
    try:
        redshift = float(line[40:47].strip().replace(";", ""))  # Clean any semicolons or spaces
        angular_size = float(line[48:].strip().replace("<", "").replace(">", ""))
    except ValueError:
        print( "failed", source, "    :   ", id_type,  "    :   ", redshift, "    :   ",angular_size)
        return None

    print( "succeded", source, "    :   ", id_type,  "    :   ", redshift, "    :   ",angular_size)
    return {"Source": source, "ID": id_type, "Redshift": redshift, "Angular_Size": angular_size}

def ocr_tables_to_dataframe(image_paths):
    """OCR each image and parse its lines with fixed-width column slicing.

    Every image is preprocessed, run through Tesseract with ``--psm 6``,
    and each resulting text line is handed to
    ``parse_fixed_length_columns``. Lines for which the parser returns a
    falsy value are dropped. The raw OCR text and every processed line are
    printed for debugging.

    Returns a ``pandas.DataFrame`` built from the parsed rows (empty when
    nothing parsed or ``image_paths`` is empty).
    """
    rows = []

    for image_path in image_paths:
        prepared = preprocess_image(image_path)
        ocr_text = pytesseract.image_to_string(prepared, config="--psm 6")

        # Debug output: the raw OCR result for this image.
        print(f"OCR Text from {image_path}:")
        print(ocr_text)

        for line in ocr_text.splitlines():
            print(f"Processing line: {line}")  # Debug: line currently examined
            parsed_row = parse_fixed_length_columns(line)
            if not parsed_row:
                continue
            rows.append(parsed_row)

    return pd.DataFrame(rows)

# Paths to the image files that will be OCR'd.
# NOTE: every entry is currently commented out, so the list is empty and the
# call below returns an empty DataFrame without running OCR.
image_paths = [
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image1.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image2.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image3.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image4.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image5.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image6.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image7.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image8.png"
]

# Create the DataFrame by OCR-ing and parsing the listed images
dataframe = ocr_tables_to_dataframe(image_paths)

# Display the first few rows of the DataFrame (head() defaults to five rows)
print(dataframe.head())


In [None]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re

def preprocess_image(image_path):
    """Load an image and prepare it for OCR.

    The file at ``image_path`` is opened, converted to grayscale (mode
    ``"L"``), passed through the ``SHARPEN`` filter, and finally has its
    contrast scaled by a factor of 2.

    Returns the processed PIL image.
    """
    grayscale = Image.open(image_path).convert("L")
    sharpened = grayscale.filter(ImageFilter.SHARPEN)
    # A factor of 2 doubles the contrast relative to the sharpened image.
    return ImageEnhance.Contrast(sharpened).enhance(2)

def parse_line_with_flexibility(line):
    """Parse one OCR text line, tolerating missing numeric columns.

    Expected layout, separated by whitespace::

        <source> [<number>] [<number>] <G|Q> <redshift> <angular size>

    * ``source`` is 4-10 non-space characters at the start of the line.
    * The two numeric columns before the ID are optional and are ignored.
    * ``redshift`` is a decimal number, optionally wrapped in parentheses.
    * ``angular size`` is an integer or decimal, optionally preceded by a
      ``<`` or ``>`` limit marker, which is dropped.

    Args:
        line: A single line of OCR output.

    Returns:
        A dict with the keys ``Source``, ``ID``, ``Redshift`` and
        ``Angular_Size`` (the last two as floats), or ``None`` when the line
        does not match the layout.
    """
    # Each optional column carries its own trailing whitespace inside the
    # optional group. With the separators outside the groups, a line that
    # lacked the columns still needed three whitespace runs and so never
    # matched single-spaced text. Group numbering (1-6) is unchanged.
    pattern = (
        r"(\S{4,10})\s+"
        r"(?:(\d*\.\d+)\s+)?"
        r"(?:(\d*\.\d+)\s+)?"
        r"(G|Q)\s+"
        r"(\d*\.\d+|\(\d*\.\d+\))\s+"
        # Accept a limit marker and a fractional part; a bare (\d+) silently
        # truncated "254.5" to 254.
        r"([<>]?\s*\d+(?:\.\d+)?)"
    )
    match = re.match(pattern, line)
    if match is None:
        return None  # Line does not follow the expected table layout

    try:
        redshift = float(match.group(5).replace("(", "").replace(")", ""))
        angular_size = float(match.group(6).replace("<", "").replace(">", ""))
    except ValueError:
        # Not expected given the pattern above; kept so a surprising token
        # skips the line instead of aborting the whole OCR run.
        return None

    return {
        "Source": match.group(1),
        "ID": match.group(4),  # G for Galaxy, Q for Quasar
        "Redshift": redshift,
        "Angular_Size": angular_size,
    }

def ocr_tables_to_dataframe(image_paths):
    """OCR each image and parse its lines with the flexible regex parser.

    Every image is preprocessed, run through Tesseract with ``--psm 6``,
    and each resulting text line is handed to
    ``parse_line_with_flexibility``. Lines for which the parser returns a
    falsy value are dropped. The raw OCR text and every processed line are
    printed for debugging.

    Returns a ``pandas.DataFrame`` built from the parsed rows (empty when
    nothing parsed or ``image_paths`` is empty).
    """
    rows = []

    for image_path in image_paths:
        prepared = preprocess_image(image_path)
        ocr_text = pytesseract.image_to_string(prepared, config="--psm 6")

        # Debug output: the raw OCR result for this image.
        print(f"OCR Text from {image_path}:")
        print(ocr_text)

        for line in ocr_text.splitlines():
            print(f"Processing line: {line}")  # Debug: line currently examined
            parsed_row = parse_line_with_flexibility(line)
            if not parsed_row:
                continue
            rows.append(parsed_row)

    return pd.DataFrame(rows)

# Paths to the image files that will be OCR'd.
# NOTE: every entry is currently commented out, so the list is empty and the
# call below returns an empty DataFrame without running OCR.
image_paths = [
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/table3.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image1.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image2.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image3.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image4.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image5.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image6.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image7.png",
    # "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/image8.png"
]

# Create the DataFrame by OCR-ing and parsing the listed images
dataframe = ocr_tables_to_dataframe(image_paths)

# Print the whole DataFrame (no head() call, so every parsed row is shown)
print(dataframe)


In [None]:
import pandas as pd
# Location of the cleaned text table and the column names of interest
file_path = "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/Kapahi.txt"
columns = ["Source", "ID", "Redshift", "Angular_Size"]

# Read every line of the text file up front
with open(file_path, 'r') as file:
    lines = file.readlines()

# Keep only lines that split into exactly six single-space-separated fields;
# the first four fields are used, with limit/uncertainty markers removed.
data = []
for line in lines:
    parts = line.split(" ")
    if len(parts) != 6:
        continue  # Wrong number of columns: not a data row
    record = {
        "Source": parts[0],
        "Redshift": parts[1].replace("(", "").replace(")", ""),
        "Angular_Size": parts[2].replace("<", "").replace(">", ""),
        "ID": parts[3],
    }
    data.append(record)

# Assemble the records into a DataFrame
dataframe = pd.DataFrame(data)

# Select the galaxies (ID == "G") as an independent copy so it can be modified
galaxy_data = dataframe.loc[dataframe["ID"] == "G"].copy()

# Turn the text columns into numbers; unparseable entries become NaN
galaxy_data["Redshift"] = pd.to_numeric(galaxy_data["Redshift"], errors="coerce")
galaxy_data["Angular_Size"] = pd.to_numeric(galaxy_data["Angular_Size"], errors="coerce")
# Keep rows with redshift below 15 (this also drops rows whose redshift is NaN)
galaxy_data = galaxy_data[galaxy_data["Redshift"] < 15]

# Show the resulting galaxy table
print(galaxy_data)


In [None]:
import pandas as pd
# Location of the cleaned text table and the columns the DataFrame exposes.
# `columns` now matches the record keys built below (it previously listed
# "BestAngularSize", which no record ever used) and is passed to the
# DataFrame constructor.
file_path = "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/Kapahi_table3.txt"
columns = ["Source", "Angular_Size", "Best_Angular_Size"]

# Load the cleaned data
with open(file_path, 'r') as file:
    lines = file.readlines()

# Process the lines manually to ensure proper parsing
data = []
for line in lines:
    # Remove the line terminator before splitting so it does not end up
    # inside the last field; the number of fields per line is unaffected.
    parts = line.rstrip("\n").split(" ")
    if len(parts) == 3:  # Ensure the line has the correct number of columns
        data.append({
            "Source": parts[0],
            "Angular_Size": parts[1],
            "Best_Angular_Size": parts[2],
        })

# Create a DataFrame. Passing `columns` guarantees the three columns exist
# even when no line qualified, so the conversions below cannot raise KeyError.
df = pd.DataFrame(data, columns=columns)

# Convert numerical columns to float for further analysis; unparseable
# entries become NaN
df["Angular_Size"] = pd.to_numeric(df["Angular_Size"], errors="coerce")
df["Best_Angular_Size"] = pd.to_numeric(df["Best_Angular_Size"], errors="coerce")

# Display the DataFrame using pandas
print(df)


In [None]:
# Plot the remaining numeric columns of galaxy_data against the Redshift column
galaxy_data.plot(x="Redshift")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# NOTE: no extraction function is defined in this cell; the data is taken from `galaxy_data` below.

# Define a function to bin the data by redshift and calculate medians
def bin_and_calculate_medians(df, bin_size):
    """Return the median ``Angular_Size`` per redshift bin.

    Each row is assigned to the bin whose lower edge is
    ``int(Redshift / bin_size) * bin_size`` (truncation toward zero), and
    the median of ``Angular_Size`` is taken within each bin.

    Args:
        df: DataFrame with numeric ``Redshift`` and ``Angular_Size``
            columns. It is not modified.
        bin_size: Width of a redshift bin; must be positive.

    Returns:
        A Series of medians named ``Angular_Size`` whose index, named
        ``Bin``, holds the lower edge of each bin.

    Raises:
        ValueError: If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be positive, got {bin_size!r}")

    # The bin labels live in a separate Series instead of a new df['Bin']
    # column. That avoids changing the caller's frame as a side effect, and
    # avoids SettingWithCopyWarning when df is a filtered slice.
    # NOTE: because of floating-point division, a value exactly on a bin edge
    # can land in the lower bin (0.3 / 0.1 evaluates just below 3).
    bin_edges = (df['Redshift'] / bin_size).astype(int) * bin_size
    grouped = df.groupby(bin_edges.rename('Bin'))['Angular_Size'].median()
    return grouped

# Define a function to plot the data
def plot_data(grouped_data):
    """Draw the binned medians as a line plot with circle markers.

    ``grouped_data`` supplies the x values through its ``index`` and the
    y values through its ``values``. A new 10x6 figure is created, given
    axis labels, a title, a grid and a legend, and then shown.
    """
    bin_positions = grouped_data.index
    median_sizes = grouped_data.values

    plt.figure(figsize=(10, 6))
    plt.plot(
        bin_positions,
        median_sizes,
        marker='o',
        linestyle='-',
        label='Median Angular Size',
    )

    # Decorations
    plt.title('Median Angular Size vs. Redshift')
    plt.xlabel('Redshift (z)')
    plt.ylabel('Median Angular Size (arcsec)')
    plt.legend()
    plt.grid(True)

    plt.show()

# Main workflow, run as top-level statements (no main() function is defined)

# Step 1: Take the data from galaxy_data, which must already exist.
# `df` is just another name for the same object; no copy is made.
df = galaxy_data

# Step 2: Bin data by redshift (bin width 0.1) and calculate medians
bin_size = 0.1
grouped_data = bin_and_calculate_medians(df, bin_size)

# Step 3: Plot the binned medians
plot_data(grouped_data)



In [None]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
# Load the table3.png image and perform OCR on it.
# The image is passed to Tesseract as opened; preprocess_image() is not used here.
image_path = "/mnt/sda1/Dropbox/AAA_Papers_2022_Folder/AAA_CMB_HU_latest_to_git/KapahiData/table3.png"
image = Image.open(image_path)

# Perform OCR with the same "--psm 6" configuration as the other cells
ocr_text = pytesseract.image_to_string(image, config="--psm 6")

# Bare expression: a notebook displays the OCR string as the cell output;
# when run as a plain script this line has no visible effect.
ocr_text


In [None]:
# Cleaning the DataFrame: remove any rows with missing values or with
# non-positive redshift / angular size.
# NOTE(review): `df_extracted` and `plt` are not defined in this cell; they
# must come from earlier cells. `df_extracted` is not defined in any cell
# visible here -- confirm where it is created.
df_cleaned = df_extracted.dropna()
df_cleaned = df_cleaned[df_cleaned['Redshift'] > 0]  # Ensure positive redshift values
# .copy() makes the filtered result an independent frame, so adding the 'Bin'
# column below does not write into a slice of another frame
# (which pandas reports as SettingWithCopyWarning).
df_cleaned = df_cleaned[df_cleaned['Angular_Size'] > 0].copy()  # Ensure positive angular size values

# Binning the data by redshift: each row gets the lower edge of its bin
bin_size = 0.3
df_cleaned['Bin'] = (df_cleaned['Redshift'] // bin_size) * bin_size

# Calculating the median angular size for each bin
binned_data_cleaned = df_cleaned.groupby('Bin')['Angular_Size'].median()

# Plotting the cleaned and processed data
plt.figure(figsize=(10, 6))
plt.plot(binned_data_cleaned.index, binned_data_cleaned.values, marker='o', linestyle='-', label='Median Angular Size')
plt.xlabel('Redshift (z)')
plt.ylabel('Median Angular Size (arcsec)')
plt.title('Median Angular Size vs. Redshift (Cleaned Data)')
plt.grid(True)
plt.legend()
plt.show()
