In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup


In [None]:
def get_tf_matches(html_file):
    """Map each directly annotated TF name to the first motif logo listed for it.

    Reads the first ``<table>`` in ``html_file``, treats the first table row as
    the header, and uses the ``Direct_annot`` and ``Logo`` columns. A
    ``Direct_annot`` cell may hold several comma-separated names; every name
    gets its own row before grouping.

    Parameters
    ----------
    html_file : str or path-like
        Path to a UTF-8 encoded HTML file containing the results table.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Direct_annot`` (one row per name) with a single ``Logo``
        column holding the image ``src`` of the first row, in table order,
        that mentions that name and has a logo.

    Raises
    ------
    ValueError
        If the file contains no ``<table>`` element or the table has no cells.
    KeyError
        If the table has no ``Direct_annot`` or ``Logo`` column.
    """
    with open(html_file, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Only the first table in the document is used.
    table = soup.find("table")
    if table is None:
        raise ValueError(f"No <table> element found in {html_file!r}")

    # One list per table row: image cells contribute their `src`, all other
    # cells contribute their stripped text.
    data = []
    for row in table.find_all("tr"):
        row_data = []
        for cell in row.find_all(["td", "th"]):  # header and data cells alike
            img_tag = cell.find("img")
            if img_tag:
                row_data.append(img_tag.get("src"))
            else:
                row_data.append(cell.get_text(strip=True))
        data.append(row_data)
    if not any(data):
        raise ValueError(f"The first <table> in {html_file!r} has no cells")

    df = pd.DataFrame(data)
    df = df.set_index(df.columns[0])  # first column only labels the rows
    df.columns = df.iloc[0]  # first table row carries the column names
    df = df.iloc[1:]

    pairs = df[["Direct_annot", "Logo"]]
    # Empty cells arrive as "" and short rows are padded with None. Treat both
    # as missing and drop them BEFORE splitting: the split helper cannot
    # iterate over a missing value, and "" would otherwise become a TF name.
    pairs = pairs.mask(pairs == "").dropna(subset=["Direct_annot", "Logo"])

    # Split "A, B" into ["A", "B"], discarding blank items such as the one a
    # trailing comma would leave behind.
    names = pairs["Direct_annot"].str.split(",").apply(
        lambda parts: [part.strip() for part in parts if part.strip()]
    )
    # assign() builds a new frame, so no SettingWithCopyWarning is raised.
    pairs = pairs.assign(Direct_annot=names).explode("Direct_annot", ignore_index=True)

    # first() keeps the first non-null logo per name; rows whose name list was
    # empty explode to NaN keys, which groupby drops.
    tf_matches = pairs.groupby("Direct_annot").first()

    return tf_matches

In [None]:
# Parse both result pages and label each logo column with the file it came from.
tf_matches_ctx = get_tf_matches("/Users/bogdan/Downloads/ctx_results.html").set_axis(
    ["Logo_ctx"], axis=1
)
tf_matches_dem = get_tf_matches("/Users/bogdan/Downloads/dem_results.html").set_axis(
    ["Logo_dem"], axis=1
)

In [None]:
# Put the two logo tables side by side on their shared index, then keep the
# ctx logo where one exists and fall back to the dem logo otherwise.
tf_matches = pd.concat([tf_matches_ctx, tf_matches_dem], axis=1)
tf_matches["Logo"] = tf_matches["Logo_ctx"].where(
    tf_matches["Logo_ctx"].notna(), tf_matches["Logo_dem"]
)
tf_matches

In [None]:
# Load the top-TF table; the first CSV column becomes the row index.
top_tfs = pd.read_csv("/Users/bogdan/Downloads/top_meiotic_tfs.csv", index_col=0)

In [None]:
# Display the table as loaded, before any logos are attached.
top_tfs

In [None]:
# Keep the first 20 rows and attach the logo whose tf_matches index equals the
# row index; the left join keeps rows that have no logo.
top_tfs = top_tfs.head(20).merge(
    tf_matches["Logo"],
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# Display the table again, now with the merged Logo column.
top_tfs

In [None]:
# Write the table with logos to CSV, keeping the row index as the first column.
top_tfs.to_csv("/Users/bogdan/ovelle/output/garcia_ATAC/tfs_logos.csv", index = True)

In [None]:
# Load the top-regulon table; the first CSV column becomes the row index.
top_regulons = pd.read_csv("/Users/bogdan/Downloads/top_meiotic_regulons.csv", index_col=0)

In [None]:
# The TF column is the part of each index label before the first underscore.
top_regulons["TF"] = top_regulons.index.str.split("_").str.get(0)
top_regulons

In [None]:
# Keep the first 20 rows and look up each row's logo by its TF column against
# the tf_matches index; the left join keeps rows that have no logo.
top_regulons = top_regulons.head(20).merge(
    tf_matches["Logo"],
    how="left",
    left_on="TF",
    right_index=True,
)
top_regulons

In [None]:
# Write the regulon table with logos to CSV, keeping the row index as the first column.
top_regulons.to_csv("/Users/bogdan/ovelle/output/garcia_ATAC/regulons_tfs_logos.csv", index = True)