In [3]:
import pandas as pd
import re

def parse_url(url:str):
    """Return the last two path segments of a document URL.

    Surrounding whitespace and trailing slashes are removed before splitting,
    so ``".../v7.5/release-7.5.0/"`` yields the same result as
    ``".../v7.5/release-7.5.0"`` instead of an empty final segment.

    Args:
        url: The URL (or slash-separated path) to split.

    Returns:
        A ``(main_version, release_note_version)`` tuple holding the
        second-to-last and the last ``/``-separated segment respectively.

    Raises:
        IndexError: If the URL has fewer than two ``/``-separated segments.
    """
    # A trailing "/" would otherwise shift every segment by one position.
    url_components = url.strip().rstrip("/").split("/")
    main_version = url_components[-2]
    release_note_version = url_components[-1]
    return main_version, release_note_version

# Load the exported document list and scrub tab characters out of every source URI.
source_urls_df = pd.read_csv("output.csv")
tab_free_uris = source_urls_df['source_uri'].str.replace(r'\t', '', regex=True)
source_urls_df['source_uri'] = tab_free_uris

# Expand each URI into its (main_version, release_note_version) pair, one column per element.
version_columns = ["main_version", "release_note_version"]
parsed_versions = source_urls_df["source_uri"].apply(lambda uri: pd.Series(parse_url(uri)))
source_urls_df[version_columns] = parsed_versions

In [4]:
# Order rows by release note first, then by main version, and persist the result.
sort_columns = ["release_note_version", "main_version"]
source_urls_df = source_urls_df.sort_values(by=sort_columns)
source_urls_df.to_csv("documents.csv", index=False)

In [5]:
def _version_sort_key(version):
    """Build a natural-order sort key for a version string.

    Digit runs are compared as integers and everything else as text, so
    "v10.0" ranks above "v9.0" (a plain string sort would rank it below).
    Even positions of the key are always text and odd positions always
    integers, which keeps any two keys comparable.
    """
    chunks = re.split(r"(\d+)", str(version))
    return [int(chunk) if position % 2 else chunk for position, chunk in enumerate(chunks)]


# For every release note keep the row with the highest main_version as the
# "latest" one and collect all remaining rows as "other" versions.
grouped_source_urls_df = source_urls_df.groupby("release_note_version")

latest_version_rows = []
other_versions_rows = []

for release_note_version, group in grouped_source_urls_df:
    # Sort positions (not labels) by natural version order, highest first.
    versions = group["main_version"].tolist()
    newest_first = sorted(
        range(len(versions)),
        key=lambda position: _version_sort_key(versions[position]),
        reverse=True,
    )
    group_sorted = group.iloc[newest_first]
    latest_version_rows.append(group_sorted.iloc[0])
    if len(group_sorted) > 1:
        other_versions_rows.append(group_sorted.iloc[1:])

# Fall back to an empty frame that still carries the source columns, so later
# cells can select columns such as "id" and the CSV files keep their header.
empty_versions_df = pd.DataFrame(columns=source_urls_df.columns)
latest_versions_df = pd.DataFrame(latest_version_rows) if latest_version_rows else empty_versions_df.copy()
other_versions_df = pd.concat(other_versions_rows, ignore_index=True) if other_versions_rows else empty_versions_df.copy()
latest_versions_df.to_csv("document_latest_versions.csv", index=False)
other_versions_df.to_csv("document_other_versions.csv", index=False)

In [None]:
# Build batched clean-up statements for the superseded document versions:
#   delete from relationships_60001 where document_id in (?,?,?)
#   delete from chunks_60001 where document_id in (?,?,?)

batch_size = 100

# Missing ids are dropped and the rest coerced to int, so the generated SQL
# never contains "nan" or float renderings such as "123.0". A frame without an
# "id" column (the empty fallback when nothing was superseded) yields no ids.
if "id" in other_versions_df.columns:
    document_ids = [int(document_id) for document_id in other_versions_df["id"].dropna()]
else:
    document_ids = []

delete_relationships_sql = []
delete_chunks_sql = []

for i in range(0, len(document_ids), batch_size):
    batch = document_ids[i:i + batch_size]
    ids_list = ",".join(map(str, batch))
    delete_relationships_sql.append(f"delete from relationships_60001 where document_id in ({ids_list});")
    delete_chunks_sql.append(f"delete from chunks_60001 where document_id in ({ids_list});")

# Preview the first few statements of each kind as the cell output.
delete_relationships_sql[:5], delete_chunks_sql[:5]

In [None]:
# Emit every relationship delete statement on its own line.
if delete_relationships_sql:
    print(*delete_relationships_sql, sep="\n")

# total relationships count = 42607, cleaned 34968

In [None]:
# Emit every chunk delete statement on its own line.
if delete_chunks_sql:
    print(*delete_chunks_sql, sep="\n")

# total chunks count = 25638, cleaned 21005

In [None]:
"""
SELECT 
    e.id AS entity_id,
    e.name AS entity_name,
    COUNT(r.entity_id) AS relationship_count
FROM 
    entities_60001 e
LEFT JOIN (
    SELECT source_entity_id AS entity_id FROM relationships_60001
    UNION ALL
    SELECT target_entity_id AS entity_id FROM relationships_60001
) r ON e.id = r.entity_id
GROUP BY e.id, e.name
having relationship_count = 0
ORDER BY relationship_count DESC;
"""

"""
DELETE FROM entities_60001
WHERE id IN (
    SELECT e.id
    FROM entities_60001 e
    LEFT JOIN (
        SELECT source_entity_id AS entity_id FROM relationships_60001
        UNION ALL
        SELECT target_entity_id AS entity_id FROM relationships_60001
    ) r ON e.id = r.entity_id
    GROUP BY e.id
    HAVING COUNT(r.entity_id) = 0
);
"""

# Total entities count = 35824, cleaned 28785