In [107]:
import os
import pandas as pd

from pathlib import Path

In [108]:
# Prepare "nodes"

In [109]:
# Directory holding the processed CSV files read by the cells below
# (relative path, resolved against the notebook's working directory).
path = "../researchgate_hub/processed"

In [110]:
ls $path

all_unique_ids.csv          publications_overview.csv
citations_edges.csv         publications_raw.csv
citations_publications.csv


In [111]:
# CSV file that is passed to clean_urls(); that function overwrites it in place
# when it finds URLs to fix.
FILE_PATH = f"{path}/publications_overview.csv"
URL_BASE = "https://www.researchgate.net/publication/"
PREFIX_TO_REMOVE = "PB:"
# -----------------

def clean_urls(file_path: str) -> None:
    """
    Strip the 'PB:' prefix that follows URL_BASE in the 'url' column of a CSV.

    Reads the CSV, replaces '{URL_BASE}PB:' with '{URL_BASE}' in the 'url'
    column, and overwrites the file. The file is rewritten only when at least
    one URL actually contains '{URL_BASE}PB:'; the reported count is the
    number of rows that were changed.

    Args:
        file_path: Path to a CSV file that has a 'url' column.

    Returns:
        None. Progress and problems are reported with print(); a missing
        file, an unreadable CSV or a missing 'url' column ends the call early
        without raising.
    """
    csv_path = Path(file_path)
    if not csv_path.exists():
        print(f"Error: File not found: {file_path}")
        return

    print(f"Reading data from: {file_path}")

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return

    if 'url' not in df.columns:
        print("Error: Column 'url' not found.")
        return

    # The exact substring that gets replaced; the detection below uses the same
    # literal so that the count and the rewrite can never disagree.
    bad_prefix = f"{URL_BASE}{PREFIX_TO_REMOVE}"

    # Cast to the pandas string dtype for the check only: a column with no
    # values at all is read as float and would not offer the `.str` accessor.
    needs_fix = (
        df['url']
        .astype("string")
        .str.contains(bad_prefix, regex=False, na=False)
        .astype(bool)
    )
    fixed_count = int(needs_fix.sum())

    if fixed_count == 0:
        print("No URLs found requiring cleaning (no 'PB:' prefix detected).")
        return

    # Touch only the matching rows so every other value is written back unchanged.
    df.loc[needs_fix, 'url'] = df.loc[needs_fix, 'url'].str.replace(
        bad_prefix,
        URL_BASE,
        regex=False
    )

    # Overwrite the original file
    df.to_csv(csv_path, index=False, encoding="utf-8")

    print(f"\nSuccess: Cleaned {fixed_count} URLs. File overwritten.")
    # First fixed row, selected through the mask rather than by mixing an index
    # label with positional lookup.
    print(f"Example of fixed URL: {df.loc[needs_fix, 'url'].iloc[0]}")

In [112]:
# Run the URL clean-up on the overview CSV; the file is overwritten only if
# URLs needing the fix are found (see the printed message).
clean_urls(FILE_PATH)

Reading data from: ../researchgate_hub/processed/publications_overview.csv
No URLs found requiring cleaning (no 'PB:' prefix detected).


In [124]:
# Load the three publication tables from the processed-data directory.
# Every row of publications_raw.csv gets raw=True, so after a left join the
# column distinguishes matched rows (True) from unmatched ones (missing).
publications_raw_data = pd.read_csv(f"{path}/publications_raw.csv").assign(raw=True)

publications_overview_data = pd.read_csv(f"{path}/publications_overview.csv")

publications_citations_publications_data = pd.read_csv(f"{path}/citations_publications.csv")

In [129]:
# Collect the distinct, non-null topics recorded for each publication_id.
# `unique()` keeps first-appearance order, so the lists come out the same on
# every run; building them through `set` left the order dependent on string
# hash randomisation and therefore different between kernel restarts.
topics_data = (
    publications_citations_publications_data
    .groupby("publication_id")["topic"]
    .agg(lambda topics: topics.dropna().unique().tolist())
    .reset_index(name="all_topics")
)

topics_data.shape

(3127, 2)

In [138]:
# Attach title, url and authors to each publication's topic list.
# The right-hand table may hold several rows per publication_id, so the row
# count can grow here; duplicates are removed in a later step.
pre_data = pd.merge(
    topics_data,
    publications_citations_publications_data.loc[
        :, ["publication_id", "title", "url", "authors"]
    ],
    how="left",
    on="publication_id",
)

pre_data.shape

(3854, 5)

In [139]:
# Keep a single row (the first encountered) for every publication_id.
pre_data = pre_data.drop_duplicates(subset=["publication_id"], keep="first")

pre_data.shape

(3127, 5)

In [140]:
# Remove the 'url' column from the working table.
pre_data = pre_data.drop(columns=["url"])

In [141]:
# Add url, type and abstract from the overview table; publications without a
# match keep missing values in those columns (left join).
pre_data = pd.merge(
    pre_data,
    publications_overview_data.loc[:, ["publication_id", "url", "type", "abstract"]],
    how="left",
    on="publication_id",
)

pre_data.shape

(3127, 7)

In [142]:
# Add year, citations_count and the raw flag from the raw table. The raw table
# is reduced to its first row per publication_id beforehand so the left join
# cannot add rows.
pre_data = pd.merge(
    pre_data,
    (
        publications_raw_data
        .drop_duplicates(subset=["publication_id"], keep="first")
        .loc[:, ["publication_id", "year", "citations_count", "raw"]]
    ),
    how="left",
    on="publication_id",
)

pre_data.shape

(3127, 10)

In [143]:
# Give the aggregated topic-list column its final name.
pre_data = pre_data.rename(columns={"all_topics": "topics"})

In [144]:
# Show the assembled table with its columns in a fixed order.
pre_data.loc[
    :,
    [
        "publication_id",
        "url",
        "title",
        "type",
        "authors",
        "year",
        "abstract",
        "citations_count",
        "topics",
        "raw",
    ],
]

Unnamed: 0,publication_id,url,title,type,authors,year,abstract,citations_count,topics,raw
0,PB:337020195,https://www.researchgate.net/publication/33702...,An End-to-End Deep RL Framework for Task Arran...,preprint,"['Caihua Shan', 'Nikos Mamoulis', 'Reynold Che...",,"In this paper, we propose a Deep Reinforcement...",,[machine-learning-sociology],
1,PB:341702131,https://www.researchgate.net/publication/34170...,Yapay Zekanın Buluşlarının Patentlenmesi,article,['Armağan Ebru Bozkurt-Yüksel'],,,,[quantitative-analysis-of-sociological-data-ar...,
2,PB:347103466,https://www.researchgate.net/publication/34710...,YAPAY ZEKÂNIN BULUŞLARININ PATENTLENMESİ,article,['Armağan Ebru Bozkurt-Yüksel'],,,,[quantitative-analysis-of-sociological-data-ar...,
3,PB:350696830,https://www.researchgate.net/publication/35069...,Machine Learning Modeling: A New Way to do Qua...,article,[],2021.0,Improvements in big data and machine learning ...,12.0,[machine-learning-quantitative-sociological-re...,True
4,PB:351249153,https://www.researchgate.net/publication/35124...,A Machine Learning Algorithm to Identify Patie...,article,[],2021.0,Background: \nRisk stratification of individua...,28.0,[machine-learning-quantitative-sociological-re...,True
...,...,...,...,...,...,...,...,...,...,...
3122,PB:398223167,https://www.researchgate.net/publication/39822...,Cross-Domain Federated Semantic Communication ...,article,"['Loc X. Nguyen', 'Ji Su Yoon', 'Huy Le', 'Yu ...",,,,[survey-data-analysis-ai],
3123,PB:398227057,https://www.researchgate.net/publication/39822...,Cross-Domain Federated Semantic Communication ...,preprint,"['Loc X. Nguyen', 'Ji Su Yoon', 'Huy Q. Le', '...",,Semantic communication can significantly impro...,,[survey-data-analysis-ai],
3124,PB:398229428,https://www.researchgate.net/publication/39822...,Potential risks of GenAI on medical education,article,"['Jacob Hough', 'Nicholas Culley', 'Chase Erga...",,,,[survey-data-analysis-ai],
3125,PB:398233284,https://www.researchgate.net/publication/39823...,ETHICS in the DIGITAL WORLD,inCollection,['Emre Yildirim'],,,,[text-analysis-discourse-ai],
