In [None]:
%pip install pymilvus pyspark pymilvus[model] torch sentence-transformers
%pip install --upgrade pip
%pip install torch torchvision torchaudio
%pip install neo4j


In [None]:
# Inicia sesión de spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name, regexp_replace, explode, col, lit

spark = SparkSession.builder \
    .appName("Milvus-PySpark") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

In [None]:
# Lectura del fichero

df = spark.read.option("multiline","true")\
    .json("/home/luis-manuel/projects/db-generation/all_conferences_data/all_data/CrawlerData/ExtendedCrawlerData")\
    .withColumn("Conference", regexp_replace(input_file_name(), ".*/|_extended_data\\.json", ""))
df.show()

In [None]:
df.printSchema()

In [None]:
#  Procesamiento de los datos leídos
years = [field.name for field in df.schema.fields]
flattened_dfs = []
for year in years:
    if year == "Conference":
        continue

    exploded_df = df.select(
            "Conference",
            explode(col(year)).alias("paper")
        ).select(
        col("paper.Abstract").alias("Abstract"),
        col("paper.Authors and Institutions").alias("Authors and Institutions"),
        col("paper.Citations S2").alias("Citations S2"),
        col("paper.DOI Number").alias("DOI Number"),
        col("paper.OpenAlex Link").alias("OpenAlex Link"),
        col("paper.OpenAlex Referenced Works").alias("OpenAlex Referenced Works"),
        col("paper.S2 Paper ID").alias("S2 Paper ID"),
        col("paper.TLDR").alias("TLDR"),
        col("paper.Title").alias("Title"),
        "Conference",
        lit(year).alias("Year")  # Ensure Year is preserved
    )
    
    flattened_dfs.append(exploded_df)

final_df = flattened_dfs[0]
for df_part in flattened_dfs[1:]:
    final_df = final_df.union(df_part)

abstracts = final_df.where(col("Abstract").isNotNull()).collect()

In [1]:
import polars as pl
from pathlib import Path
import re

path = Path("../data/CrawlerData/ExtendedCrawlerData")
json_files = list(path.glob("*.json"))

flattened_dfs = []

for file in json_files:    
    # Extract conference name from filename
    conference = re.sub(r".*/|_extended_data\.json", "", str(file))

    # Read the file (multiline JSON is supported in Polars 0.19+)
    df = pl.read_json(file, infer_schema_length=100000)

    # Loop over each column (assumed to be a year)
    for year in df.columns:
        # if year == "Conference":
        #     continue

        # Each column is a list of dicts (papers)
        df_year = (
            pl.DataFrame({
                "paper": df[year].explode()
            })
            .unnest("paper")
            .with_columns([
                pl.lit(conference).alias("Conference"),
                pl.lit(year).alias("Year")
            ])
        )

        # Optional: select/reorder only the required columns
        selected_cols = [
            "Abstract",
            "Authors and Institutions",
            "Citations S2",
            # "DOI Number",
            # "OpenAlex Link",
            # "OpenAlex Referenced Works",
            "S2 Paper ID",
            "TLDR",
            "Title",
            "Conference",
            "Year"
        ]
        df_year_reduced = df_year.select([col for col in selected_cols if col in df_year.columns])
        
        # for col in df_year_reduced.columns:
        #     dtype = df_year_reduced.schema[col]
        #     if dtype == pl.Null:
        #         df_year_reduced = df_year_reduced.with_columns(pl.col(col).cast(pl.Utf8))        

        flattened_dfs.append(df_year_reduced)

# Concatenate all DataFrames
df_final = pl.concat(flattened_dfs, how="vertical")

# Filter out entries with null Abstracts
df_abstracts = df_final.filter(pl.col("Abstract").is_not_null())

In [2]:
from pymilvus import MilvusClient, model
from sentence_transformers import SentenceTransformer

milvus_client = MilvusClient("vector.db") # Solo para serviddor
collection_name = "papers"
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Embeddings de abstracts
# embedding_dim = len(emb_text(abstracts[0].asDict()["Abstract"]))
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=768,
    primary_field_name="id",
    id_type="int",    
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
    auto_id=True
)

In [5]:
import torch
# embedding_fn = model.DefaultEmbeddingFunction()
def get_device_map() -> str:
    return 'cuda' if torch.cuda.is_available() else 'cpu'

device = get_device_map() 

model = SentenceTransformer(model_name_or_path="Muennighoff/SGPT-125M-weightedmean-nli-bitfit", device=device )
def emb_text(text):
    # response = embedding_fn.encode_documents(text)
    embedding = model.encode([text], normalize_embeddings=True)
    return embedding[0].tolist()


In [6]:
# Alimentación de datos
from tqdm import tqdm

abstracts = df_abstracts.to_dicts()

data = []
for i, row in enumerate(tqdm(abstracts, desc="Creating embeddings")):
    data.append({
        "S2 Paper ID": row["S2 Paper ID"], 
        "vector": emb_text(row["Abstract"]), 
        "Title": row["Title"],
        "Abstract": row["Abstract"], 
        "TLDR": row["TLDR"],
        "Year": row["Year"],
        "Authors and Institutions": row["Authors and Institutions"],
        "Conference": row["Conference"]        
    })

Creating embeddings: 100%|██████████| 5993/5993 [00:57<00:00, 104.00it/s]


In [7]:
milvus_client.insert(collection_name=collection_name, data=data)

{'insert_count': 5993, 'ids': [457509055112413184, 457509055112413185, 457509055112413186, 457509055112413187, 457509055112413188, 457509055112413189, 457509055112413190, 457509055112413191, 457509055112413192, 457509055112413193, 457509055112413194, 457509055112413195, 457509055112413196, 457509055112413197, 457509055112413198, 457509055112413199, 457509055112413200, 457509055112413201, 457509055112413202, 457509055112413203, 457509055112413204, 457509055112413205, 457509055112413206, 457509055112413207, 457509055112413208, 457509055112413209, 457509055112413210, 457509055112413211, 457509055112413212, 457509055112413213, 457509055112413214, 457509055112413215, 457509055112413216, 457509055112413217, 457509055112413218, 457509055112413219, 457509055112413220, 457509055112413221, 457509055112413222, 457509055112413223, 457509055112413224, 457509055112413225, 457509055112413226, 457509055112413227, 457509055112413228, 457509055112413229, 457509055112413230, 457509055112413231, 457509055

In [7]:
def search_milvus(search_text):
    search_res = milvus_client.search(
        collection_name=collection_name,
        data=[
            emb_text(search_text)
        ],  
        limit=10,  # Return top 3 results
        search_params={"metric_type": "COSINE", "params": {}},  # Inner product distance
        output_fields=["Abstract"],  # Return the text field
    )
    return search_res[0]

# res = milvus_client.search(
#     collection_name=collection_name,
#     data=[emb_text("Cloud computing")],
#     limit=2,
#     output_fields=["Abstract"],
#     anns_field="vector"
# )

In [9]:
search_milvus("serverless stateful computing")

[{'id': 457389739558572127,
  'distance': 0.6507527828216553,
  'entity': {'Abstract': 'Serverless computing defines a pay-as-you-go cloud execution model, where the unit of computation is a function that a cloud provider executes and auto-scales on behalf of a cloud consumer. Serverless suggests not (or less) caring about servers but focusing (more) on business logic expressed in functions. Server’less’ may be ‘more’ when getting developer expectations and platform propositions right and when engineering solutions that take specific behavior and constraints of (current) Function-as-a-Service platforms into account. To this end, in this invited paper, we present a summary of findings and lessons learned from a series of research experiments conducted over the past two years. We argue that careful attention must be placed on the promises associated with the serverless model, provide a reality-check for five common assumptions, and suggest ways to mitigate unwanted effects. Our findings 

In [None]:
from neo4j import GraphDatabase

# Detalles de conexión
URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "password"
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD), database="merged")

# Función para ejecutar queries en Cypher
def run_write_query(query, params={}):
    with driver.session() as session:
        session.execute_write(query, params)

        # result = session.execute_write(query, params)
        # return [record for record in result]

def community_search(tx, params={}):
    result_stream = tx.run("""
    CALL gds.graph.project(
        'citationsGraph',
        'Paper',
        'CITES'
    )
    """)
    print(result_stream.single())

    result_community = tx.run("""
    CALL gds.louvain.stream('citationsGraph')
    YIELD nodeId, communityId
    RETURN gds.util.asNode(nodeId).title AS paperTitle, communityId
    ORDER BY communityId;
    """)
    print(result_community.single())

    result_write = tx.run("""
    CALL gds.louvain.write('citationsGraph', { writeProperty: 'communityId' });
    """)
    print(result_write.single())

    tx.run("""CALL gds.graph.drop('citationsGraph')""")

def connected_components_search(tx, params={}):


    result_stream = tx.run("""
    CALL gds.graph.project(
        'citationsGraph',
        'Paper',
        'CITES'
    )
    """)
    print(result_stream.single())

    # result_community = tx.run("""
    # CALL gds.wcc.stream('connectedComponentId')
    # YIELD nodeId, communityId
    # RETURN gds.util.asNode(nodeId).title AS title, communityId
    # ORDER BY communityId;
    # """)
    # print(result_community.single())

    result_write = tx.run("""
    CALL gds.wcc.write('citationsGraph', { writeProperty: 'connectedComponentId' });
    """)
    print(result_write.single())

    tx.run("""CALL gds.graph.drop('citationsGraph')""")      


In [None]:
# run_write_query(community_search)
run_write_query(connected_components_search)

In [None]:
import hashlib
import uuid

# The UUID string to hash
hex_str = "d4c3b0ae5b0c11e69420ab73f5d84808"

# Convert the UUID string to bytes and hash it with SHA-1
formatted_uuid = str(uuid.UUID(hex_str))

print(formatted_uuid)  # Output: 2b8ad4c3b0ae5b0c11e69420ab73f5d84808749b (example)


In [None]:
len("d4c3b0ae5b0c11e69420ab73f5d84808")

In [None]:
from neo4j import GraphDatabase
URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "password"

# Mergeo de las bases de datos de grafos
source_databases = ["ieeecloud", "ccgrid", "europar", "eurosys", "ic2e", "icdcs", "middleware", "nsdi", "sigcomm", "socc"]
target_database = "complete"

# Extracción de datos de la base de datos indicada
def extract_data(db_name):
    driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD), database=db_name)
    
    def fetch_nodes(tx):
        return tx.run("MATCH (n) RETURN n as properties, labels(n) as labels").data()

    def fetch_relationships(tx):
        # return tx.run("MmodelATCH (a)-[r]->(b) RETURN d(a) AS start, type(r) AS type, id(b) AS end, properties(r) AS properties").data()
        return tx.run("MATCH p=(a)-[r]->(b) RETURN a as source, labels(a) as source_labels, type(r) as relationship, b as target, labels(b) as target_labels").data()

    with driver.session() as session:
        nodes = session.execute_read(fetch_nodes)
        relationships = session.execute_read(fetch_relationships)

    driver.close()
    return nodes, relationships

# Extracción de datos general
def extract_data_multiple(db_names):
    nodes_complete = []
    relationships_complete = []
    for source in tqdm(db_names, "Reading databases"):
        nodes, relationships = extract_data(source)
        nodes_complete.extend(nodes)
        relationships_complete.extend(relationships)
    return nodes_complete, relationships_complete

In [None]:
import json
from tqdm import tqdm

def insert_data(target_db, nodes, relationships):
    driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD), database=target_db)
    
    def create_nodes(tx, nodes):
        errors = []
        for node in tqdm(nodes, "Inserting nodes"):
            labels = ":".join(node['labels'])
            props = ", ".join(f"{k}: {json.dumps(v)}" for k, v in node['properties'].items())
            query = f"MERGE (n:{labels} {{{props}}})"
            try:
                tx.run(query)
            except Exception as e:
                errors.append({
                    "node": node,
                    "error": e
                })

        return errors

    def create_relationships(tx, relationships):
        errors = []
        for rel in tqdm(relationships, "Inserting realtionships"):
            if rel["source_labels"][0] == "Paper" and rel["target_labels"][0] == "Paper" and rel["relationship"] == "CITES":  
                query = (
                    f"MATCH (a:Paper), (b:Paper) WHERE a.title = '{rel['source']["title"].replace("'", "\\'")}' AND b.title = '{rel['target']["title"].replace("'", "\\'")}' "
                    f"MERGE (a)-[r:{rel['relationship']}]->(b)"
                )

            elif rel["source_labels"][0] == "Paper" and rel["target_labels"][0] == "Institution" and rel["relationship"] == "HAS_INSTITUTION":
                query = (
                    f"MATCH (a:Paper), (b:Institution) WHERE a.title = '{rel['source']["title"].replace("'", "\\'")}' AND b.name = '{rel['target']["name"].replace("'", "\\'")}' "
                    f"MERGE (a)-[r:{rel['relationship']}]->(b)"
                )

            elif rel["source_labels"][0] == "Institution" and rel["target_labels"][0] == "Country" and rel["relationship"] == "LOCATED_IN":
                query = (
                    f"MATCH (a:Institution), (b:Country) WHERE a.name = '{rel['source']["name"].replace("'", "\\'")}' AND b.name = '{rel['target']["name"].replace("'", "\\'")}'"
                    f"MERGE (a)-[r:{rel['relationship']}]->(b)"
                )

            try:
                tx.run(query)
            except Exception as e:
                print(e)
                errors.append({
                    "relationship": rel,
                    "error": e
                })
        
        return errors

    with driver.session() as session:
        node_errors = session.execute_write(create_nodes, nodes)
        relationship_errors = session.execute_write(create_relationships, relationships)

    driver.close()

    return node_errors, relationship_errors

In [None]:
nodes, relationships = extract_data_multiple(source_databases[4:5])

In [None]:
node_errors, relationship_errors = insert_data(target_database, [], relationships[12915:])


#  CALL apoc.export.csv.query( "MATCH (n:Paper) RETURN labels(n) as labels, properties(n) as properties", "eurosys_nodes.csv", {} );
# CALL apoc.export.csv.query( "MATCH (a)-[r:CITES]->(b)  RETURN properties(a).title AS start, properties(b).title AS end, TYPE(r) AS type", "eurosys_cites.csv", {} );
# CALL apoc.export.csv.query( "MATCH (n:Paper) RETURN n.PredominantContinent, n.title, n.PredominantCountry, n.year, n.Authors, n.conference", "eurosys_papers.csv", {} )

# LOAD CSV WITH HEADERS FROM 'file:///eurosys_papers.csv' AS row
# MERGE p=(a:Paper {PredominantContinent: row[0], title: row[1], PredominantCountry: row[2], year: row[3], Authors: row[4], conference: row[5] })
# RETURN p

# LOAD CSV WITH HEADERS FROM 'file:///eurosys_cites.csv' AS row
# MATCH (a:Paper {title: row.start})
# MATCH (b:Paper {title: row.end})
# CALL apoc.create.relationship(a, "CITES", {}, b)
# YIELD rel
# RETURN rel


In [None]:
try:
    insert_data(target_database, nodes[1:100], relationships[1:50])
except:
    print("aqui")

In [None]:
relationships[25641]

In [None]:
relationships_change = relationships[28743]
relationships_change["target"]["title"] = relationships_change["target"]["title"].replace("''", "'")
relationships[28743]

In [None]:
relationships[28743]

In [None]:
insert_data(target_database, [], [relationships[28743]])

In [8]:
search_milvus("serverless stateful computing")

[{'id': 457409336716300383,
  'distance': 0.6507527828216553,
  'entity': {'Abstract': 'Serverless computing defines a pay-as-you-go cloud execution model, where the unit of computation is a function that a cloud provider executes and auto-scales on behalf of a cloud consumer. Serverless suggests not (or less) caring about servers but focusing (more) on business logic expressed in functions. Server’less’ may be ‘more’ when getting developer expectations and platform propositions right and when engineering solutions that take specific behavior and constraints of (current) Function-as-a-Service platforms into account. To this end, in this invited paper, we present a summary of findings and lessons learned from a series of research experiments conducted over the past two years. We argue that careful attention must be placed on the promises associated with the serverless model, provide a reality-check for five common assumptions, and suggest ways to mitigate unwanted effects. Our findings 

In [12]:
results = search_milvus("serverless stateful computing")
abstract = results[0]["entity"]["Abstract"]

from keybert import KeyBERT 
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/allenai-specter")

kw_model = KeyBERT(model=model)

print(abstract)

keywords = kw_model.extract_keywords(
    abstract,
    top_n=10,
    use_maxsum=True,  # optional: improves diversity
    nr_candidates=20    
)

Shielded execution based on Intel SGX provides strong security guarantees for legacy applications running on untrusted platforms. However, memory safety attacks such as Heartbleed can render the confidentiality and integrity properties of shielded execution completely ineffective. To prevent these attacks, the state-of-the-art memory-safety approaches can be used in the context of shielded execution. In this work, we first showcase that two prominent software- and hardware-based defenses, AddressSanitizer and Intel MPX respectively, are impractical for shielded execution due to high performance and memory overheads. This motivated our design of SGXBounds---an efficient memory-safety approach for shielded execution exploiting the architectural features of Intel SGX. Our design is based on a simple combination of tagged pointers and compact memory layout. We implemented SGXBounds based on the LLVM compiler framework targeting unmodified multithreaded applications. Our evaluation using Ph

In [38]:
keywords = kw_model.extract_keywords(
    results[1]["entity"]["TLDR"],
    top_n=10,
)
keywords

KeyError: 'TLDR'

In [52]:
import torch
torch.cuda.empty_cache()

del kw_model
torch.cuda.empty_cache()


In [42]:
len(model.encode(results[1]["entity"]["Abstract"]))

768