In [1]:
import os

from neo4j import GraphDatabase

# Connection settings are taken from the environment so real credentials never
# have to be written into the notebook; the fallbacks are the original
# placeholder values.
uri = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
username = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "Your password")
driver = GraphDatabase.driver(uri, auth=(username, password))

In [None]:
def delete_all(tx):
    """Delete every node in the database together with its relationships.

    Parameters
    ----------
    tx : transaction object exposing ``run(query)``.

    Returns
    -------
    The result object returned by ``tx.run``.
    """
    query = """
    MATCH (n) DETACH DELETE n
    """
    docs = tx.run(query)
    # The statement has no RETURN clause, so a record never has an 'n' field;
    # print the whole record instead of indexing it.
    for doc in docs:
        print(doc)
    return docs

with driver.session() as session:
    docs = session.execute_write(delete_all)

In [2]:
def get_total_relationships(tx):
    """Return the value of ``count(r)`` over all directed relationships."""
    cypher = "MATCH ()-[r]->() RETURN count(r) AS total_relationships"
    record = tx.run(cypher).single()
    return record["total_relationships"]

# Open a session and get the total number of relationships
with driver.session() as session:
    total_relationships = session.execute_read(get_total_relationships)

# Print the total number of relationships
print(f"Total number of relationships: {total_relationships}")

Total number of relationships: 434518


In [None]:

def node_count(tx):
    """Run a ``count(n)`` query, print each returned record and return the result object."""
    statement = """
        MATCH (n)
        RETURN count(n) AS nodeCount;
    """
    outcome = tx.run(statement)
    for record in outcome:
        print(record)
    return outcome

with driver.session() as session:
    docs = session.execute_read(node_count)

In [None]:
# Node data is stored in one JSON file per batch of 1000 functions
# (nodes_<offset>_enhanced.json), loaded here batch by batch.
total_number_of_function = 115000


def create_nodes(tx, group_id):
    """Create ``code_block`` nodes from the JSON file of one batch.

    Parameters
    ----------
    tx : managed transaction supplied by ``session.execute_write``.
    group_id : int
        Batch offset; selects ``file:///nodes_<group_id>_enhanced.json`` and is
        stored on every created node as ``groupid``.

    Returns
    -------
    int
        Number of nodes created, as counted by the query.
    """
    # The file URL and group id are query parameters, so the statement text is
    # the same for every batch.
    query = """
    WITH $url AS url
    CALL apoc.load.json(url) YIELD value AS data
    UNWIND data AS item
    CALL apoc.create.node(['code_block'], {content:item.content,node_type:item.node_type,embedding:item.embedding,embedding_large:item.voyage_content,uuid:item.uuid,groupid:$groupid}) YIELD node
    RETURN count(node) AS created
    """
    record = tx.run(
        query,
        url=f"file:///nodes_{group_id}_enhanced.json",
        groupid=group_id,
    ).single()
    return record["created"]


for j in range(0, total_number_of_function, 1000):
    with driver.session() as session:
        session.execute_write(create_nodes, j)
    print(f"{j} have been processed.")

In [None]:
total_number_of_function = 115000


def create_relationships(tx, group_id):
    """Create the relationships listed in the JSON file of one batch.

    Parameters
    ----------
    tx : managed transaction supplied by ``session.execute_write``.
    group_id : int
        Batch offset; selects ``file:///relations_<group_id>.json`` and restricts
        both endpoints to ``code_block`` nodes with that ``groupid``.

    Returns
    -------
    int
        Number of relationships created, as counted by the query.
    """
    query = """
    CALL apoc.load.json($url) YIELD value

    WITH value AS relationship

    // Match nodes based on UUIDs
    MATCH (a:code_block {groupid: $groupid, uuid: relationship.uuid_from})
    MATCH (b:code_block {groupid: $groupid, uuid: relationship.uuid_to})

    // Create a relationship with a dynamic type
    CALL apoc.create.relationship(a, relationship.relation_type, {}, b) YIELD rel

    RETURN count(rel) AS created
    """
    record = tx.run(
        query,
        url=f"file:///relations_{group_id}.json",
        groupid=group_id,
    ).single()
    return record["created"]


for j in range(0, total_number_of_function, 1000):
    with driver.session() as session:
        session.execute_write(create_relationships, j)
    # Progress is reported after the transaction returns, not inside the
    # transaction function, which the driver may call more than once.
    print(f"doc {j} has been processed.")

In [9]:

def inxed_on_uuid(tx):
    """Create the uniqueness constraint ``code_block_index`` on ``code_block.uuid``.

    Prints every record the statement returns and hands back the result object.
    """
    statement = """
    CREATE CONSTRAINT code_block_index FOR (m:code_block) REQUIRE m.uuid IS UNIQUE
    """
    outcome = tx.run(statement)
    for record in outcome:
        print(record)
    return outcome

with driver.session() as session:
    docs = session.execute_write(inxed_on_uuid)

In [10]:
def inxed_on_uuid(tx):
    """Create the index ``code_block_group_index`` on ``code_block.groupid``.

    Prints every record the statement returns and hands back the result object.
    """
    statement = """
    CREATE INDEX code_block_group_index FOR (n:code_block) ON (n.groupid);
    """
    outcome = tx.run(statement)
    for record in outcome:
        print(record)
    return outcome

with driver.session() as session:
    docs = session.execute_write(inxed_on_uuid)

In [19]:
def create_semantic_relationships(tx):
    """Link pairs of ``func_name`` nodes whose ``embedding_large`` cosine similarity exceeds 0.85.

    Each unordered pair is visited once (``id(n1) < id(n2)``) and gets a
    ``SIMILAR_TO`` relationship that stores the similarity. Returns the result
    object of the statement.
    """
    # Plain (non-f) string: the braces are literal Cypher map braces.
    statement = """
    MATCH (n1:func_name), (n2:func_name)
    WHERE id(n1) < id(n2)

    WITH n1, n2, gds.similarity.cosine(n1.embedding_large, n2.embedding_large) AS similarity
    WHERE similarity > 0.85
    CREATE (n1)-[:SIMILAR_TO {similarity: similarity}]->(n2)
    RETURN n1, n2, similarity
    """
    return tx.run(statement)

with driver.session() as session:
    docs = session.execute_write(create_semantic_relationships)



In [25]:

column_name = "code_block_vector_index"


def create_vector_index(tx):
    """Create a cosine vector index (1536 dimensions) on ``code_block.embedding_large``.

    The index is named after the module-level ``column_name``. Returns the
    result object of the statement.
    """
    statement = (
        "\n    CREATE VECTOR INDEX `" + column_name + "`\n"
        "    FOR (n: code_block) ON (n.embedding_large)\n"
        "    OPTIONS {indexConfig: {\n"
        "    `vector.dimensions`: 1536,\n"
        "    `vector.similarity_function`: 'cosine'\n"
        "    }};\n"
        "    "
    )
    return tx.run(statement)

with driver.session() as session:
    docs = session.execute_write(create_vector_index)

In [None]:
def delete_index(tx):
    """Drop the index named ``code_block_uuid`` and return the result object."""
    statement = """
    DROP INDEX code_block_uuid;
    """
    return tx.run(statement)

with driver.session() as session:
    docs = session.execute_write(delete_index)

In [234]:
# CREATE INDEX FOR (n:code_block) ON (n.node_type);
def create_index(tx):
    """Create an unnamed index on ``code_block.embedding_large`` and return the result object."""
    statement = """
    
    CREATE INDEX FOR (n:code_block) ON (n.embedding_large);
    """
    return tx.run(statement)

with driver.session() as session:
    docs = session.execute_write(create_index)

In [24]:
def show_index(tx):
    """Run ``SHOW INDEXES``, print each record and return the result object."""
    statement = """
    SHOW INDEXES;
    """
    listing = tx.run(statement)
    for record in listing:
        print(record)
    return listing

with driver.session() as session:
    docs = session.execute_read(show_index)

<Record id=5 name='code_block_group_index' state='ONLINE' populationPercent=100.0 type='RANGE' entityType='NODE' labelsOrTypes=['code_block'] properties=['groupid'] indexProvider='range-1.0' owningConstraint=None lastRead=None readCount=0>
<Record id=3 name='code_block_index' state='ONLINE' populationPercent=100.0 type='RANGE' entityType='NODE' labelsOrTypes=['code_block'] properties=['uuid'] indexProvider='range-1.0' owningConstraint='code_block_index' lastRead=neo4j.time.DateTime(2024, 8, 21, 0, 5, 10, 894000000, tzinfo=<UTC>) readCount=1176550>
<Record id=1 name='index_343aff4e' state='ONLINE' populationPercent=100.0 type='LOOKUP' entityType='NODE' labelsOrTypes=None properties=None indexProvider='token-lookup-1.0' owningConstraint=None lastRead=None readCount=0>
<Record id=2 name='index_f7700477' state='ONLINE' populationPercent=100.0 type='LOOKUP' entityType='RELATIONSHIP' labelsOrTypes=None properties=None indexProvider='token-lookup-1.0' owningConstraint=None lastRead=None readC

In [34]:
def show_constraints(tx):
    """Run ``SHOW constraints``, print each record and return the result object."""
    statement = """
    SHOW constraints;
    """
    listing = tx.run(statement)
    for record in listing:
        print(record)
    return listing

with driver.session() as session:
    docs = session.execute_read(show_constraints)

<Record id=6 name='code_block_index' type='UNIQUENESS' entityType='NODE' labelsOrTypes=['code_block'] properties=['uuid'] ownedIndex='code_block_index' propertyType=None>
<Record id=8 name='func_name_index' type='UNIQUENESS' entityType='NODE' labelsOrTypes=['func_name'] properties=['uuid'] ownedIndex='func_name_index' propertyType=None>
<Record id=4 name='implementation_index' type='UNIQUENESS' entityType='NODE' labelsOrTypes=['implementation'] properties=['uuid'] ownedIndex='implementation_index' propertyType=None>


In [26]:
def delete_constraint(tx):
    """Drop the constraint named ``implementation_uuid`` and return the result object."""
    statement = """
    DROP CONSTRAINT implementation_uuid;
    """
    return tx.run(statement)

with driver.session() as session:
    docs = session.execute_write(delete_constraint)

In [19]:
import os

import voyageai

# The API key is read from the VOYAGE_API_KEY environment variable instead of
# being written into the notebook. The key that used to be hardcoded here has
# been exposed and should be rotated.
voyageai.api_key = os.environ["VOYAGE_API_KEY"]
vo = voyageai.Client()



In [None]:
# an example of retrieval
query = """For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list. Empty sum should be equal to 0 and empty product should be equal to 1. >>> sum_product([]) (0, 1) >>> sum_product([1, 2, 3, 4]) (10, 24) """

query_embedding = vo.embed([query], model="voyage-code-2").embeddings[0]

In [21]:

def vector_index_search(tx):
    """Query ``code_block_vector_index`` for the 10 nearest nodes to ``query_embedding``.

    Reads the module-level ``query_embedding`` and passes it as a Cypher
    parameter rather than interpolating the float list into the query text.
    Each record (score, content, embedding) is printed.

    Returns
    -------
    tuple
        ``(result_object, retrieved_embeddings)`` where the second item is the
        list of ``embedding`` values of the returned records.
    """
    query = """
    CALL db.index.vector.queryNodes('code_block_vector_index', 10, $embedding)
    YIELD node AS item, score
    RETURN  score, item.content AS content, item.embedding as embedding
    """
    retrieved_embeddings = []
    docs = tx.run(query, embedding=query_embedding)
    for doc in docs:
        print(doc)
        retrieved_embeddings.append(doc['embedding'])

    return docs, retrieved_embeddings

with driver.session() as session:
    docs,retrieved_embeddings = session.execute_read(vector_index_search)

<Record score=0.8830127716064453 content='    row_to_sort = table_data[sort_row]\n    col_to_sort = [row[sort_column] for row in table_data]\n    sorted_col = sorted(col_to_sort)\n    row_indices = [col_to_sort.index(val) for val in sorted_col]\n    sorted_data = []\n    for i in row_indices:\n        sorted_data.append(table_data[i])\n    return sorted_data' embedding=[-0.007909656502306461, 0.02840583026409149, -0.12434674799442291, 0.019193410873413086, -0.07726016640663147, 0.11634133011102676, 0.06942914426326752, -0.05989779531955719, 0.07717688381671906, -0.05098520219326019, 0.0025004756171256304, -0.055318836122751236, 0.023515738546848297, -0.10449260473251343, 0.01987525448203087, -0.07083327323198318, 0.10623994469642639, 0.024535058066248894, -0.07393083721399307, 0.043240658938884735, -0.034193262457847595, 0.019228121265769005, -0.004227224737405777, 0.03865516558289528, -0.005984822753816843, 0.026631992310285568, -0.127699077129364, 0.11797858774662018, -0.044584594666

# code_block

In [253]:
def vector_index_search(tx):
    """Print up to 10 ``code_block`` nodes among the 40 nearest to ``query_embedding``.

    Reads the module-level ``query_embedding`` and passes it as a Cypher
    parameter rather than interpolating the float list into the query text.

    Returns
    -------
    tuple
        ``(result_object, retrieved_embeddings)``. ``retrieved_embeddings`` is
        the module-level variable of that name; this function does not fill it.
    """
    query = """
    CALL db.index.vector.queryNodes('code_block_vector_index', 40, $embedding)
    YIELD node AS item, score
    WHERE item.node_type = 'code_block'
    RETURN score, item.content AS content
    LIMIT 10
    """
    docs = tx.run(query, embedding=query_embedding)
    for doc in docs:
        print(doc)
    return docs, retrieved_embeddings

with driver.session() as session:
    docs,retrieved_embeddings = session.execute_read(vector_index_search)

<Record score=0.903038740158081 content='    num_coeff = len(xs)\n    derivative = [i * xs[i] for i in range(1, num_coeff)]\n    integral = [xs[i] / (i + 1) for i in range(num_coeff)]\n    integral.insert(0, C)\n    zeros = np.roots(list(reversed(xs))).tolist()\n    return (derivative, integral, zeros)'>
<Record score=0.9026429653167725 content='    result = 0\n    power = len(coefficients) - 1\n    for coeff in coefficients:\n        result += coeff * x ** power\n        power -= 1\n    return result'>
<Record score=0.9023441672325134 content='    result = 0\n    for i in range(len(poly)):\n        result += poly[i] * x ** i\n    return result'>
<Record score=0.9010539054870605 content='    result = 0\n    n = len(coefficients)\n    for i, coefficient in enumerate(coefficients):\n        result += coefficient * x ** (n - i - 1)\n    return result'>
<Record score=0.9010425209999084 content='    deg_p = len(px) - 1\n    deg_q = len(qx) - 1\n    if deg_p == 0 or deg_q == 0:\n        if d

In [9]:
function_embedding = vo.embed(["nested_bracket"], model="voyage-code-2").embeddings[0]
def vector_index_search(tx):
    """Print up to 5 ``func_name`` nodes among the 40 nearest to ``function_embedding``.

    Reads the module-level ``function_embedding`` and passes it as a Cypher
    parameter rather than interpolating the float list into the query text.
    Returns the result object.
    """
    query = """
    CALL db.index.vector.queryNodes('code_block_vector_index', 40, $embedding)
    YIELD node AS item, score
    WHERE item.node_type = 'func_name'
    RETURN score, item.content AS content
    LIMIT 5
    """
    docs = tx.run(query, embedding=function_embedding)
    for doc in docs:
        print(doc)
    return docs

with driver.session() as session:
    docs = session.execute_read(vector_index_search)

<Record score=0.9671465158462524 content='complex_brackets'>
<Record score=0.9671372175216675 content='complex_brackets'>
<Record score=0.9613941311836243 content='intricate_brackets'>
<Record score=0.9613808393478394 content='intricate_brackets'>
<Record score=0.9485931992530823 content='bracket_parser'>


# func implementation

In [55]:
def vector_index_search(tx):
    """Print up to 10 ``implementation`` nodes among the 40 nearest to ``query_embedding``.

    Reads the module-level ``query_embedding`` and passes it as a Cypher
    parameter rather than interpolating the float list into the query text.

    Returns
    -------
    tuple
        ``(result_object, retrieved_embeddings)``. ``retrieved_embeddings`` is
        the module-level variable of that name; this function does not fill it.
    """
    query = """
    CALL db.index.vector.queryNodes('code_block_vector_index', 40, $embedding)
    YIELD node AS item, score
    WHERE item.node_type = 'implementation'
    RETURN score, item.content AS content
    LIMIT 10
    """
    docs = tx.run(query, embedding=query_embedding)
    for doc in docs:
        print(doc)
    return docs, retrieved_embeddings

with driver.session() as session:
    docs,retrieved_embeddings = session.execute_read(vector_index_search)

<Record score=0.9100169539451599 content='def compute_derivative_integral(xs: list, C: int):\n    """\n    Assumptions broken: (xs = coefficients of a polynomial in order: xs[0] + xs[1] * x + xs[2] * x^2 + ....)\n    Compute both the derivative and integral of the given polynomial, incorporate constant C and remove any surplus zeros. Ensure correctness with edge cases.\n    """\n    if not xs:\n        return [], [C]\n    \n    derivative = []\n    integral = [C]\n\n    for i in range(len(xs)):\n        if i > 0:  derivative.append(i * xs[i])\n        if i > 0:  integral.append(xs[i-1] /(i))\n        \n    integral.append(xs[-1] / len(xs))\n\n    while derivative and derivative[-1] == 0:\n        derivative.pop()\n        \n    while len(integral) > 1 and integral[-1] == 0:  # retain the constant term even if it\'s zero.\n        integral.pop()\n\n    return derivative, integral'>
<Record score=0.9041867256164551 content='def evalPoly(poly, x):\n    """\n    Return the value of the pol

In [247]:
def vector_index_search(tx):
    """Print the 10 nearest nodes to ``query_embedding`` in the index ``implementation_index``.

    Reads the module-level ``query_embedding`` and passes it as a Cypher
    parameter rather than interpolating the float list into the query text.

    NOTE(review): elsewhere in this notebook ``implementation_index`` is listed
    as the index owned by a uniqueness constraint; confirm that a vector index
    with this name exists before running this cell.

    Returns
    -------
    tuple
        ``(result_object, retrieved_embeddings)``. ``retrieved_embeddings`` is
        the module-level variable of that name; this function does not fill it.
    """
    query = """
CALL db.index.vector.queryNodes('implementation_index', 10, $embedding)
YIELD node AS item, score
RETURN  score, item.content AS content, item.embedding as embedding
    """
    docs = tx.run(query, embedding=query_embedding)
    for doc in docs:
        print(doc)

    return docs, retrieved_embeddings

with driver.session() as session:
    docs,retrieved_embeddings = session.execute_read(vector_index_search)

<Record score=0.9148832559585571 content='def enhancedTotalMatch(lst1, lst2):\n    """\n    """\n    anagrams = [[w1, w2] for w1 in lst1 for w2 in lst2 if sorted(w1.lower()) == sorted(w2.lower())]\n    count1, count2 = sum(len(w) for w in lst1), sum(len(w) for w in lst2)\n    if count1 <= count2:\n        return [lst1, anagrams]\n    else:\n        return [lst2, anagrams]' embedding=[0.009031648747622967, 0.030583929270505905, -0.03486156091094017, -0.022051002830266953, -0.023475293070077896, -0.026141636073589325, -0.05974365025758743, -0.053883060812950134, -0.03208543360233307, -0.05409877747297287, -0.09671930968761444, -0.01607118919491768, -0.07319103181362152, -0.12618064880371094, 0.01626375876367092, -0.07468921691179276, 0.0861251950263977, 0.02918310835957527, -0.026843829080462456, -0.007701801136136055, -0.02969500422477722, -0.009568755514919758, 0.02772989124059677, 0.1341221034526825, -0.04653438553214073, -0.01781974546611309, -0.07039649784564972, 0.03220852091908455

# semantic path search

In [361]:
def path_search(tx):
    """Retrieve ``child`` paths below the code blocks nearest to ``query_embedding``.

    The 20 nearest nodes of ``code_block_vector_index`` are filtered to
    ``node_type = 'code_block'``. For each of them every ``child`` path down to
    a node without further ``child`` relationships is expanded, and every node
    on the path is scored against the query embedding with
    ``gds.similarity.cosine``. The module-level ``query_embedding`` is passed
    as a Cypher parameter rather than interpolated into the query text.

    Returns
    -------
    tuple
        ``(doc_info, retrieved_embeddings)``. ``doc_info`` is a list of dicts
        with keys ``item_id``, ``idx``, ``similarity`` and ``content``.
        ``retrieved_embeddings`` is the module-level variable of that name;
        this function does not fill it.
    """
    query = """
    WITH $query_embedding AS query_embedding
    CALL db.index.vector.queryNodes('code_block_vector_index', 20, query_embedding)
    YIELD node AS item, score
    WHERE item.node_type = 'code_block'
    MATCH path = (n)-[:child*]->(m)
    WHERE NOT (m)-[:child]->() and n.uuid = item.uuid
    with path, nodes(path) AS pathNodes, query_embedding, id(item) as item_id

    UNWIND range(0, size(pathNodes)-1) AS idx
    WITH pathNodes[idx] AS middleNode,idx,query_embedding,item_id

    WITH middleNode.content as content, gds.similarity.cosine(middleNode.embedding_large, query_embedding) AS similarity, idx,item_id
    RETURN item_id,idx,similarity, content
    """

    doc_info = []
    docs = tx.run(query, query_embedding=query_embedding)
    for doc in docs:
        print(doc)
        doc_info.append({'item_id':doc['item_id'],'idx':doc['idx'],'similarity':doc['similarity'],'content':doc['content']})

    return doc_info, retrieved_embeddings

with driver.session() as session:
    docs,retrieved_embeddings = session.execute_read(path_search)


<Record item_id=229700 idx=0 similarity=0.8156278122924214 content='    freq_count = {}\n    for num in lst:\n        if num > 0:\n            if num not in freq_count:\n                freq_count[num] = 1\n            else:\n                freq_count[num] += 1\n    sorted_freq_count = sorted(freq_count.items(), key=lambda x: x[1], reverse=True)\n    if sorted_freq_count:\n        return sorted_freq_count[0][0]\n    else:\n        return None'>
<Record item_id=229700 idx=1 similarity=0.711971877310879 content='    if sorted_freq_count:\n        return sorted_freq_count[0][0]\n    else:\n        return None'>
<Record item_id=229700 idx=0 similarity=0.8156278122924214 content='    freq_count = {}\n    for num in lst:\n        if num > 0:\n            if num not in freq_count:\n                freq_count[num] = 1\n            else:\n                freq_count[num] += 1\n    sorted_freq_count = sorted(freq_count.items(), key=lambda x: x[1], reverse=True)\n    if sorted_freq_count:\n      



<Record item_id=63743 idx=0 similarity=0.7942822605660861 content='    count = Counter()\n    freq = defaultdict(int)\n    max_count = res = 0\n    for i, x in enumerate(nums, 1):\n        freq[count[x]] -= 1\n        count[x] += 1\n        freq[count[x]] += 1\n        max_count = max(max_count, count[x])\n        if max_count * freq[max_count] == i - 1 and i < len(nums):\n            res = i\n        if max_count * (freq[max_count] - 1) + max_count - 1 == i:\n            res = i\n    return [res, nums[res - 1]] if res > 0 else [0, -1]'>
<Record item_id=63743 idx=1 similarity=0.77524134303082 content='    for i, x in enumerate(nums, 1):\n        freq[count[x]] -= 1\n        count[x] += 1\n        freq[count[x]] += 1\n        max_count = max(max_count, count[x])\n        if max_count * freq[max_count] == i - 1 and i < len(nums):\n            res = i\n        if max_count * (freq[max_count] - 1) + max_count - 1 == i:\n            res = i'>
<Record item_id=63743 idx=2 similarity=0.7537903

In [351]:
import re
import ast

def remove_comments_from_function(func_str):
    """Strip triple-quoted blocks, ``#`` comments and blank lines from source text.

    The stripping is purely textual: every ''' ... ''' or \"\"\" ... \"\"\" span
    is removed, then everything from a ``#`` to the end of its line, then lines
    that contain only whitespace. The remaining lines are joined with newlines.
    """
    triple_quoted = re.compile("'''.*?'''" + "|" + '""".*?"""', re.DOTALL)
    hash_comment = re.compile(r'#.*')

    stripped = hash_comment.sub('', triple_quoted.sub('', func_str))
    kept = filter(lambda line: line.strip() != "", stripped.splitlines())
    return "\n".join(kept)

def parse_and_regenerate_code(code_string):
    """Round-trip source text through ``ast.parse`` / ``ast.unparse``.

    Returns the unparsed source; whatever ``ast.parse`` raises for invalid
    input propagates to the caller.
    """
    return ast.unparse(ast.parse(code_string))

def remove_subset_lines(larger_str, subset_str):
    """Return ``larger_str`` without the lines that also occur in ``subset_str``.

    Lines are compared exactly, whitespace included. The order of the surviving
    lines is preserved and they are joined with newlines.
    """
    # A set gives O(1) membership tests in place of a list scan per line.
    excluded = set(subset_str.splitlines())
    return "\n".join(
        line for line in larger_str.splitlines() if line not in excluded
    )

def path_prouning(paths):
    """Build pruned variants of the root code of a group of paths.

    Parameters
    ----------
    paths : list of paths; each path is a list of dicts with a ``'content'``
        key. ``paths[0][0]`` supplies the root code.

    Returns
    -------
    list of str
        The comment-stripped root code (normalised through the AST when it
        parses), followed by one variant per path with the lines of that
        path's last node removed. Duplicates are dropped. An empty ``paths``
        yields an empty list.
    """
    if not paths:
        return []

    root_source = paths[0][0]['content']
    try:
        content = parse_and_regenerate_code(remove_comments_from_function(root_source))
    except Exception:
        # Fall back to the comment-stripped text when the code does not parse.
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        content = remove_comments_from_function(root_source)

    prouned_contents = [content]
    for path in paths:
        prouned_content = remove_subset_lines(content, path[-1]['content'])
        if prouned_content not in prouned_contents:
            prouned_contents.append(prouned_content)

    return prouned_contents

import numpy as np

def cosine_similarity(embedding1, embedding2):
    """Cosine similarity of two vectors.

    Both inputs are converted with ``np.array``. Returns ``0.0`` when either
    vector has zero norm, otherwise the dot product divided by the product of
    the norms.
    """
    vec_a = np.array(embedding1)
    vec_b = np.array(embedding2)

    numerator = np.dot(vec_a, vec_b)
    magnitudes = [np.linalg.norm(vec) for vec in (vec_a, vec_b)]

    # A zero-length vector has no direction; report zero similarity.
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0
    return numerator / (magnitudes[0] * magnitudes[1])


def get_unique_paths(docs):
    """Group flat path records into per-item lists of paths.

    Records are grouped by ``str(doc["item_id"])``. Within a group, a record
    with ``idx == 0`` starts a new path and any other record is appended to
    the most recently started path. Each stored entry keeps the ``idx``,
    ``similarity`` and ``content`` fields.

    Returns a dict mapping the item id (as a string) to its list of paths.
    """
    grouped = {}
    for doc in docs:
        entry = {field: doc[field] for field in ('idx', 'similarity', 'content')}
        grouped.setdefault(str(doc["item_id"]), []).append(entry)

    path_info = {}
    for item_id, entries in grouped.items():
        paths = []
        for entry in entries:
            if entry['idx'] == 0:
                paths.append([entry])
            else:
                paths[-1].append(entry)
        path_info[item_id] = paths
    return path_info

def get_prouned_content(docs):
    """Concatenate the pruned code variants of every item found in ``docs``.

    ``docs`` is grouped with ``get_unique_paths`` and each group is passed to
    ``path_prouning``. The results are collected in group order.
    """
    collected = []
    for item_paths in get_unique_paths(docs).values():
        collected.extend(path_prouning(item_paths))
    return collected


In [362]:
import pandas as pd 

contents=get_prouned_content(docs)

def rerank_nodes(contents,query_embedding):
    """Embed ``contents`` with voyage-code-2 and rank them against ``query_embedding``.

    Returns ``(similarities, contents, order)``: the cosine similarity of each
    content as a Python float, the contents unchanged, and the indices sorted
    from most to least similar.
    """
    embedded = vo.embed(contents, model="voyage-code-2").embeddings
    scores = [
        float(cosine_similarity(query_embedding, vector)) for vector in embedded
    ]
    order = np.argsort(scores)[::-1]
    return scores, contents, order

similarity,contents,ranks = rerank_nodes(contents,query_embedding)

In [367]:
def freq(lst):
    """Return the most frequent positive number in ``lst``.

    Values that are not greater than zero are ignored. When several values
    share the highest count, the one seen first wins. Returns ``None`` when
    there is no positive value.
    """
    counts = {}
    for value in lst:
        if value > 0:
            counts[value] = counts.get(value, 0) + 1

    if not counts:
        return None
    # max() returns the first key with the maximal count (insertion order).
    return max(counts, key=counts.get)

freq([3,3,3,3,4,5,5,5,5,5])

5

# semantic relations

In [241]:
def vector_index_search(tx):
    """Print uuid and content of up to 10 ``code_block`` nodes among the 40 nearest to ``query_embedding``.

    Reads the module-level ``query_embedding`` and passes it as a Cypher
    parameter rather than interpolating the float list into the query text.

    Returns
    -------
    tuple
        ``(result_object, retrieved_embeddings)``. ``retrieved_embeddings`` is
        the module-level variable of that name; this function does not fill it.
    """
    query = """
    CALL db.index.vector.queryNodes('code_block_vector_index', 40, $embedding)
    YIELD node AS item, score
    WHERE item.node_type = 'code_block'
    RETURN score, item.uuid, item.content AS content
    LIMIT 10
    """
    docs = tx.run(query, embedding=query_embedding)
    for doc in docs:
        print(doc)
    return docs, retrieved_embeddings

with driver.session() as session:
    docs,retrieved_embeddings = session.execute_read(vector_index_search)

<Record score=0.920928955078125 item.uuid='b7af20b8-26ea-40ce-987f-6606bf4a173d' content="    if len(numbers) < 2:\n        raise ValueError('The provided list must contain at least two numbers.')\n    sorted_numbers = sorted(numbers)\n    min_diff = float('inf')\n    closest_pair = (None, None)\n    for i in range(len(sorted_numbers) - 1):\n        diff = sorted_numbers[i + 1] - sorted_numbers[i]\n        if diff < min_diff:\n            min_diff = diff\n            closest_pair = (sorted_numbers[i], sorted_numbers[i + 1])\n    return closest_pair">
<Record score=0.9186456203460693 item.uuid='0a07a00f-bbd8-4823-a442-327dca74bf97' content="    numbers_with_indices = [(num, idx) for idx, num in enumerate(numbers)]\n    numbers_with_indices.sort()\n    num1, idx1, num2, idx2, min_distance = (None, -1, None, -1, float('inf'))\n    for i in range(1, len(numbers_with_indices)):\n        if numbers_with_indices[i][0] != numbers_with_indices[i - 1][0]:\n            distance = numbers_with_ind

In [209]:
# Scratch check: take the first node of the second path, strip its comments,
# replace every occurrence of that path's last-node content with a newline,
# then embed the remainder and compare it with the query embedding.
# NOTE(review): in the cells visible here `unique_paths` is only a local inside
# get_unique_paths, so this cell presumably relied on a variable left over in
# the kernel — TODO confirm and build it explicitly.
subfunc = remove_comments_from_function(unique_paths[1][0]['content']).replace(unique_paths[1][-1]['content'],"\n")
print(subfunc)
subfunc_embedding = vo.embed([subfunc], model="voyage-code-2").embeddings[0]
cosine_similarity(query_embedding,subfunc_embedding)

def largestUniqueNumber(A):
    
    counter = dict()   
    for num in A:      
        if num not in counter:     
            counter[num] = 1       
        else:                      
            counter[num] += 1      
    for num in sorted(A, reverse=True):   
        if counter[num] == 1:    
            return num           
    return -1                  


np.float64(0.7958695495473417)

In [26]:
def show_duplicates(tx):
    """Count surplus nodes that share their ``content`` with another node.

    Nodes are grouped by ``content``; every group larger than one contributes
    ``size - 1`` to the total.

    Returns
    -------
    tuple
        ``(result_object, duplicated)`` where ``duplicated`` is that total.
    """
    statement = """
    MATCH (n)
    WITH n.content AS content, collect(n) AS nodes
    WHERE size(nodes) > 1
    RETURN content, size(nodes) as size
    """
    groups = tx.run(statement)
    surplus = sum(int(group['size']) - 1 for group in groups)
    return groups, surplus

with driver.session() as session:
    docs,duplicated = session.execute_read(show_duplicates)
    print(duplicated)

152124


In [37]:
def remove_duplicates(tx):
    """Merge nodes that share the same ``content`` into a single node.

    Nodes are grouped by ``content`` and each group with more than one member
    is merged with ``apoc.refactor.mergeNodes``: relationships of the
    duplicates are moved to the first node of the group (``mergeRels``), the
    first node's properties are kept (``properties: 'discard'``) and the
    duplicates are removed.

    The previous query used MERGE with the backticked type `$type(r)`, which
    is a literal relationship type of that name, not the type of ``r``.
    Nodes without ``content`` are now skipped; grouping on a null key would
    otherwise treat all of them as duplicates of one another.

    Returns
    -------
    tuple
        ``(result_object, retrieved_embeddings)``; the list is always empty
        and is kept only so the return shape is unchanged.
    """
    query = """
    MATCH (n)
    WHERE n.content IS NOT NULL
    WITH n.content AS content, collect(n) AS nodes
    WHERE size(nodes) > 1
    CALL apoc.refactor.mergeNodes(nodes, {properties: 'discard', mergeRels: true}) YIELD node
    RETURN count(node) AS merged_groups
    """
    retrieved_embeddings = []
    docs = tx.run(query)
    for doc in docs:
        print(doc)

    return docs, retrieved_embeddings

with driver.session() as session:
    docs,retrieved_embeddings = session.execute_write(remove_duplicates)



In [39]:

def delete_all(tx):
    """Delete every node with its relationships, print any returned record and return the result object."""
    statement = """
        MATCH (n) DETACH DELETE n
    """
    outcome = tx.run(statement)
    for record in outcome:
        print(record)
    return outcome

with driver.session() as session:
    docs = session.execute_write(delete_all)

In [42]:
def delete_all_indexes(tx):
    """Drop every index that is not owned by a constraint.

    Indexes backing a constraint cannot be removed with ``DROP INDEX`` (the
    constraint has to be dropped instead), so they are skipped rather than
    aborting the whole transaction. Index names are backtick-quoted so names
    with special characters are handled.
    """
    # Materialise the names first; the listing must be read before further
    # statements run in the same transaction.
    listing = tx.run("SHOW INDEXES YIELD name, owningConstraint")
    index_names = [
        record["name"] for record in listing if record["owningConstraint"] is None
    ]

    for index_name in index_names:
        quoted = index_name.replace("`", "``")
        tx.run(f"DROP INDEX `{quoted}`")

# Open a session and delete all indexes
with driver.session() as session:
    session.execute_write(delete_all_indexes)


# get voyage embeddings

In [15]:
# Function to load JSON from a file
import json

import os

import voyageai

# The API key is read from the VOYAGE_API_KEY environment variable instead of
# being written into the notebook. The key that used to be hardcoded here has
# been exposed and should be rotated.
voyageai.api_key = os.environ["VOYAGE_API_KEY"]
vo = voyageai.Client()



def get_embeddings(documents, batch_size=5):
    """Embed ``documents`` with voyage-code-2 in batches.

    Parameters
    ----------
    documents : list of str
    batch_size : int, default 5
        Number of documents sent per ``vo.embed`` call; must be positive.

    Returns
    -------
    list
        One embedding per document, in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    embeddings = []
    for start in range(0, len(documents), batch_size):
        batch = documents[start : start + batch_size]
        embeddings.extend(vo.embed(batch, model="voyage-code-2").embeddings)
    return embeddings


def load_json(file_path):
    """Read the file at ``file_path`` and return its parsed JSON content."""
    with open(file_path, 'r') as handle:
        return json.load(handle)


# Function to save JSON to a file
def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as JSON indented by four spaces."""
    with open(file_path, 'w') as handle:
        json.dump(data, handle, indent=4)


def process_json(input_file, output_file):
    """Add an ``embedding_large`` field to every object of a JSON file.

    Loads the list of objects from ``input_file``, embeds each object's
    ``content`` (an empty string when the key is missing) with
    ``get_embeddings``, prints the number of embeddings, stores each embedding
    on its object and writes the list to ``output_file``.

    Raises
    ------
    ValueError
        If the number of embeddings differs from the number of objects.
    """
    data = load_json(input_file)
    contents = [obj.get("content", "") for obj in data]

    embeddings = get_embeddings(contents)
    print(len(embeddings))
    if len(embeddings) != len(data):
        raise ValueError(
            f"expected {len(data)} embeddings, got {len(embeddings)}"
        )

    # The earlier second pass that re-appended every content to `contents`
    # had no effect on the output and has been removed.
    for obj, embedding in zip(data, embeddings):
        obj["embedding_large"] = embedding

    save_json(data, output_file)

output_file = "./import/canon_nodes_enhanced_1000.json"
input_file = "./import/canon_nodes_1000.json"

process_json(input_file=input_file, output_file = output_file)


673


# Blockwise content

In [75]:
import os

from human_eval.human_eval.data import write_jsonl, read_problems
import voyageai

# The API key is read from the VOYAGE_API_KEY environment variable instead of
# being written into the notebook. The key that used to be hardcoded here has
# been exposed and should be rotated.
voyageai.api_key = os.environ["VOYAGE_API_KEY"]
vo = voyageai.Client()

# The problems are loaded once; the cell used to call read_problems() twice.
problems = read_problems()

def get_relevant_content(input_prompt):
    """Retrieve code blocks related to ``input_prompt`` from the vector index.

    The prompt is embedded with voyage-code-2. The 40 nearest nodes of
    ``code_block_vector_index`` are filtered to ``node_type = 'code_block'``
    and at most 10 are returned.

    Returns
    -------
    list
        The returned records (``score``, ``content``), in query order.
    """
    query_embedding = vo.embed([input_prompt], model="voyage-code-2").embeddings[0]

    def blockwise_vector_index_search(tx):
        # The embedding is a query parameter, not part of the query text.
        query = """
        CALL db.index.vector.queryNodes('code_block_vector_index', 40, $embedding)
        YIELD node AS item, score
        WHERE item.node_type = 'code_block'
        RETURN score, item.content AS content
        LIMIT 10
        """
        # Collect the records inside the transaction and return them. Appending
        # to an outer list would duplicate entries if the driver calls this
        # function again on retry.
        return list(tx.run(query, embedding=query_embedding))

    with driver.session() as session:
        return session.execute_read(blockwise_vector_index_search)
    

In [82]:
contents = []
for task_id in problems:
    contents.append(dict(task_id=task_id, problem=get_relevant_content(problems[task_id]["prompt"])))
    print(task_id)
write_jsonl("blockwise_relevant_context.jsonl", contents)


HumanEval/0
HumanEval/1
HumanEval/2
HumanEval/3
HumanEval/4
HumanEval/5
HumanEval/6
HumanEval/7
HumanEval/8
HumanEval/9
HumanEval/10
HumanEval/11
HumanEval/12
HumanEval/13
HumanEval/14
HumanEval/15
HumanEval/16
HumanEval/17
HumanEval/18
HumanEval/19
HumanEval/20
HumanEval/21
HumanEval/22
HumanEval/23
HumanEval/24
HumanEval/25
HumanEval/26
HumanEval/27
HumanEval/28
HumanEval/29
HumanEval/30
HumanEval/31
HumanEval/32
HumanEval/33
HumanEval/34
HumanEval/35
HumanEval/36
HumanEval/37
HumanEval/38
HumanEval/39
HumanEval/40
HumanEval/41
HumanEval/42
HumanEval/43
HumanEval/44
HumanEval/45
HumanEval/46
HumanEval/47
HumanEval/48
HumanEval/49
HumanEval/50
HumanEval/51
HumanEval/52
HumanEval/53
HumanEval/54
HumanEval/55
HumanEval/56
HumanEval/57
HumanEval/58
HumanEval/59
HumanEval/60
HumanEval/61
HumanEval/62
HumanEval/63
HumanEval/64
HumanEval/65
HumanEval/66
HumanEval/67
HumanEval/68
HumanEval/69
HumanEval/70
HumanEval/71
HumanEval/72
HumanEval/73
HumanEval/74
HumanEval/75
HumanEval/76
HumanEval

# Function Retrieval

In [5]:
import os

from human_eval.data import write_jsonl, read_problems
import voyageai

# The API key is read from the VOYAGE_API_KEY environment variable instead of
# being written into the notebook. The key that used to be hardcoded here has
# been exposed and should be rotated.
voyageai.api_key = os.environ["VOYAGE_API_KEY"]
vo = voyageai.Client()
problems = read_problems()

def get_relevant_content(input_prompt):
    """Retrieve function implementations related to ``input_prompt`` from the vector index.

    The prompt is embedded with voyage-code-2. The 10 nearest nodes of
    ``code_block_vector_index`` are filtered to
    ``node_type = 'implementation'``.

    NOTE(review): the filter is applied after only 10 candidates are fetched,
    so fewer than 10 records may come back — confirm whether a larger
    candidate count was intended.

    Returns
    -------
    list
        The returned records (``score``, ``content``), in query order.
    """
    query_embedding = vo.embed([input_prompt], model="voyage-code-2").embeddings[0]

    def blockwise_vector_index_search(tx):
        # The embedding is a query parameter, not part of the query text.
        query = """
        CALL db.index.vector.queryNodes('code_block_vector_index', 10, $embedding)
        YIELD node AS item, score
        WHERE item.node_type = 'implementation'
        RETURN score, item.content AS content
        LIMIT 10
        """
        # Collect the records inside the transaction and return them. Appending
        # to an outer list would duplicate entries if the driver calls this
        # function again on retry.
        return list(tx.run(query, embedding=query_embedding))

    with driver.session() as session:
        return session.execute_read(blockwise_vector_index_search)

In [6]:
contents = []
for task_id in problems:
    contents.append(dict(task_id=task_id, problem=get_relevant_content(problems[task_id]["prompt"])))
    print(task_id)
write_jsonl("function_wise_relevant_context.jsonl", contents)

HumanEval/0
HumanEval/1
HumanEval/2
HumanEval/3
HumanEval/4
HumanEval/5
HumanEval/6
HumanEval/7
HumanEval/8
HumanEval/9
HumanEval/10
HumanEval/11
HumanEval/12
HumanEval/13
HumanEval/14
HumanEval/15
HumanEval/16
HumanEval/17
HumanEval/18
HumanEval/19
HumanEval/20
HumanEval/21
HumanEval/22
HumanEval/23
HumanEval/24
HumanEval/25
HumanEval/26
HumanEval/27
HumanEval/28
HumanEval/29
HumanEval/30
HumanEval/31
HumanEval/32
HumanEval/33
HumanEval/34
HumanEval/35
HumanEval/36
HumanEval/37
HumanEval/38
HumanEval/39
HumanEval/40
HumanEval/41
HumanEval/42
HumanEval/43
HumanEval/44
HumanEval/45
HumanEval/46
HumanEval/47
HumanEval/48
HumanEval/49
HumanEval/50
HumanEval/51
HumanEval/52
HumanEval/53
HumanEval/54
HumanEval/55
HumanEval/56
HumanEval/57
HumanEval/58
HumanEval/59
HumanEval/60
HumanEval/61
HumanEval/62
HumanEval/63
HumanEval/64
HumanEval/65
HumanEval/66
HumanEval/67
HumanEval/68
HumanEval/69
HumanEval/70
HumanEval/71
HumanEval/72
HumanEval/73
HumanEval/74
HumanEval/75
HumanEval/76
HumanEval

In [98]:
problems = read_problems()

def generate_augmented_prompt(contents, problems=None):
    """Build retrieval-augmented prompts and write them to a JSONL file.

    For each entry in ``contents`` the problem's prompt is looked up by task
    id and, when at least one retrieved block is available, the content of the
    first retrieved block is appended after a short hint line. The resulting
    list is written to ``blockwise_augmented_prompt.jsonl``.

    Parameters
    ----------
    contents : list of dict
        One dict per task with the keys ``task_id`` and ``problem``;
        ``problem`` is the sequence of retrieved ``(score, content)`` records.
    problems : mapping, optional
        Maps a task id to a problem dict that has a ``"prompt"`` entry. When
        omitted, the problems are loaded with ``read_problems()``.

    Returns
    -------
    None
    """
    if problems is None:
        problems = read_problems()

    augmented_prompt = []
    for entry in contents:
        task_id = entry['task_id']
        # Start from the prompt text itself; the per-task entry is a dict and
        # cannot be concatenated with a string.
        content = problems[task_id]["prompt"]
        retrieved = entry['problem']
        if len(retrieved) > 0:
            content += "The following helper code blocks might be helpful: \n"
            # Index 1 of a retrieved record is its content (index 0 is the score).
            content += retrieved[0][1]
        augmented_prompt.append(dict(task_id=task_id, content=content))
    write_jsonl("blockwise_augmented_prompt.jsonl", augmented_prompt)

In [99]:
# Build the augmented prompts from the retrieved contents and the problem set.
generate_augmented_prompt(contents,problems)

In [None]:


# Build one record per (task, sample index) and write them to samples.jsonl.
# NOTE(review): this call passes a prompt string and a task id, which does not
# match the generate_augmented_prompt(contents, ...) definition in this
# notebook: its first argument is the list of per-task retrieval entries and
# it returns None after writing its own JSONL file. This cell looks like it
# targets an earlier version of that function -- confirm before running.
num_samples_per_task = 1
samples = [
    dict(task_id=task_id, problem=generate_augmented_prompt(problems[task_id]["prompt"],task_id))
    for task_id in problems
    for _ in range(num_samples_per_task)
]
write_jsonl("samples.jsonl", samples)

In [74]:
# Display the raw prompt text stored for task 'HumanEval/0'.
problems['HumanEval/0']['prompt']

'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n'

In [None]:
class stinrg_utils:
    """
    A utility class for processing and analyzing strings.

    Methods
    -------
    count_boring_exciting_sentences(string):
        Counts the number of boring and exciting sentences in the given string.
    """

    # NOTE(review): the class name looks like a typo of "string_utils"; it is
    # kept unchanged so that existing references keep working.

    def count_boring_exciting_sentences(self, string):
        """
        Counts the number of boring and exciting sentences in the provided string.

        Sentences are delimited by '.', '?' or '!'. A boring sentence is one
        whose first word is 'I' (contractions such as "I'm" count, words that
        merely begin with the letter, such as "It" or "Inside", do not). An
        exciting sentence is one that ends with an exclamation mark ('!').
        A sentence can be both boring and exciting.

        Parameters
        ----------
        string : str
            The input string containing sentences to be analyzed.

        Returns
        -------
        tuple
            A tuple containing two integers:
            - The first integer represents the number of boring sentences.
            - The second integer represents the number of exciting sentences.
        """
        # Local import so the method works even if the notebook has not
        # imported `re` in an earlier cell.
        import re

        boring_sent_count = 0
        exciting_sent_count = 0
        # Capture each sentence body together with its trailing terminator(s).
        # Splitting on the terminators would discard them, which makes it
        # impossible to tell afterwards whether a sentence ended with '!'.
        for match in re.finditer(r'([^.?!]+)([.?!]*)', string):
            body = match.group(1).strip()
            terminators = match.group(2)
            if not body:
                # Whitespace between consecutive terminators is not a sentence.
                continue
            # \b ensures 'I' is a whole word, not just the first letter.
            if re.match(r'I\b', body):
                boring_sent_count += 1
            if terminators.endswith('!'):
                exciting_sent_count += 1
        return (boring_sent_count, exciting_sent_count)


In [22]:
from datasets import load_dataset
# Load the "test" split of livecodebench/code_generation_lite (version tag
# release_v2) and convert it to a pandas DataFrame.
lcb_codegen = load_dataset("livecodebench/code_generation_lite", split='test', version_tag="release_v2")
df = lcb_codegen.to_pandas()

In [None]:
# Write without the row index: otherwise it is saved as an unnamed first
# column and reappears as "Unnamed: 0" when the file is read back.
df.to_csv("lcb.csv", index=False)

In [2]:
import pandas as pd

# Load the dataset from the local file "lcb.csv" (the file name used by the
# df.to_csv call earlier in this notebook).
df = pd.read_csv("lcb.csv")

In [16]:
# Print the full problem statement stored at index label 3.
print(df["question_content"][3])

You are given a string $s$ of length $n$, consisting of lowercase Latin letters, and an integer $k$.

You need to check if it is possible to remove exactly $k$ characters from the string $s$ in such a way that the remaining characters can be rearranged to form a palindrome. Note that you can reorder the remaining characters in any way.

A palindrome is a string that reads the same forwards and backwards. For example, the strings "z", "aaa", "aba", "abccba" are palindromes, while the strings "codeforces", "reality", "ab" are not.

Input

Each test consists of multiple test cases. The first line contains a single integer $t$ ($1 \leq t \leq 10^4$) — the number of the test cases. This is followed by their description.

The first line of each test case contains two integers $n$ and $k$ ($0 \leq k < n \leq 10^5$) — the length of the string $s$ and the number of characters to be deleted.

The second line of each test case contains a string $s$ of length $n$, consisting of lowercase Latin let