In [7]:
from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
  ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content="In the world of code, a concept serene,\nRecursion dwells, like a flowing stream.\nA function calls itself, without pause or end,\nTo traverse through problems, its power to lend.\n\nLike a mirror reflecting its own image bright,\nRecursion reflects, in a recursive flight.\nDividing tasks into smaller parts, it sees,\nSolving each piece with elegant ease.\n\nA puzzle solved, with elegance and grace,\nRecursion weaves its magical embrace.\nEach call returning, like a song's refrain,\nUntil the problem is solved, without a stain.\n\nSo, let your code loop in a recursive dance,\nWith elegance and beauty, give it a chance.\nFor in the realm of programming's art,\nRecursion reigns, playing its part.", role='assistant', function_call=None, tool_calls=None)


In [8]:
import openai
import os
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<your OpenAI API key if not set as env var>"))

In [14]:
def read_and_chunk_file(file_path, chunk_size=1024):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]


In [10]:
def embed_texts_with_openai(texts, model="text-embedding-3-small"):
    embeddings = []
    for text in texts:
        embeddings.append(client.embeddings.create(input=text, model=model).data[0].embedding)
    return embeddings

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_cosine_similarity(embedding, embeddings):
    similarities = cosine_similarity([embedding], embeddings)
    return similarities[0]

In [39]:
def recommend_text_chunks(files, input_query, top_n=5):
    chunks = {}
    # Read and chunk each file
    for file in files:
        if not os.path.exists(file):
            raise FileNotFoundError(f"File {file} not found.")
        else:
            chunked_file = read_and_chunk_file(file)
            for chunk in chunked_file:
                if file in chunks:
                    chunks[file].append(chunk)
                else:
                    chunks[file] = [chunk]

    # Initialize a list to store all chunk embeddings and their corresponding file and chunk information
    all_chunk_embeddings = []
    chunk_info = []

    # Process each file
    for file in chunks:
        print(f"Processing {file}...")
        # Embed all chunks in the current file
        chunk_embeddings = embed_texts_with_openai(chunks[file])
        # Store embeddings and corresponding file and chunk info
        all_chunk_embeddings.extend(chunk_embeddings)
        chunk_info.extend([(file, i, chunks[file]) for i in range(len(chunks[file]))])
    print(chunk_info)

    # Embed the query text
    query_embedding = embed_texts_with_openai([input_query])[0]

    # Calculate cosine similarities for all chunks
    similarities = calculate_cosine_similarity(query_embedding, all_chunk_embeddings)

    # Pair each chunk's information with its similarity score
    all_chunks_with_similarities = [(chunk_info[i], similarities[i]) for i in range(len(chunk_info))]

    # Optionally, sort all chunks by similarity if needed
    # all_chunks_with_similarities.sort(key=lambda x: x[1], reverse=True)

    return all_chunks_with_similarities

In [43]:
filenames = ['sample1.txt', 'sample2.txt', 'sample3.txt']
input_query = "What affects the environment?"
top_n = 5  # Number of recommendations you want


try:
    recommendations = recommend_text_chunks(filenames, input_query, top_n)
except Exception as e:
    print(f"An error occurred: {e.message}")

# Assuming recommendations is a list of tuples, where each tuple contains
# (file, chunk_index, text_chunk, similarity_score)
# Assuming recommendations is a list of tuples in the format:
# (file, chunk_index, text_chunk, similarity_score)

# Sort the recommendations based on similarity score in descending order
# Correct the sorting line to use the correct index for similarity_score
# sorted_recommendations = sorted(recommendations, key=lambda x: x[3], reverse=True)

# Now, sorted_recommendations holds the chunks sorted by their similarity scores
# Correct the sorting line to use the correct index for similarity_score
sorted_recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
for recommendation in sorted_recommendations:
    file, chunk_index, text = recommendation[0]
    similarity_score = recommendation[1]
    print(f"File: {file}, Chunk Index: {chunk_index}, Similarity: {similarity_score}\n, Text: {text}\n\n")

Processing sample1.txt...
Processing sample2.txt...
Processing sample3.txt...
[('sample1.txt', 0, ['The evolution of smart home technology has brought convenience and efficiency to household management. Devices like smart thermostats, lights, and security cameras can be controlled remotely through smartphones, making it easier to save energy, enhance security, and improve quality of life. As these technologies become more integrated, the concept of the Internet of Things (IoT) expands, connecting everyday objects to the internet for improved data sharing and communication.\n\nUrban planning and smart cities are at the forefront of addressing the challenges of rapid urbanization and sustainability. By leveraging technology such as IoT, big data analytics, and AI, cities can optimize traffic flow, manage waste more efficiently, and reduce environmental impact. These innovations not only contribute to a healthier planet but also improve the livability and resilience of urban spaces.\n\nTh