In [1]:
import pandas as pd
import os
import openai
import numpy as np
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('../data/champions_lore.csv')


In [2]:
df

Unnamed: 0,champion,story
0,Aatrox,"Whether mistaken for a demon or god, many tale..."
1,Ahri,"Ahri’s origins are a mystery, even to her.She ..."
2,Akali,"Ionia has always been a land of wild magic, it..."
3,Akshan,Dashing through the shadows of eastern Shurima...
4,Alistar,"Many civilizations have resisted Noxus, but no..."
...,...,...
162,Milio,Milio's story began generations ago with his g...
163,Naafiri,"In the pitch black Shuriman night, few sounds ..."
164,Briar,"Near the end of his reign, Grand General Boram..."
165,Hwei,"In northwest Ionia, the island of Koyehn once ..."


In [2]:

def chunk_story(df, chunk_size=1000, overlap_size=0):
    # initialize an empty list to store the new rows
    new_rows = []

    for _, row in df.iterrows():
        champion_name = row['champion']
        story = row['story']
        
        i = 0
        while i < len(story):
            # if overlap_size is greater than 0, move the start of the chunk back to create an overlap
            if overlap_size > 0 and i != 0:
                i -= overlap_size  # move the start of the chunk back by the overlap size
                # if the overlap is too large, move the start of the chunk to the beginning of the story
                next_sentence_start = story[i:].find(". ") + 2  # find the next sentence
                if 0 < next_sentence_start < len(story[i:]):
                    i += next_sentence_start  # move the start of the chunk to the next sentence
            
            # find the end of the chunk
            chunk_end = i + chunk_size
            if chunk_end < len(story):
                last_period_idx = story[i:chunk_end].rfind(".")
                if last_period_idx != -1:
                    chunk = story[i:i + last_period_idx + 1]  # include the period in the chunk
                else:
                    # if there is no period in the chunk, find the next period after the chunk
                    chunk = story[i:chunk_end]
            else:
                chunk = story[i:]
            
            # add the new row to the list
            new_rows.append({"champion": champion_name, "story_chunk": chunk})
            i += len(chunk)  # move the start of the next chunk to the end of the current chunk
    
    # create a new DataFrame from the list of new rows
    new_df = pd.DataFrame(new_rows)
    return new_df

# create a new DataFrame with the story chunks
new_df = chunk_story(df, chunk_size=1000, overlap_size=200)
# new_df[new_df['champion'] == 'Aatrox'].to_csv('../data/Aatrox.txt', index=False)

In [11]:
new_df.head()

Unnamed: 0,champion,story_chunk
0,Aatrox,"Whether mistaken for a demon or god, many tale..."
1,Aatrox,"When Setaka, the Ascended warrior-queen, calle..."
2,Aatrox,Mortals fleeing the conflict came to know them...
3,Aatrox,"For centuries, he strained against this hellis..."
4,Aatrox,The flesh he stole and crudely shaped began to...


In [3]:
openai.api_type = "azure" # use the OpenAI API
model = "text-embedding-ada-002"  # 模型的部署名

In [4]:

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.embeddings.create(input = [text], model=model).data[0].embedding

In [9]:
final_df = new_df['story_chunk'].apply(lambda x: get_embedding(x, model='RAG-Embedding'))


In [13]:
final_df.head()

0    [0.013453477062284946, -0.005212394054979086, ...
1    [0.013177535496652126, -0.02012667991220951, -...
2    [0.007241181097924709, -0.014700141735374928, ...
3    [0.0029356228187680244, -0.008332949131727219,...
4    [0.005279228091239929, -0.013822841458022594, ...
Name: story_chunk, dtype: object

In [12]:
def cosine_similarity(a, b):
    return np.dot(a, b) 

In [14]:
final_df.to_csv('../data/embeddedings.csv', index=False)

In [6]:
final_df = pd.read_csv('../data/embeddedings.csv')

In [16]:
final_df['id'] = new_df['champion']
final_df.rename(columns={'story_chunk': 'values'}, inplace=True)

In [24]:
type(final_df.iloc[0]['values'])
string_value = final_df.iloc[0]['values']
float_list = [float(x) for x in string_value.split()]
float_list

ValueError: could not convert string to float: '[0.013453477062284946,'

In [23]:
final_df.iloc[0]['values'].tolist()

AttributeError: 'str' object has no attribute 'tolist'

In [None]:
!pip install pinecone-client pinecone-datasets

In [25]:
from pinecone_datasets import load_dataset



In [9]:
import os

use_serverless = os.environ.get("USE_SERVERLESS", "False").lower() == "true"

In [10]:
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pc.io)
api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'
environment = os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

# configure client
pc = Pinecone(api_key=api_key)

  from tqdm.autonotebook import tqdm


In [11]:
index_name = 'rag'
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [18]:
index.upsert_from_dataframe(final_df, batch_size=100)



TypeError: Column `values` is expected to be a list of floats

In [None]:
index.describe_index_stats()

In [19]:
from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
import pymongo
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [20]:
from dotenv import dotenv_values
env_name = ".env" # following example.env template change to your own .env file name
config = dotenv_values(env_name)

cosmosdb_endpoint = config['cosmos_db_api_endpoint']
cosmosdb_key = config['cosmos_db_api_key']
cosmosdb_connection_str = config['cosmos_db_connection_string']

COSMOS_MONGO_USER = config['cosmos_db_mongo_user']
COSMOS_MONGO_PWD = config['cosmos_db_mongo_pwd']
COSMOS_MONGO_SERVER = config['cosmos_db_mongo_server']

In [None]:
mongo_conn = ""
mongo_client = pymongo.MongoClient(mongo_conn)

In [None]:
# create a database called TutorialDB
db = mongo_client['ExampleDB']

# Create collection if it doesn't exist
COLLECTION_NAME = "ExampleCollection"

collection = db[COLLECTION_NAME]

if COLLECTION_NAME not in db.list_collection_names():
    # Creates a unsharded collection that uses the DBs shared throughput
    db.create_collection(COLLECTION_NAME)
    print("Created collection '{}'.\n".format(COLLECTION_NAME))
else:
    print("Using collection: '{}'.\n".format(COLLECTION_NAME))

In [None]:
## Use only if re-reunning code and want to reset db and collection
# collection.drop_index("VectorSearchIndex")
# mongo_client.drop_database("ExampleDB")

In [None]:
db.command({
  'createIndexes': 'ExampleCollection',
  'indexes': [
    {
      'name': 'VectorSearchIndex',
      'key': {
        "contentVector": "cosmosSearch"
      },
      'cosmosSearchOptions': {
        'kind': 'vector-ivf',
        'numLists': 1,
        'similarity': 'COS',
        'dimensions': 1536
      }
    }
  ]
})

In [None]:
collection.insert_many(data)

In [None]:
from py_pdf_parser.loaders import load_file

document = load_file("../data/LoL_Lore_Collection.pdf")