In [1]:
import pandas as pd
import os
import openai
import numpy as np
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('../data/champions_lore.csv')


In [2]:
df

Unnamed: 0,champion,story
0,Aatrox,"Whether mistaken for a demon or god, many tale..."
1,Ahri,"Ahri’s origins are a mystery, even to her.She ..."
2,Akali,"Ionia has always been a land of wild magic, it..."
3,Akshan,Dashing through the shadows of eastern Shurima...
4,Alistar,"Many civilizations have resisted Noxus, but no..."
...,...,...
162,Milio,Milio's story began generations ago with his g...
163,Naafiri,"In the pitch black Shuriman night, few sounds ..."
164,Briar,"Near the end of his reign, Grand General Boram..."
165,Hwei,"In northwest Ionia, the island of Koyehn once ..."


In [3]:

def chunk_story(df, chunk_size=1000, overlap_size=0):
    # initialize an empty list to store the new rows
    new_rows = []

    for _, row in df.iterrows():
        champion_name = row['champion']
        story = row['story']
        
        i = 0
        while i < len(story):
            # if overlap_size is greater than 0, move the start of the chunk back to create an overlap
            if overlap_size > 0 and i != 0:
                i -= overlap_size  # move the start of the chunk back by the overlap size
                # if the overlap is too large, move the start of the chunk to the beginning of the story
                next_sentence_start = story[i:].find(". ") + 2  # find the next sentence
                if 0 < next_sentence_start < len(story[i:]):
                    i += next_sentence_start  # move the start of the chunk to the next sentence
            
            # find the end of the chunk
            chunk_end = i + chunk_size
            if chunk_end < len(story):
                last_period_idx = story[i:chunk_end].rfind(".")
                if last_period_idx != -1:
                    chunk = story[i:i + last_period_idx + 1]  # include the period in the chunk
                else:
                    # if there is no period in the chunk, find the next period after the chunk
                    chunk = story[i:chunk_end]
            else:
                chunk = story[i:]
            
            # add the new row to the list
            new_rows.append({"champion": champion_name, "story_chunk": chunk})
            i += len(chunk)  # move the start of the next chunk to the end of the current chunk
    
    # create a new DataFrame from the list of new rows
    new_df = pd.DataFrame(new_rows)
    return new_df

# create a new DataFrame with the story chunks
new_df = chunk_story(df, chunk_size=1000, overlap_size=200)
# new_df[new_df['champion'] == 'Aatrox'].to_csv('../data/Aatrox.txt', index=False)

In [4]:
new_df.head()

Unnamed: 0,champion,story_chunk
0,Aatrox,"Whether mistaken for a demon or god, many tale..."
1,Aatrox,"When Setaka, the Ascended warrior-queen, calle..."
2,Aatrox,Mortals fleeing the conflict came to know them...
3,Aatrox,"For centuries, he strained against this hellis..."
4,Aatrox,The flesh he stole and crudely shaped began to...


In [5]:
openai.api_type = "azure" # use the OpenAI API
model = "text-embedding-ada-002"

In [6]:

def get_embedding(text, model="RAG-Embedding"):
   text = text.replace("\n", " ")
   return openai.embeddings.create(input = [text], model=model).data[0].embedding

In [7]:
final_df = new_df['story_chunk'].apply(lambda x: get_embedding(x, model='RAG-Embedding'))


In [8]:
final_df.head()

0    [0.013453477062284946, -0.005212394054979086, ...
1    [0.013177535496652126, -0.02012667991220951, -...
2    [0.007241181097924709, -0.014700141735374928, ...
3    [0.0029356228187680244, -0.008332949131727219,...
4    [0.005279228091239929, -0.013822841458022594, ...
Name: story_chunk, dtype: object

In [9]:
def cosine_similarity(a, b):
    return np.dot(a, b) 

In [14]:
# final_df.to_csv('../data/embeddedings.csv', index=False)

In [6]:
# final_df = pd.read_csv('../data/embeddedings.csv')

In [10]:
final_df

0      [0.013453477062284946, -0.005212394054979086, ...
1      [0.013177535496652126, -0.02012667991220951, -...
2      [0.007241181097924709, -0.014700141735374928, ...
3      [0.0029356228187680244, -0.008332949131727219,...
4      [0.005279228091239929, -0.013822841458022594, ...
                             ...                        
894    [0.018552804365754128, -0.033252231776714325, ...
895    [0.003526126267388463, -0.03364427387714386, -...
896    [0.017532963305711746, -0.029706742614507675, ...
897    [0.0013252843637019396, -0.033858876675367355,...
898    [0.005606119520962238, -0.030375465750694275, ...
Name: story_chunk, Length: 899, dtype: object

In [11]:
new_df['champion'] = new_df['champion'].str.replace("[’\s]", "", regex=True)


In [12]:

new_df['champion_with_number'] = new_df['champion'] + (new_df.groupby('champion').cumcount() + 1).astype(str)


In [13]:
new_df['champion_with_number'].to_csv('../data/champion_names.csv', index=False)

In [45]:
new_df

Unnamed: 0,champion,story_chunk,champion_with_number
0,Aatrox,"Whether mistaken for a demon or god, many tale...",Aatrox1
1,Aatrox,"When Setaka, the Ascended warrior-queen, calle...",Aatrox2
2,Aatrox,Mortals fleeing the conflict came to know them...,Aatrox3
3,Aatrox,"For centuries, he strained against this hellis...",Aatrox4
4,Aatrox,The flesh he stole and crudely shaped began to...,Aatrox5
...,...,...,...
894,Smolder,Long before the lands that would become Camavo...,Smolder1
895,Smolder,"In exchange, the dragons were kept satiated wi...",Smolder2
896,Smolder,The handful that survived bore witness to endl...,Smolder3
897,Smolder,"Scared, alone, and hungry, the hatchling empl...",Smolder4


In [41]:
metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
metadata

Unnamed: 0,chunk,text
Aatrox1,0,"Whether mistaken for a demon or god, many tale..."
Aatrox2,1,"When Setaka, the Ascended warrior-queen, calle..."
Aatrox3,2,Mortals fleeing the conflict came to know them...
Aatrox4,3,"For centuries, he strained against this hellis..."
Aatrox5,4,The flesh he stole and crudely shaped began to...
...,...,...
Smolder1,894,Long before the lands that would become Camavo...
Smolder2,895,"In exchange, the dragons were kept satiated wi..."
Smolder3,896,The handful that survived bore witness to endl...
Smolder4,897,"Scared, alone, and hungry, the hatchling empl..."


In [53]:
processed_df = pd.DataFrame({'id': new_df['champion_with_number'], 'values': final_df, 'metadata': new_df['story_chunk']})

In [54]:
processed_df

Unnamed: 0,id,values,metadata
0,Aatrox1,"[0.013453477062284946, -0.005212394054979086, ...","Whether mistaken for a demon or god, many tale..."
1,Aatrox2,"[0.013177535496652126, -0.02012667991220951, -...","When Setaka, the Ascended warrior-queen, calle..."
2,Aatrox3,"[0.007241181097924709, -0.014700141735374928, ...",Mortals fleeing the conflict came to know them...
3,Aatrox4,"[0.0029356228187680244, -0.008332949131727219,...","For centuries, he strained against this hellis..."
4,Aatrox5,"[0.005279228091239929, -0.013822841458022594, ...",The flesh he stole and crudely shaped began to...
...,...,...,...
894,Smolder1,"[0.018552804365754128, -0.033252231776714325, ...",Long before the lands that would become Camavo...
895,Smolder2,"[0.003526126267388463, -0.03364427387714386, -...","In exchange, the dragons were kept satiated wi..."
896,Smolder3,"[0.017532963305711746, -0.029706742614507675, ...",The handful that survived bore witness to endl...
897,Smolder4,"[0.0013252843637019396, -0.033858876675367355,...","Scared, alone, and hungry, the hatchling empl..."


In [55]:


for _, row in processed_df.iterrows():
    index = row.name
    story_chunk = row['metadata']

    metadata_dict = {}

    metadata_dict['chunk'] = index
    metadata_dict['text'] = story_chunk
    row['metadata'] = metadata_dict


In [56]:
processed_df

Unnamed: 0,id,values,metadata
0,Aatrox1,"[0.013453477062284946, -0.005212394054979086, ...","{'chunk': 0, 'text': 'Whether mistaken for a d..."
1,Aatrox2,"[0.013177535496652126, -0.02012667991220951, -...","{'chunk': 1, 'text': 'When Setaka, the Ascende..."
2,Aatrox3,"[0.007241181097924709, -0.014700141735374928, ...","{'chunk': 2, 'text': 'Mortals fleeing the conf..."
3,Aatrox4,"[0.0029356228187680244, -0.008332949131727219,...","{'chunk': 3, 'text': 'For centuries, he strain..."
4,Aatrox5,"[0.005279228091239929, -0.013822841458022594, ...","{'chunk': 4, 'text': 'The flesh he stole and c..."
...,...,...,...
894,Smolder1,"[0.018552804365754128, -0.033252231776714325, ...","{'chunk': 894, 'text': 'Long before the lands ..."
895,Smolder2,"[0.003526126267388463, -0.03364427387714386, -...","{'chunk': 895, 'text': 'In exchange, the drago..."
896,Smolder3,"[0.017532963305711746, -0.029706742614507675, ...","{'chunk': 896, 'text': 'The handful that survi..."
897,Smolder4,"[0.0013252843637019396, -0.033858876675367355,...","{'chunk': 897, 'text': ' Scared, alone, and hu..."


In [None]:
!pip install pinecone-client pinecone-datasets

In [57]:
import os

use_serverless = os.environ.get("USE_SERVERLESS", "False").lower() == "true"

In [58]:
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pc.io)
api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'
environment = os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

# configure client
pc = Pinecone(api_key=api_key)

  from tqdm.autonotebook import tqdm


In [59]:
index_name = 'rag'
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [60]:
index.upsert_from_dataframe(processed_df, batch_size=100)

sending upsert requests: 100%|██████████| 899/899 [00:08<00:00, 103.28it/s]


{'upserted_count': 899}

In [70]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00899,
 'namespaces': {'': {'vector_count': 899}},
 'total_vector_count': 899}

In [71]:
ids = pd.read_csv('../data/champion_names.csv')
ids = ids['champion_with_number'].values.tolist()

In [72]:
records = index.fetch(ids)

In [79]:
records.vectors[ids[0]]['metadata']['text']

'Whether mistaken for a demon or god, many tales have been told of the Darkin Blade... but few know his real name, or the story of his fall.In ancient times, long before desert sands swallowed the empire, a mighty champion of Shurima was brought before the Sun Disc to become the avatar for a now forgotten celestial ideal. Remade as one of the Ascended, his wings were the golden light of dawn, and his armor sparkled like a constellation of hope from beyond the great veil.Aatrox was his name. He was at the vanguard of every noble conflict. So true and just was his conduct that other god-warriors would always gather at his side, and ten thousand mortals of Shurima marched behind him. When Setaka, the Ascended warrior-queen, called for his help against the rebellion of Icathia, Aatrox answered without hesitation.'

In [80]:
def topk_retrieval(query, k=5, ids=ids):
    topk_similarity = []
    topk_metadata = []
    query_vector = get_embedding(query)
    for name in ids:
        records = index.fetch([name])
        similarity = cosine_similarity(query_vector, records.vectors[name]['values'])
        topk_similarity.append(similarity)
        topk_metadata.append(records.vectors[name]['metadata']['text'])
    
    topk_similarity, topk_metadata = zip(*sorted(zip(topk_similarity, topk_metadata), reverse=True))
    topk_similarity = list(topk_similarity[:k])
    topk_metadata = list(topk_metadata[:k])
    
    return topk_similarity, topk_metadata


In [81]:
query = ids[0]
res_topk_similarity, res_topk_metadata = topk_retrieval(query, k=5, ids=ids)

In [84]:
res_topk_similarity
res_topk_metadata

['For centuries, he strained against this hellish confinement... until some nameless mortal was foolish enough to try and wield the blade once more. Aatrox seized upon this opportunity, forcing his will and an imitation of his original form onto his bearer, though the process quickly drained all life from the new body.In the years that followed, Aatrox groomed many more hosts—men and women of exceptional vitality or fortitude. Though his grasp of such magics had been limited in life, he learned to take control of a mortal in the span of single breath, and in battle he discovered he could feast on his victims to build himself ever larger and stronger.Aatrox traveled the land, searching desperately, endlessly, for a way return to his previous Ascended form… but the riddle of the blade proved unsolvable, and in time he realized he would never be free of it. The flesh he stole and crudely shaped began to feel like a mockery of his former glory—a cage only slightly larger than the sword.',


In [None]:
from py_pdf_parser.loaders import load_file

document = load_file("../data/LoL_Lore_Collection.pdf")