In [1]:
import pandas as pd
from pinecone import Pinecone
import os

  from tqdm.autonotebook import tqdm


In [None]:
def data_processing(csv_path:str='../data/champions_lore.csv'):
    '''
    Load the lore CSV, split every story into chunks, embed the chunks and
    return a DataFrame shaped for a vector-index upsert.

    Side effect: the generated chunk ids are written to
    ``data/champion_names.csv`` under the parent of this file's directory
    (or under the parent of the current working directory when ``__file__``
    is not defined, e.g. when the function is called from a notebook).

    Args:
        csv_path: Path of the CSV file handed to ``load_data``.

    Returns:
        pandas.DataFrame with the columns ``id`` (champion name with
        apostrophes/whitespace removed, followed by a 1-based per-champion
        chunk counter), ``values`` (the result of ``get_df_embeddings``) and
        ``metadata`` (a dict with the keys ``chunk`` and ``text``).
    '''
    df = load_data(csv_path = csv_path)
    new_df = chunk_story(df, chunk_size=1000, overlap_size=200)
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    new_df['champion'] = new_df['champion'].str.replace(r"[’\s]", "", regex=True)
    # cumcount() is 0-based per champion, so +1 numbers the chunks from 1.
    new_df['champion_with_number'] = new_df['champion'] + (new_df.groupby('champion').cumcount() + 1).astype(str)

    # __file__ does not exist inside a notebook kernel; fall back to the
    # working directory there instead of failing with NameError.
    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    csv_name_path = os.path.join(parent_dir, 'data/champion_names.csv')

    new_df['champion_with_number'].to_csv(csv_name_path, index=False)

    final_df = get_df_embeddings(new_df, model='RAG-Embedding')

    # Create a new DataFrame with the processed data
    processed_df = pd.DataFrame({'id': new_df['champion_with_number'], 'values': final_df, 'metadata': new_df['story_chunk']})

    # Replace each raw story chunk with its metadata dict. The column is
    # rebuilt in one assignment because rows yielded by iterrows() are
    # copies: writing to them never reaches the DataFrame.
    processed_df['metadata'] = [
        {'chunk': chunk_index, 'text': story_chunk}
        for chunk_index, story_chunk in processed_df['metadata'].items()
    ]

    return processed_df

In [6]:
from langchain_community.document_loaders.csv_loader import CSVLoader
# Dataset location, given relative to the working directory.
file_path = "./data/novels_0.1.3.csv"

# Build the loader for that file and keep what load() returns in `data`.
loader = CSVLoader(file_path=file_path)
data = loader.load()

# Preview: print the first two loaded records.
for record in data[:2]:
    print(record)

page_content="id: 56152\nname: A Former Child Soldier Who Uses a Magic Sword Wants to Live with an Older Sister of a Former Enemy Executive\nassoc_names: ['A Former Child Soldier Who Uses a Magic Sword Wants to Live with a Missy, The Former Leader of His Enemies', 'Maken Tsukai no Moto Shounen Hei wa, Moto Tekikanbu no Onee-san to Issho ni Ikitai', '魔剣使いの元少年兵は、元敵幹部のお姉さんと一緒に生きたい']\noriginal_language: japanese\nauthors: ['hasekura mondo', '支倉文度']\ngenres: ['action', 'adventure', 'fantasy', 'romance', 'seinen']\ntags: ['adapted to manga', 'calm protagonist', 'enemies become lovers', 'heroes', 'identity crisis', 'magic beasts', 'male protagonist', 'older love interests', 'protagonist strong from the start', 'strong to stronger', 'sword and magic', 'teamwork']\nstart_year: 2018\nlicensed: False\noriginal_publisher: shinkigensha\nenglish_publisher: \ncomplete_original: False\nchapters_original_current: 144 chapters\ncomplete_translated: False\nchapter_latest_translated: c24\nrelease_freq: 3.

In [11]:
# Number of entries returned by loader.load().
len(data)

13592

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: chunk_size 1000 with a chunk_overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split every loaded document; the results are kept in `all_splits`.
all_splits = text_splitter.split_documents(data)

# Bare last expression so the cell displays the number of splits.
len(all_splits)

21684

In [15]:
import os
from pinecone import Pinecone

# Read the Pinecone API key from the environment (keys are issued at
# app.pinecone.io); the value is None when the variable is unset.
api_key = os.getenv('PINECONE_API_KEY')

# Build the client with that key.
pc = Pinecone(api_key=api_key)

In [16]:
# Display the client object to confirm it was constructed.
pc

<pinecone.control.pinecone.Pinecone at 0x73dd4f2545e0>

In [36]:
# Name of the index used by the rest of the notebook.
index_name = 'novels'
# Get a handle on that index through the client.
index = pc.Index(index_name)
# NOTE(review): the original comment mentioned waiting for the connection,
# but no wait is performed here.

# Bare last expression: the cell displays the index statistics.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [34]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model used for the upload below.
# NOTE(review): presumably its output size matches the index dimension — confirm.
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")



In [37]:
import time
from langchain_pinecone import PineconeVectorStore

# Namespace inside the index that receives the vectors.
namespace = "novelvector"

# Build the vector store from the split documents, using the embedding
# model, index name and namespace defined above.
docsearch = PineconeVectorStore.from_documents(
    documents=all_splits,
    embedding=embeddings_model,
    index_name=index_name,
    namespace=namespace,
)

# Pause for one second once the call returns.
# NOTE(review): presumably gives the index time to settle — confirm it is needed.
time.sleep(1)


In [4]:
# Use the script's own directory when __file__ exists; a Jupyter notebook
# has no __file__, so fall back to the current working directory there.
current_dir = (
    os.path.dirname(os.path.abspath(__file__))
    if '__file__' in globals()
    else os.getcwd()
)

# The data folder is looked up two directory levels above current_dir.
parent_dir = os.path.dirname(os.path.dirname(current_dir))
csv_path = os.path.join(parent_dir, 'data/novels_0.1.3.csv')

# Requires the cell defining data_processing to have been executed first.
processed_df = data_processing(csv_path=csv_path)



NameError: name 'data_processing' is not defined

In [None]:
# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
# Fail fast when the key is missing: the former fallback handed the literal
# placeholder string 'PINECONE_API_KEY' to the client as if it were a key.
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    raise RuntimeError("PINECONE_API_KEY environment variable is not set")
# Not passed to the client in this cell; kept so the name stays defined.
environment = os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

# configure client
pc = Pinecone(api_key=api_key)

# Get a handle on the index by name (no create call is made in this cell).
index_name = 'novels'
index = pc.Index(index_name)

# Upload processed_df to the index in batches of 100.
index.upsert_from_dataframe(processed_df, batch_size=100)
print(f"Data stored in index {index_name}")