In [1]:
# Standard library
import os

# Third-party
import pandas as pd
from pinecone import Pinecone
from tqdm.autonotebook import tqdm


In [None]:
def data_processing(csv_path:str='../data/champions_lore.csv') -> pd.DataFrame:
    '''
    Load the lore CSV, split every story into chunks, embed the chunks and
    return one row per chunk.

    Side effect: the generated chunk ids are written to
    ``data/champion_names.csv`` in the parent of the directory holding this
    file (or, inside a notebook where ``__file__`` does not exist, the parent
    of the current working directory).

    Parameters
    ----------
    csv_path : str
        Path forwarded to ``load_data``.

    Returns
    -------
    pandas.DataFrame
        Columns:
        ``id``       - champion name with apostrophes and whitespace removed,
                       followed by a 1-based per-champion chunk counter;
        ``values``   - whatever ``get_df_embeddings`` returns for the chunks
                       (presumably one embedding per chunk - TODO confirm);
        ``metadata`` - dict ``{'chunk': <row index label>, 'text': <story chunk>}``.
    '''
    df = load_data(csv_path = csv_path)
    new_df = chunk_story(df, chunk_size=1000, overlap_size=200)

    # Raw string: the backslash escape belongs to the regex, not to Python.
    new_df['champion'] = new_df['champion'].str.replace(r"[’\s]", "", regex=True)
    new_df['champion_with_number'] = new_df['champion'] + (new_df.groupby('champion').cumcount() + 1).astype(str)

    # __file__ is only defined when this code runs from a script/module; in a
    # notebook kernel fall back to the current working directory.
    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        base_dir = os.getcwd()
    parent_dir = os.path.dirname(base_dir)
    csv_name_path = os.path.join(parent_dir, 'data', 'champion_names.csv')

    new_df['champion_with_number'].to_csv(csv_name_path, index=False)

    final_df = get_df_embeddings(new_df, model='RAG-Embedding')

    # Create a new DataFrame with the processed data
    processed_df = pd.DataFrame({'id': new_df['champion_with_number'], 'values': final_df, 'metadata': new_df['story_chunk']})

    # Replace the raw chunk text with the metadata dict. The column is rebuilt
    # in one assignment because writing to the rows yielded by iterrows() is
    # not guaranteed to reach the DataFrame (the rows may be copies).
    processed_df['metadata'] = [
        {'chunk': chunk_index, 'text': story_chunk}
        for chunk_index, story_chunk in processed_df['metadata'].items()
    ]

    return processed_df

In [6]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Location of the novels dataset, relative to the working directory.
file_path = "./data/novels_0.1.3.csv"

# Read the CSV into a list of documents.
loader = CSVLoader(file_path=file_path)
data = loader.load()

# Peek at the first two loaded documents.
for record in data[:2]:
    print(record)

page_content="id: 56152\nname: A Former Child Soldier Who Uses a Magic Sword Wants to Live with an Older Sister of a Former Enemy Executive\nassoc_names: ['A Former Child Soldier Who Uses a Magic Sword Wants to Live with a Missy, The Former Leader of His Enemies', 'Maken Tsukai no Moto Shounen Hei wa, Moto Tekikanbu no Onee-san to Issho ni Ikitai', '魔剣使いの元少年兵は、元敵幹部のお姉さんと一緒に生きたい']\noriginal_language: japanese\nauthors: ['hasekura mondo', '支倉文度']\ngenres: ['action', 'adventure', 'fantasy', 'romance', 'seinen']\ntags: ['adapted to manga', 'calm protagonist', 'enemies become lovers', 'heroes', 'identity crisis', 'magic beasts', 'male protagonist', 'older love interests', 'protagonist strong from the start', 'strong to stronger', 'sword and magic', 'teamwork']\nstart_year: 2018\nlicensed: False\noriginal_publisher: shinkigensha\nenglish_publisher: \ncomplete_original: False\nchapters_original_current: 144 chapters\ncomplete_translated: False\nchapter_latest_translated: c24\nrelease_freq: 3.

In [11]:
# Number of documents returned by loader.load()
len(data)

13592

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunks of at most 1000 with an overlap of 200 between neighbours;
# add_start_index=True asks the splitter to record each chunk's start index.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(data)

# Number of chunks produced from the loaded documents.
len(all_splits)

21684

In [44]:
import os

from dotenv import load_dotenv
from pinecone import Pinecone

# Load variables from a .env file into the process environment.
load_dotenv()

# Read the Pinecone API key from the environment
# (keys are issued at app.pinecone.io).
api_key = os.environ.get('PINECONE_API_KEY')

# Build the client used by the cells below.
pc = Pinecone(api_key=api_key)

In [46]:
# Display the client object to confirm it was constructed
pc

<pinecone.control.pinecone.Pinecone at 0x1b778202100>

In [68]:
# Name of the Pinecone index used throughout the notebook.
index_name = "novels"

# Handle to that index, obtained from the client.
index = pc.Index(index_name)

# Show the index statistics (dimension, namespaces, vector counts).
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'novelvector': {'vector_count': 3000}},
 'total_vector_count': 3000}

In [53]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model handed to the vector store cells below.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [52]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
   ---------------------------------------- 0.0/227.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/227.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/227.1 kB ? eta -:--:--
   - -------------------------------------- 10.2/227.1 kB ? eta -:--:--
   ----- --------------------------------- 30.7/227.1 kB 330.3 kB/s eta 0:00:01
   ------- ------------------------------- 41.0/227.1 kB 330.3 kB/s eta 0:00:01
   ------------------ ------------------- 112.6/227.1 kB 731.4 kB/s eta 0:00:01
   ------------------------------------ --- 204.8/227.1 kB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 227.1/227.1 kB 1.2 MB/s eta 0:00:00
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [76]:
from langchain_pinecone import PineconeVectorStore

# Vector store over the Pinecone index, using the 'novelvector' namespace
# and the embedding model configured above.
vector_store = PineconeVectorStore(
    index=index,
    embedding=embeddings_model,
    namespace="novelvector",
)

In [77]:
# Display the vector store object to confirm it was constructed
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1b724089160>

In [78]:
# Run a similarity query against the vector store; scores are returned with the hits.
results = vector_store.similarity_search_with_score("The Most Sinister Hero")

In [79]:
# Display the raw search results
results

[(Document(metadata={'row': 214.0, 'source': './data/novels_0.1.3.csv', 'start_index': 0.0}, page_content="id: 42010\nname: A Villainess Needs to Have the Ability of a Villainess\nassoc_names: ['绿茶要有绿茶的本事']\noriginal_language: chinese\nauthors: ['苏钱钱']\ngenres: ['comedy', 'drama', 'romance', 'slice of life']\ntags: ['acting', 'beautiful female lead', 'celebrities', 'character growth', 'cold love interests', 'couple growth', 'cute story', 'female protagonist', 'handsome male lead', 'hard-working protagonist', 'late romance', 'modern day', 'slow growth at start', 'slow romance', 'strong love interests', 'stubborn protagonist', 'unlucky protagonist']\nstart_year: 2021\nlicensed: False\noriginal_publisher: jjwxc\nenglish_publisher: \ncomplete_original: True\nchapters_original_current: 63 chapters\ncomplete_translated: False\nchapter_latest_translated: c218\nrelease_freq: 2.3\nactivity_week_rank: 1727\nactivity_month_rank: 1604\nactivity_all_time_rank: 3283\non_reading_lists: 1808\nreading_

In [83]:
# Each result is a (document, score) pair; print the text and metadata of the document.
for document, _score in results:
    print(f"* {document.page_content} [{document.metadata}]")

* id: 42010
name: A Villainess Needs to Have the Ability of a Villainess
assoc_names: ['绿茶要有绿茶的本事']
original_language: chinese
authors: ['苏钱钱']
genres: ['comedy', 'drama', 'romance', 'slice of life']
tags: ['acting', 'beautiful female lead', 'celebrities', 'character growth', 'cold love interests', 'couple growth', 'cute story', 'female protagonist', 'handsome male lead', 'hard-working protagonist', 'late romance', 'modern day', 'slow growth at start', 'slow romance', 'strong love interests', 'stubborn protagonist', 'unlucky protagonist']
start_year: 2021
licensed: False
original_publisher: jjwxc
english_publisher: 
complete_original: True
chapters_original_current: 63 chapters
complete_translated: False
chapter_latest_translated: c218
release_freq: 2.3
activity_week_rank: 1727
activity_month_rank: 1604
activity_all_time_rank: 3283
on_reading_lists: 1808
reading_list_month_rank: 1848
reading_list_all_time_rank: 3974
rating: 3.7
rating_votes: 76
related_series_ids: [{'row': 214.0, 'sour

In [56]:
# Display the index handle
index

<pinecone.data.index.Index at 0x1b7781cc880>

In [55]:
# Display the vector store object
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1b777e73fa0>

In [50]:
import time

from langchain_pinecone import PineconeVectorStore

# Namespace inside the index that receives the vectors.
namespace = "novelvector"

# Build a vector store from the split documents.
# Requires all_splits, index_name and embeddings_model from earlier cells.
docsearch = PineconeVectorStore.from_documents(
    documents=all_splits,
    embedding=embeddings_model,
    index_name=index_name,
    namespace=namespace,
)

# Short pause after the call above.
time.sleep(1)


NameError: name 'all_splits' is not defined

In [4]:
# Use the script's own directory when __file__ exists; inside a Jupyter
# notebook it does not, so use the current working directory instead.
if '__file__' in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# The dataset is looked up under data/ two directory levels above current_dir.
parent_dir = os.path.dirname(os.path.dirname(current_dir))
csv_path = os.path.join(parent_dir, 'data/novels_0.1.3.csv')

# Run the processing pipeline on that CSV.
processed_df = data_processing(csv_path=csv_path)



NameError: name 'data_processing' is not defined

In [None]:
# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    # Fail fast with a clear message. The previous fallback sent the literal
    # placeholder string 'PINECONE_API_KEY' as the key, which only postponed
    # the failure to the first authenticated request.
    raise RuntimeError("PINECONE_API_KEY is not set; define it before running this cell.")

# Not used by the client call below; kept so any cell reading `environment` still works.
environment = os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

# configure client
pc = Pinecone(api_key=api_key)

# Connect to the existing index (this does not create one).
index_name = 'novels'
index = pc.Index(index_name)

# NOTE(review): no namespace is passed here, while the vector store cells in
# this notebook use namespace 'novelvector' - confirm this is intended.
index.upsert_from_dataframe(processed_df, batch_size=100)
print(f"Data stored in index {index_name}")

In [2]:
from openai import OpenAI

In [3]:
# Placeholder key value; it is passed as api_key to the OpenAI client, which targets a local endpoint (127.0.0.1:8080)
openai_api_key = 'sk-no-key-required'

In [9]:
# Client for the OpenAI-compatible endpoint at 127.0.0.1:8080.
client = OpenAI(api_key=openai_api_key, base_url="http://127.0.0.1:8080/v1")

In [39]:
# Send one system prompt and one user request to the chat completions endpoint.
completion = client.chat.completions.create(
    model="LLaMA_CPP",
    messages=[
        dict(
            role="system",
            content="You are Llama, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
        ),
        dict(
            role="user",
            content="Recommend some Japanese romantic novels",
        ),
    ],
)

In [40]:
# Print the text content of the first choice in the completion
print(completion.choices[0].message.content)

1. "My Solo Exchange Diary" (Hitori Kiete Aru Hi ni) by Natsuki Tada - This manga follows the story of a high school girl who starts a diary to record her experiences as she navigates love, friendship, and self-discovery.

2. "Kimi no Na wa." (Your Name.) by Makoto Shinkai - A romantic drama movie about two teenagers whose fates are mysteriously linked when they suddenly start exchanging their bodies.

3. "The Piano Teacher" (Kiken na Koto wo Wakamemasu) by Haruki Murakami - A novel that tells the story of a love affair between a piano teacher and her student, exploring themes of obsession, desire, and the complexities of human relationships.

4. "The Tales of Bee" (Hachigatsu no Cider) by Yumiko Kawakami - A coming-of-age story about two girls who fall in love while working at a French restaurant in a small town.

5. "Orange" (Ooranju) by Ichigo Takano - A high school romance manga where a student accidentally receives a letter from her future self, which contains information about a 