<a href="https://colab.research.google.com/github/nattapatreesiriwattanakul/RAG-UNIVERSITY/blob/main/G12_Juiz_Esport.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Requirements

In [None]:
!pip install -U langchain langchain-text-splitters langchain-community bs4 neo4j tiktoken langchain-neo4j langchain-huggingface sentence_transformers langchain[google-genai] faiss-cpu pandas

In [None]:
import os

# Enable Colab's custom widget manager when the notebook runs on Google Colab.
# Only ImportError (raised when `google.colab` is not installed) is treated as
# "not running on Colab"; any other failure now propagates instead of being
# hidden by a bare `except`.
try:
  import google.colab
  from google.colab import output
except ImportError:
  print("no")
else:
  output.enable_custom_widget_manager()

In [None]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from google.generativeai.types import HarmCategory, HarmBlockThreshold


In [None]:
import getpass

from neo4j import GraphDatabase


def _require_env(name: str) -> str:
    """Return the value of environment variable `name`, prompting for it if unset.

    A prompted value is written back to `os.environ` because later cells read
    the Neo4j settings from the environment instead of from these variables.
    Reading from the environment (or a hidden prompt) keeps real credentials
    out of the notebook source and its saved outputs.
    """
    value = os.environ.get(name)
    if not value:
        value = getpass.getpass(f"{name}: ")
        os.environ[name] = value
    return value


# Neo4j Aura connection settings.
URI = _require_env("NEO4J_URI")
USER = _require_env("NEO4J_USERNAME")
PASSWORD = _require_env("NEO4J_PASSWORD")

# Google API key; the chat model below is built without an explicit key, so it
# presumably picks this up from the environment -- TODO confirm.
APi_KEY = _require_env("GOOGLE_API_KEY")

driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))

def test_connection():
    """Run a trivial query through the module-level `driver` and print its message.

    The query returns a constant string under the column `msg`; printing it
    shows that a session could be opened and a statement executed.
    """
    ping_query = "RETURN 'CONNECTED TO NEO4J AURA' AS msg"
    with driver.session() as neo4j_session:
        record = neo4j_session.run(ping_query).single()
        print(record["msg"])


# Fail fast: check that the Neo4j instance answers before doing any further work.
test_connection()
# Chat model passed to every agent and chain created later in this notebook.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used for both the FAISS store and the Neo4j vector index below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Embed a throwaway string once to discover the embedding dimensionality.
embedding_dim = len(embeddings.embed_query("hello world"))
# L2-distance FAISS index sized to that dimensionality.
index = faiss.IndexFlatL2(embedding_dim)
# Empty vector store (in-memory docstore, no ids yet); documents are added in a later cell.
vector_store = FAISS(embedding_function=embeddings, index=index, docstore=InMemoryDocstore(), index_to_docstore_id={})

# 2. Load data

In [None]:
# Source dataset: a published Google Sheets document exported as CSV (output=csv).
csv_path = "https://docs.google.com/spreadsheets/d/e/2PACX-1vS6SZDhTcq5FpX0xSMIxFCDvcmQib60GKFhVKZGZfcJiJ69hU2gOletsbRKXCerww/pub?output=csv"

# Read the sheet over HTTP into a DataFrame and preview the first rows.
df_dowloaded = pd.read_csv(csv_path)
display(df_dowloaded.head())

# 3. Cleaning Data

In [None]:
# Drop every row that has a missing value in any column.
df_cleaned = df_dowloaded.dropna()

# Drop rows that exactly duplicate an earlier row.
df_cleaned = df_cleaned.drop_duplicates()

print("\n--- Head of the cleaned DataFrame ---")
display(df_cleaned.head())

In [None]:
import re

def clean_text(text):
    """Normalise the whitespace of a string value.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines, ...) becomes a single space.
    Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # split() with no argument discards leading/trailing whitespace and splits
    # on whitespace runs, so re-joining with one space collapses them all.
    return ' '.join(text.split())

# Define the text columns to clean
text_columns = ['Team', 'Role', 'Player', 'Nationality', 'Status']

# Apply the cleaning function to each specified text column
# NOTE(review): drop_duplicates() ran before this normalisation, so rows that
# differed only in whitespace are still present afterwards -- confirm that is acceptable.
for col in text_columns:
    df_cleaned[col] = df_cleaned[col].apply(clean_text)

# Display the first few rows of the cleaned DataFrame
display(df_cleaned.head())

# 4. Import Dataset to Neo4j Aura


### Create Nodes and Relationships

In [None]:
def import_row(tx, row):
    """Merge one data row into Neo4j: a player plus its team, role and nationality.

    The player node is linked to the team by a relationship whose type is
    derived from the row's status, to the role by `played_as`, and to the
    nationality by `has_nationality`.

    Args:
        tx: Transaction-like object exposing `run(query, **params)`.
        row: Mapping (for example a pandas Series) with the keys "Player",
            "Status", "Team", "Role" and "Nationality".

    Raises:
        ValueError: If the status is empty, so no relationship type can be built.
    """
    player = row["Player"]
    team = row["Team"]
    role = row["Role"]
    nationality = row["Nationality"]

    # A relationship type cannot be supplied as a query parameter, so it is
    # spliced into the query text. Everything outside [A-Za-z0-9_] becomes "_"
    # so the value cannot break out of the statement; str() guards against
    # non-string statuses.
    relation_status = re.sub(r'[^A-Za-z0-9_]', '_', str(row["Status"]))
    if not relation_status:
        raise ValueError(f"Cannot derive a relationship type from an empty Status (player={player!r})")

    # The type is wrapped in backticks so that names starting with a digit
    # (e.g. "2024_roster") are still valid Cypher; the stored type name is unchanged.
    query = f"""
    MERGE (p:Node:ROV_Team {{name: $player}})
    MERGE (t:Team {{name: $team}})
    MERGE (r:Role {{name: $role}})
    MERGE (n:Nationality {{name: $nationality}})

    MERGE (p)-[:`{relation_status}`]->(t)
    MERGE (p)-[:played_as]->(r)
    MERGE (p)-[:has_nationality]->(n)
    """

    tx.run(query, player=player, team=team, role=role, nationality=nationality)

In [None]:
# Import every cleaned row into Neo4j, one write transaction per row.
with driver.session() as session:
    # Count rows with enumerate() instead of using the DataFrame index label:
    # after dropna()/drop_duplicates() the labels are no longer contiguous, so
    # `idx + 1` would report misleading (skipping) row numbers.
    for row_number, (_, row) in enumerate(df_cleaned.iterrows(), start=1):
        session.execute_write(import_row, row)
        print(f"Imported row (Example Player) {row_number}: Create Node {row['Player']} complete!")

# 5. ทำ Combined Text เพื่อนำไป Storing Vector

In [None]:
# Create a list of combined text strings
combined_texts = []
for index, row in df_cleaned.iterrows():
    # Join Team, Role, Player, Nationality and Status into one space-separated string per row
    combined_text = f"{row['Team']} {row['Role']} {row['Player']} {row['Nationality']} {row['Status']}"
    combined_texts.append(combined_text)

# Create a new DataFrame with the combined text
df_combined_text = pd.DataFrame({"combined_text": combined_texts})

# Add auto increment column "ID"
df_combined_text['ID'] = range(1, len(df_combined_text) + 1)

# Display the first and last few rows of the DataFrame
display(df_combined_text.head())
display(df_combined_text.tail())

# 6. Storing Vector

In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter

# Set up the token-based splitter (chunks of 100 with an overlap of 20)
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=20)

# Pull in the metadata columns; `.values` copies them by position, so the
# differing index labels of the two frames are ignored
df_combined_text['Team'] = df_cleaned['Team'].values
df_combined_text['Role'] = df_cleaned['Role'].values
df_combined_text['Player'] = df_cleaned['Player'].values
df_combined_text['Nationality'] = df_cleaned['Nationality'].values
df_combined_text['Status'] = df_cleaned['Status'].values

documents = []

# Loop over the rows one at a time
for index, row in df_combined_text.iterrows():
    full_text = row['combined_text']

    chunks = text_splitter.split_text(full_text)


    # Print the chunks of the first row only, as a sanity check
    if index == 0:
        print(f"\n--- Example Chunks for first entry (ID: {row['ID']}) ---")
        for i, chunk in enumerate(chunks):
            print(f"Chunk {i+1}: '{chunk}'")
        print("-------------------------------------------")

    # Create a Document for every chunk that was split out
    for chunk in chunks:
        doc = Document(
            page_content=chunk,
            metadata={
                "id": row['ID'],
                "team": row['Team'],
                "role": row['Role'],
                "player": row['Player'],
                "nationality": row['Nationality'],
                "status": row['Status']
            }
        )
        documents.append(doc)


# Save the documents into the vector store
# NOTE(review): re-running this cell appears to add the same documents again -- confirm.
document_ids = vector_store.add_documents(documents=documents)

print(document_ids[:3])

# 7. Create RAG Agent with FAISS

In [None]:
from langchain.tools import tool
@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
  """Retrieve information to help answer a query."""
  # Fetch the 4 closest documents from the FAISS store.
  matches = vector_store.similarity_search(query, k=4)
  # Content part: page texts separated by blank lines; artifact part: the documents themselves.
  context_text = "\n\n".join(match.page_content for match in matches)
  return context_text, matches

### Implement RAG Agent

In [None]:
from langchain.agents import create_agent
# The FAISS-backed agent gets a single tool: the retriever defined above.
tools = [retrieve_context]
# System prompt that forces a retrieval call before answering and fixes the
# fallback sentence used when the context does not contain the answer.
prompt =( """
You are a helpful assistant that answers questions using a retrieval tool.

You MUST:
1. Call `retrieve_context` with the user's query.
2. Wait for the tool result.
3. After receiving the tool result, generate a FINAL ANSWER for the user.
4. DO NOT stop after calling the tool — you must produce a final answer.

If the context does not contain the answer, reply:
"I don't have enough information to answer that question."
""")
agent = create_agent(model, tools, system_prompt=prompt)

In [None]:
# Example question for the FAISS-backed agent.
query = "tell me the name of player who play Role as a Top lane in PSG Esports Team."

# Stream the agent run and pretty-print the newest message of each streamed state.
for event in agent.stream({"messages": [{"role": "user", "content": query}]}, stream_mode="values"):
    event["messages"][-1].pretty_print()

### Implement RAG Chain

In [None]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Build a system prompt that embeds FAISS search results for the latest message.

    Args:
        request: Model request whose state holds the conversation under "messages".

    Returns:
        The system prompt text with the retrieved page contents inlined.
    """
    latest_message = request.state["messages"][-1]
    matches = vector_store.similarity_search(latest_message.text)
    docs_content = "\n\n".join([match.page_content for match in matches])
    return f"""You are a helpful assistant that can answer questions based on the following context:

{docs_content}

Answer the user's question based only on the provided context. If you cannot answer the question based on the context, say "I don't have enough information to answer that question."
"""

# Tool-less agent: the context is injected into the system prompt by the middleware instead of through a tool call.
rag_chain_faiss = create_agent(model, tools=[], middleware=[prompt_with_context])

In [None]:
# Example question for the FAISS-backed RAG chain.
query = "Who is a player of Buriram United Esports."

# Stream the run and pretty-print the newest message of each streamed state.
for step in rag_chain_faiss.stream({"messages": [{"role": "user", "content": query}]}, stream_mode="values"):
    step["messages"][-1].pretty_print()

# 8. RAG pattern 2: graph DB using Neo4j documents

In [None]:
from neo4j import GraphDatabase
import os

# Rebinding `driver` below would otherwise orphan any driver created earlier in
# the notebook (or by a previous run of this cell) together with its open
# connections, so close it first.
_previous_driver = globals().get("driver")
if _previous_driver is not None:
    _previous_driver.close()

driver = GraphDatabase.driver(
    os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
)

# Start from a clean slate: remove all existing Chunk nodes and the related
# constraint/index (if present) before the documents are loaded again.
with driver.session() as session:
    session.run("MATCH (c:Chunk) DETACH DELETE c")
    session.run("DROP CONSTRAINT chunk_id_unique IF EXISTS")
    session.run("DROP INDEX chunk_id_index IF EXISTS")

In [None]:
from langchain_neo4j import Neo4jVector
import os

# Embed `documents` with `embeddings` and store them in Neo4j, connecting with
# the credentials taken from the environment.
db = Neo4jVector.from_documents(
    documents, embeddings, url=os.environ["NEO4J_URI"], username=os.environ["NEO4J_USERNAME"], password=os.environ["NEO4J_PASSWORD"]
)

In [None]:
from langchain.tools import tool
@tool(response_format="content_and_artifact")
def neo4j_retrieve_context(query: str):
  """Retrieve information from the Neo4j Vector Database to help answer a query."""
  # Each hit is indexed as hit[0] for the document; the raw hits are returned as the artifact.
  scored_hits = db.similarity_search_with_score(query, k=4)
  formatted_hits = []
  for hit in scored_hits:
      document = hit[0]
      formatted_hits.append(f"Source: {document.metadata}\nContent: {document.page_content}")
  return "\n\n".join(formatted_hits), scored_hits

In [None]:
from langchain.agents import create_agent
neo4j_tools = [neo4j_retrieve_context]
# System prompt for the Neo4j-backed agent. The tool named in step 1 must match
# the only tool registered in `neo4j_tools`; otherwise the model is instructed
# to call a tool this agent does not have.
prompt = ("""
You are a helpful assistant that answers questions using a retrieval tool.

You MUST:
1. Call `neo4j_retrieve_context` with the user's query.
2. Wait for the tool result.
3. After receiving the tool result, generate a FINAL ANSWER for the user.
4. DO NOT stop after calling the tool — you must produce a final answer.

If the context does not contain the answer, reply:
"I don't have enough information to answer that question."
""")
neo4j_agent = create_agent(model, neo4j_tools, system_prompt=prompt)

In [None]:
# Example question for the Neo4j-backed agent.
query_neo4j = "Who is a player of PSG Esports?."

# Stream the agent run and pretty-print the newest message of each streamed state.
for event in neo4j_agent.stream({"messages": [{"role": "user", "content": query_neo4j}]}, stream_mode="values"):
    event["messages"][-1].pretty_print()

### Implement RAG Chain with Neo4j

In [None]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_neo4j_context(request: ModelRequest) -> str:
    """Build a system prompt that embeds Neo4j vector-search results for the latest message.

    Args:
        request: Model request whose state holds the conversation under "messages".

    Returns:
        The system prompt text with the page contents of the 4 retrieved hits inlined.
    """
    latest_message = request.state["messages"][-1]
    scored_hits = db.similarity_search_with_score(latest_message.text, k=4)
    # Only the document (first element of each hit) is used; the score is ignored.
    page_texts = []
    for hit in scored_hits:
        page_texts.append(hit[0].page_content)
    docs_content = "\n\n".join(page_texts)
    return f"""You are a helpful assistant that can answer questions based on the following context:

{docs_content}

Answer the user's question based only on the provided context. If you cannot answer the question based on the context, say "I don't have enough information to answer that question."
"""

# Tool-less agent: the Neo4j search results are injected into the system prompt by the middleware.
neo4j_chain = create_agent(model, tools=[], middleware=[prompt_with_neo4j_context])

In [None]:
# Example question for the Neo4j-backed RAG chain.
query_neo4j = "Who is a player played as a Top Lane in TALON Team."

# Stream the run and pretty-print the newest message of each streamed state.
for step in neo4j_chain.stream({"messages": [{"role": "user", "content": query_neo4j}]}, stream_mode="values"):
    step["messages"][-1].pretty_print()

# 9. Neo4j with triple data using Cypher queries

In [None]:
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
import os

# Graph handle used by the QA chain to inspect the schema and run Cypher.
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

# NOTE(review): allow_dangerous_requests=True lets LLM-generated Cypher run
# against the database; use credentials with the narrowest permissions possible.
chain = GraphCypherQAChain.from_llm(
    model, graph=graph, verbose=True, allow_dangerous_requests=True
)

# `Chain.run` is deprecated in favour of `invoke`. The question is passed under
# the "query" key and the answer is read from "result", so the cell still
# displays the answer string.
chain.invoke({"query": "Who is a player of Bacon Time?"})["result"]