In [1]:
from pathlib import Path

import pandas as pd
import tqdm
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.graphs import Neo4jGraph
from pydantic_settings import BaseSettings, SettingsConfigDict

In [2]:

class Settings(BaseSettings):
    """Configuration for this notebook, loaded by pydantic-settings.

    Values are read from the process environment and from the ``.env``
    file named in ``model_config``. No field declares a default, so each
    one must be supplied or ``Settings()`` raises a validation error.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_ignore_empty=True,  # empty variables count as "not set"
        extra="ignore",  # keys in .env that are not declared here are skipped
    )

    # --- Neo4j connection ---
    NEO4J_URL: str
    NEO4J_USERNAME: str
    NEO4J_PASSWORD: str

    # --- Groq (declared here; not referenced by the cells shown) ---
    GROQ_API_KEY: str
    GROQ_MODEL_ID: str

    # --- Ollama embedding model ---
    OLLAMA_EMBEDDING_MODEL_ID: str


# Single shared settings instance used by the cells below.
settings = Settings()

In [3]:
# Build the Neo4j client from the credentials held in `settings`.
# Later cells use this object via `graph.query(...)` and `graph.schema`.
graph = Neo4jGraph(
    url=settings.NEO4J_URL,
    username=settings.NEO4J_USERNAME,
    password=settings.NEO4J_PASSWORD,
)

In [4]:
# Sanity check: fetch a single, well-known movie to confirm the connection
# works and to see what the title / plot / poster properties look like.
result = graph.query(
    """
MATCH (m:Movie{title: 'Toy Story'}) 
RETURN m.title, m.plot, m.poster
"""
)

# One DataFrame row per returned record; column names are the RETURN
# expressions (see the output below).
data = pd.DataFrame(data=result)

data

Unnamed: 0,m.title,m.plot,m.poster
0,Toy Story,A cowboy doll is profoundly threatened and jea...,https://image.tmdb.org/t/p/w440_and_h660_face/...


In [5]:
# Print the graph schema text (node properties, relationship properties and
# relationship patterns) so the available Movie fields are visible.
print(graph.schema)

Node properties are the following:
Movie {url: STRING, runtime: INTEGER, revenue: INTEGER, embedding: LIST, imdbRating: FLOAT, released: STRING, countries: LIST, languages: LIST, plot: STRING, imdbVotes: INTEGER, imdbId: STRING, year: INTEGER, poster: STRING, movieId: STRING, tmdbId: STRING, title: STRING, budget: INTEGER},Genre {name: STRING},User {userId: STRING, name: STRING},Actor {url: STRING, name: STRING, tmdbId: STRING, bornIn: STRING, bio: STRING, died: DATE, born: DATE, imdbId: STRING, poster: STRING},Director {url: STRING, bornIn: STRING, bio: STRING, died: DATE, born: DATE, imdbId: STRING, name: STRING, poster: STRING, tmdbId: STRING},Person {url: STRING, bornIn: STRING, bio: STRING, died: DATE, born: DATE, imdbId: STRING, name: STRING, poster: STRING, tmdbId: STRING}
Relationship properties are the following:
RATED {rating: FLOAT, timestamp: INTEGER},ACTED_IN {role: STRING},DIRECTED {role: STRING}
The relationships are the following:
(:Movie)-[:IN_GENRE]->(:Genre),(:User)-

In [8]:
# Get all movies: one record per Movie node with the fields needed downstream.
result = graph.query("""
MATCH (m:Movie)
RETURN m.movieId, m.title, m.plot, m.poster
""")

# Select the returned keys by name and rename them explicitly. Unlike a
# positional `data.columns = [...]` assignment, this does not depend on the
# key order of the records and still yields a correctly-labelled (empty)
# frame when the query returns no rows.
data = pd.DataFrame(
    result,
    columns=['m.movieId', 'm.title', 'm.plot', 'm.poster'],
).rename(
    columns={
        'm.movieId': 'id',
        'm.title': 'title',
        'm.plot': 'plot',
        'm.poster': 'poster',
    }
)

# Save the data. Create the output directory first so the cell also works
# on a fresh checkout where `data/` does not exist yet.
movies_csv_path = Path('data/movies.csv')
movies_csv_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(movies_csv_path, index=False)

# Preview as the cell's last expression so it is actually rendered
# (a bare `data` in the middle of a cell displays nothing).
data.head()


In [9]:
# Calculate embeddings
# Show which embedding model is configured, then build the client that the
# later cells reuse as `embeddings`.
print(settings.OLLAMA_EMBEDDING_MODEL_ID)
embeddings = OllamaEmbeddings(model=settings.OLLAMA_EMBEDDING_MODEL_ID)

# Smoke test: embed one short sentence to confirm the model responds.
text = "This is a test document."
embedding = embeddings.embed_query(text)

# Display only the first five components; the full vector is not useful to read.
embedding[:5]

mxbai-embed-large


[0.21226204931735992,
 -0.13709497451782227,
 0.19398094713687897,
 0.715175449848175,
 -0.46527099609375]

In [21]:
# Get embeddings for all movies
# Collect one plain record per movie and build the DataFrame once at the end.
# Growing a DataFrame with `pd.concat` inside the loop copies every existing
# row on each iteration (quadratic) and, on recent pandas versions, warns
# when concatenating onto an empty frame.
embedding_records = []

for movie_id, plot in tqdm.tqdm(zip(data['id'], data['plot']), total=len(data)):
    # `embed_query` is kept on purpose so the stored vectors match what this
    # cell produced before.
    # NOTE(review): `embed_documents` may apply a different instruction
    # prefix than `embed_query` — confirm before switching to batching.
    embedding_records.append({
        'movieId': movie_id,
        'embedding': embeddings.embed_query(plot),
    })

# Explicit columns keep the header stable even when there are no movies.
data_embeddings = pd.DataFrame(embedding_records, columns=['movieId', 'embedding'])

# Save the data. Create the output directory first so the cell does not rely
# on an earlier cell (or a manual step) having created `data/`.
# Each embedding list is written to the CSV as its string representation.
embeddings_csv_path = Path('data/movies_embeddings.csv')
embeddings_csv_path.parent.mkdir(parents=True, exist_ok=True)
data_embeddings.to_csv(embeddings_csv_path, index=False)

100%|██████████| 9125/9125 [05:57<00:00, 25.49it/s]
