In [None]:
!pip install -q kaggle
from google.colab import files
files.upload()
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d asaniczka/tmdb-movies-dataset-2023-930k-movies
!unzip tmdb-movies-dataset-2023-930k-movies.zip

In [None]:
!pip install -U langchain-community
!pip install token-count
%pip install langchain langchain-community
%pip install langchain-openai
%pip install pymysql
%pip install tidb-vector
%pip install --upgrade --quiet  langchain-google-genai pillow

In [None]:
import pandas as pd
from token_count import TokenCount

# Read the TMDB CSV into a DataFrame and preview its first ten rows.
TMDB_CSV_PATH = 'TMDB_movie_dataset_v11.csv'
df = pd.read_csv(TMDB_CSV_PATH)
df.head(10)

In [None]:
import requests
from langchain_community.embeddings import JinaEmbeddings
from numpy import dot
from numpy.linalg import norm
from PIL import Image
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import TiDBVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

import os

# Connection settings. Real values are read from the environment when present so
# secrets do not have to be pasted into the notebook; the placeholder strings
# remain as the fallback for anyone editing the cell by hand.
tidb_connection_string = os.environ.get(
    "TIDB_CONNECTION_STRING", "YOUR_TIDB_CONNECTION_STRING"
)

# Embedding model used for every text written to the vector store.
jina_embedding = JinaEmbeddings(
    jina_api_key=os.environ.get("JINA_API_KEY", "YOUR_JINA_API_KEY"),
    model_name="jina-embeddings-v2-base-en",
)

TABLE_NAME = "YOUR_TABLE_NAME"

# Open the vector store directly instead of going through from_texts([""]):
# that call embedded an empty string and stored it as a document purely to
# obtain a store handle.
db = TiDBVectorStore(
    connection_string=tidb_connection_string,
    embedding_function=jina_embedding,
    table_name=TABLE_NAME,
    distance_strategy="cosine",  # default, another option is "l2"
)

In [None]:
import pandas as pd
# Re-read the dataset so this cell does not depend on an earlier cell's `df`.
df = pd.read_csv('TMDB_movie_dataset_v11.csv')

# Columns copied into each record's metadata, in the order they are stored.
METADATA_COLUMNS = [
    "id",
    "title",
    "tagline",
    "genres",
    "release_date",
    "budget",
    "revenue",
    "runtime",
    "popularity",
    "vote_average",
    "vote_count",
]


def _none_if_missing(value):
    """Return None for a pandas missing value (NaN/None/NaT), else the value unchanged."""
    return None if pd.isna(value) else value


# One {"text", "metadata"} entry per CSV row, in row order, so that positions in
# this list stay aligned with row positions in the DataFrame.
text_datasets_with_metadata = []
# itertuples over just the needed columns is far cheaper than iterrows, which
# builds a full Series for every row.
for overview, *values in df[["overview", *METADATA_COLUMNS]].itertuples(index=False, name=None):
    # NaN is not valid JSON, so missing metadata values are stored as None.
    metadata = {
        column: _none_if_missing(value)
        for column, value in zip(METADATA_COLUMNS, values)
    }
    # A missing tagline is kept as an empty string.
    if metadata["tagline"] is None:
        metadata["tagline"] = ""
    text_datasets_with_metadata.append({
        # A missing overview becomes "" rather than a float NaN posing as text.
        "text": "" if pd.isna(overview) else overview,
        "metadata": metadata,
    })

text_datasets_with_metadata[:5]


In [None]:
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to process each text_data item
def process_text_data(i, text_data, added_indices, db):
    """Insert one record into the vector store unless it should be skipped.

    Args:
        i: Index of the record; returned unchanged when the insert succeeds.
        text_data: Dict with a "text" entry and a "metadata" dict.
        added_indices: Container of indices already inserted (anything
            supporting ``in``); a record whose index is present is skipped.
        db: Object exposing ``add_texts(texts=..., metadatas=...)``.

    Returns:
        ``i`` if the record was inserted. ``None`` if it was skipped (already
        added, empty metadata, missing/blank text) or if the insert raised.
    """
    if i in added_indices:
        return None

    metadata = text_data["metadata"]
    if not metadata:
        return None

    text = text_data["text"]
    # A missing text value (e.g. a float NaN coming from pandas) or a blank
    # string has nothing to embed, so skip it instead of sending it on.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        db.add_texts(texts=[text], metadatas=[metadata])
    except Exception as e:
        # The id may be numeric, so format it instead of concatenating (string
        # concatenation raised TypeError inside this handler); also report the
        # underlying error rather than discarding it.
        print(f"ERROR: {metadata.get('id')}: {e!r}")
        return None

    # If successful, return the index to add to the list
    return i

import os

CHECKPOINT_PATH = 'added_indices.json'
# Number of successful inserts between checkpoint writes. Rewriting the whole
# list after every single insert makes total I/O grow quadratically.
CHECKPOINT_EVERY = 500


def save_added_indices(indices, path=CHECKPOINT_PATH):
    """Write ``indices`` to ``path`` as JSON via a temp file and atomic rename.

    Writing to a sibling temp file first means an interrupted run cannot leave
    a truncated checkpoint behind.
    """
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(indices, f)
    os.replace(tmp_path, path)


# Load existing added indices from file, if it exists
try:
    with open(CHECKPOINT_PATH, 'r') as f:
        added_indices = json.load(f)
except FileNotFoundError:
    added_indices = []

# Immutable snapshot for the workers: O(1) membership tests, and they never
# read the list that the main thread appends to below.
already_added = frozenset(added_indices)

# Create a ThreadPoolExecutor for multithreading
with ThreadPoolExecutor(max_workers=10) as executor:
    # Only submit work for records that were not ingested by a previous run.
    futures = [
        executor.submit(process_text_data, i, text_data, already_added, db)
        for i, text_data in enumerate(text_datasets_with_metadata)
        if i not in already_added
    ]

    unsaved = 0
    try:
        # Process the results as they complete
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if result is not None:
                added_indices.append(result)
                unsaved += 1
                if unsaved >= CHECKPOINT_EVERY:
                    save_added_indices(added_indices)
                    unsaved = 0
    finally:
        # On an error or interrupt, drop tasks that have not started so the
        # executor can shut down promptly; cancel() is a no-op for tasks that
        # are running or finished.
        for future in futures:
            future.cancel()
        # Persist whatever progress has not been checkpointed yet.
        if unsaved:
            save_added_indices(added_indices)