# Installation:-

In [None]:
!pip install chromadb epitran panphon numpy

## To install epitran dependencies:-

In [None]:
!pip install chromadb epitran panphon numpy
# Install espeak-ng based on the operating system
import platform
from subprocess import check_output, CalledProcessError
if platform.system() == "Linux":
    !apt-get update
    !apt-get install -y espeak-ng
elif platform.system() == "Windows":
    try:
        check_output(['choco', 'install', 'espeak-ng'], text=True)
    except CalledProcessError as e:
        print(f"Error installing espeak-ng: {e}")

In [None]:
!sudo apt-get update
!sudo apt-get install -y build-essential git wget
# Clone the Flite repository
!git clone https://github.com/festvox/flite.git
%cd flite
# Build Flite
!./configure && make
# Build lex_lookup
%cd testsuite
!make lex_lookup
# Move lex_lookup to a global path
!sudo cp lex_lookup /usr/local/bin

In [None]:
%cd ..
%cd ..

/root/Vdb


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [None]:
!pwd

/root/Vdb


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# To IPA conversion:-

In [1]:
from epitran import Epitran

# Initialize Epitran for English
english_epi = Epitran('eng-Latn')

def english_to_ipa(text):
    """Converts English text to IPA using Epitran."""
    try:
        return english_epi.transliterate(text)
    except Exception as e:
        return f"Error: {e}"

# Initialize Epitran for Hindi
hindi_epi = Epitran('hin-Deva')

def hindi_to_ipa(text):
    """Converts Hindi text to IPA using Epitran."""
    try:
        return hindi_epi.transliterate(text)
    except Exception as e:
        return f"Error: {e}"

In [None]:
# from subprocess import check_output, CalledProcessError
# from epitran import Epitran
# def english_to_ipa(text):
#     """Converts English text to IPA using espeak-ng."""
#     try:
#         ipa = check_output(['espeak-ng', '-q', '--ipa', text], text=True).strip()
#         return ipa
#     except CalledProcessError as e:
#         return f"Error: {e}"
# hindi_epi = Epitran('hin-Deva')
# def hindi_to_ipa(text):
#     """Converts Hindi text to IPA using epitran."""
#     try:
#         return hindi_epi.transliterate(text)
#     except Exception as e:
#         return f"Error: {e}"

In [2]:
print(english_to_ipa("aarthy"))

ɑɹθi


# Database actions:-

In [2]:
import panphon
import panphon.distance
import numpy as np
from chromadb.config import Settings
from chromadb import Client
import chromadb
import os
# Initialize PanPhon
ft = panphon.FeatureTable()

In [3]:
# Specify the persistence directory
persist_directory = "Vdb(epi)"
os.makedirs(persist_directory, exist_ok=True)
client = chromadb.PersistentClient(path=persist_directory)

## To list collections:-

In [4]:
# Get a list of all collection names
collection_names = client.list_collections()

# Print the collection names
for name in collection_names:
    print(name)

Collection(name=vdb_60k_ip)
Collection(name=vdb_l2)
Collection(name=vdb_central_l2)
Collection(name=vdb_cosine)
Collection(name=vdb_ip)


## To create a new vdb:-

In [None]:
# Create a ChromaDB collection
# collection = client.create_collection( name="vdb_l2")
collection = client.create_collection( name="vdb_central_l2",metadata={"hnsw:space": "l2"} )


#To delete a collection:-

In [None]:
# client.delete_collection(name="vdb_60k_ip")

## or load an existing db:-

In [4]:
collection = client.get_collection("vdb_central_l2")

In [None]:
collection = client.get_collection("vdb_l2")

In [16]:
collection = client.get_collection("vdb_cosine")

In [None]:
collection = client.get_collection("vdb_ip")

## Function definitions:-

In [5]:
# Function to compute the average feature vector (same as before)
def ipa2vec(ipa):
    vectors = ft.word_to_vector_list(ipa, numeric=True)
    processed_vectors = np.array(vectors)
    avg_vector = np.mean(processed_vectors, axis=0)
    return avg_vector.tolist()  # ChromaDB needs lists


def embedder(names, pronunciations):
    """Adds pronunciations to the ChromaDB collection."""

    if len(names) != len(pronunciations):
        raise ValueError("Names and pronunciations lists must have the same length.")

    embeddings = [ipa2vec(ipa) for ipa in pronunciations]
    collection.add(
        documents=names,
        embeddings=embeddings,
        metadatas=[{"ipa": ipa} for ipa in pronunciations],  # Store IPA for reference
        ids=[str(i) for i in range(len(names))] # provide IDs to avoid issues
    )


In [None]:
import random

def update_age():
    # Generate random ages for each document in the collection
    random_ages = [random.randint(18, 70) for _ in range(collection.count())]
    
    # Update the collection with new ages
    for i, age in enumerate(random_ages):
        collection.update(
            ids=[str(i)],
            metadatas=[{"age": age}]
        )

# update_age()

In [10]:
# Get the total number of documents in the collection
total_documents = collection.count()
print(f"Total number of documents in the collection: {total_documents}")

Total number of documents in the collection: 55691


In [49]:
def chromaquerrier(query_ipa, n_results=5):
    """Retrieves similar names from the ChromaDB vdb."""
    query_embedding = ipa2vec(query_ipa)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=["distances", "metadatas", "documents"] #include additional data
    )
    return results

# Querrier with weighted levenshtein edit distance:-

In [6]:
def querrier(query_ipa, n_results=30, weight_chromadb=0.7, weight_edit_distance=0.3, cutoff=0.7):
    """Retrieves similar names with combined ChromaDB and edit distance scoring."""
    query_embedding = ipa2vec(query_ipa)
    results = collection.query(
        query_embeddings=[query_embedding],
        include=["distances", "metadatas", "documents"]
    )
    dst = panphon.distance.Distance()
    # Calculate combined scores
    scores = []
    for i in range(len(results["documents"][0])):
        chromadb_distance = results["distances"][0][i]
        metadata = results["metadatas"][0][i]

        if metadata and "ipa" in metadata:
            edit_distance = dst.weighted_feature_edit_distance(query_ipa, metadata["ipa"])
            combined_score = weight_chromadb * chromadb_distance + weight_edit_distance * edit_distance
            if collection.name == "vdb_l2" and combined_score > cutoff:
                continue  # Skip results with a combined score greater than cutoff
            scores.append((i, combined_score))  # Store index and score

    # Sort by combined scores and select top results
    scores.sort(key=lambda item: item[1])  # Sort by score
    top_indices = [item[0] for item in scores[:n_results]]

    # Extract top results
    filtered_results = {
        "documents": [results["documents"][0][i] for i in top_indices],
        "metadatas": [results["metadatas"][0][i] for i in top_indices],
        "distances": [results["distances"][0][i] for i in top_indices],
    }
    return filtered_results

## To input data to the database:-

### To input English data :-

In [18]:
# Define the file path
input_path = "/content/Names.txt"
# Read names from file
with open(input_path, "r") as file:
    names = [line.strip() for line in file.readlines()]

# Convert names to IPA
ipas = [english_to_ipa(name) for name in names]

In [None]:
# Store the IPA list as a binary file
import pickle
ipa_binary_output_path = "ipas_epi.pkl"
with open(ipa_binary_output_path, "wb") as file:
    pickle.dump(ipas, file)
# To read the IPA list back from the binary file
# with open(ipa_binary_output_path, "rb") as file:
#     ipas = pickle.load(file)

In [19]:
print(ipas)

[]


# To enter large data:-(>40k)

## for multi part names:-

In [12]:
import random
import math
import re
import numpy as np

# Define the file path
input_path = "central(56k).txt"

# Read names from file
with open(input_path, "r") as file:
    names = [line.strip() for line in file.readlines()]

# Function to vectorize name parts
def vectorize_name_parts(name):
    # Remove special characters
    cleaned_name = re.sub(r'[^\w\s]', '', name)
    parts = [part for part in cleaned_name.split() if len(part) > 2]
    ipa_parts = [english_to_ipa(part) for part in parts]
    vectors = [ipa2vec(ipa) for ipa in ipa_parts]
    return parts, ipa_parts, vectors

# Calculate the number of batches
batch_size = 10000  # Choose a batch size less than the maximum
num_batches = math.ceil(len(names) / batch_size)

# Add data in batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(names))

    batch_names = names[start_idx:end_idx]
    batch_embeddings = []
    batch_metadatas = []
    batch_ids = []
    for name_idx, name in enumerate(batch_names):
        parts, ipa_parts, vectors = vectorize_name_parts(name)
        if vectors:
            age = random.randint(18, 70)
            for part_idx, (part, ipa, vector) in enumerate(zip(parts, ipa_parts, vectors)):
                if isinstance(vector, np.ndarray) and vector.ndim == 1 and not np.isnan(vector).any():
                    batch_embeddings.append(vector)
                    batch_metadatas.append({"full_name": name, "part": part, "ipa": ipa, "age": age})
                    batch_ids.append(f"{start_idx + name_idx}_{part_idx}")
                else:
                    reason = "not a 1D array" if not isinstance(vector, np.ndarray) or vector.ndim != 1 else "contains NaN values"
                    print(f"Skipping vector for part '{part}' in name '{name}'({reason}) due to invalid vector: {vector} ")

    if batch_embeddings:  # Ensure there is data to add
        collection.add(
            documents=[metadata["full_name"] for metadata in batch_metadatas],
            embeddings=batch_embeddings,
            ids=batch_ids,
            metadatas=batch_metadatas
        )
        print(f"Added batch {i + 1} of {num_batches}")


Skipping vector for part 'Ansari' in name 'A Ansari'(not a 1D array) due to invalid vector: [0.0, 0.6666666666666666, -0.3333333333333333, 0.6666666666666666, -0.6666666666666666, -1.0, -0.6666666666666666, -0.6666666666666666, 0.6666666666666666, -1.0, -1.0, 0.5, 0.0, -0.5, -1.0, -0.3333333333333333, -0.3333333333333333, -0.3333333333333333, -0.6666666666666666, -1.0, 0.5, -1.0, 0.0, 0.0] 
Skipping vector for part 'Aziz' in name 'A Aziz'(not a 1D array) due to invalid vector: [0.0, 0.0, 0.0, 1.0, -1.0, -1.0, -1.0, 0.0, 1.0, -1.0, -1.0, 0.5, 0.0, -0.5, -1.0, -0.5, -1.0, -0.5, -1.0, -1.0, 0.0, -1.0, 0.0, 0.0] 
Skipping vector for part 'Bardhan' in name 'A B Bardhan'(not a 1D array) due to invalid vector: [-0.3333333333333333, 0.3333333333333333, 0.0, 0.0, -0.8333333333333334, -1.0, -0.6666666666666666, -1.0, 1.0, -1.0, -1.0, 0.6666666666666666, 0.0, -0.5, -0.6666666666666666, -0.6666666666666666, -0.6666666666666666, -0.3333333333333333, -0.6666666666666666, -1.0, 0.0, -1.0, 0.0, 0.0] 


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Skipping vector for part 'Ajit' in name 'Ajit Kumar Sarmah'(not a 1D array) due to invalid vector: [0.0, 0.0, 0.0, 0.0, -0.25, -1.0, -1.0, -0.5, 0.5, -1.0, -1.0, 0.0, 0.0, 0.0, -1.0, -0.5, -0.5, -0.5, -1.0, -1.0, 0.0, -1.0, 0.0, 0.0] 
Skipping vector for part 'Kumar' in name 'Ajit Kumar Sarmah'(not a 1D array) due to invalid vector: [-0.2, 0.6, -0.2, 0.2, -0.8, -1.0, -0.6, -1.0, 0.6, -1.0, -1.0, 0.2, -0.6, -0.2, -0.2, 0.2, -0.6, 0.2, -0.2, -1.0, 0.4, -1.0, 0.0, 0.0] 
Skipping vector for part 'Sarmah' in name 'Ajit Kumar Sarmah'(not a 1D array) due to invalid vector: [-0.2, 0.6, -0.2, 0.6, -0.8, -1.0, -0.6, -0.6, 0.6, -1.0, -1.0, 0.6, -0.2, -0.4, -0.6, -0.6, -0.2, -0.6, -0.6, -1.0, 0.4, -1.0, 0.0, 0.0] 
Skipping vector for part 'Ajit' in name 'Ajit Kumar Singh'(not a 1D array) due to invalid vector: [0.0, 0.0, 0.0, 0.0, -0.25, -1.0, -1.0, -0.5, 0.5, -1.0, -1.0, 0.0, 0.0, 0.0, -1.0, -0.5, -0.5, -0.5, -1.0, -1.0, 0.0, -1.0, 0.0, 0.0] 
Skipping vector for part 'Kumar' in name 'Ajit Kumar S

KeyboardInterrupt: 

chroma won't allow u to add >40k entries in a single add request, so u have to do it in batches and also think about fixing the unique id problem

In [None]:
import math
# Calculate the number of batches
batch_size = 40000  # Choose a batch size less than the maximum
num_batches = math.ceil(len(names) / batch_size)

# Add data in batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(names))

    batch_names = names[start_idx:end_idx]
    batch_ipas = ipas[start_idx:end_idx]
    batch_embeddings = [ipa2vec(ipa) for ipa in batch_ipas]

    # Generate unique IDs for each document in the batch
    batch_ids = [str(j) for j in range(start_idx, end_idx)]

    # Create metadata for the batch
    batch_metadatas = [{"ipa": ipa,"age":"69"} for ipa in batch_ipas]

    collection.add(
        documents=batch_names,
        embeddings=batch_embeddings,
        ids=batch_ids,
        metadatas=batch_metadatas  # Add metadatas here
    )
    print(f"Added batch {i + 1} of {num_batches}")

Added batch 1 of 2
Added batch 2 of 2


#Else:-

In [None]:
# Add pronunciations to the ChromaDB collection
embedder(names, ipas)

# all_embeddings = collection.get(include=["embeddings"])["embeddings"]
# print("All embeddings:", all_embeddings)

### To input hindi data :-

In [None]:
# Define the file path
input_path = "sname.txt"
# Read names from file
with open(input_path, "r") as file:
    names = [line.strip() for line in file.readlines()]

# Convert names to IPA
ipas = [hindi_to_ipa(name) for name in names]

# Add pronunciations to the ChromaDB collection
embedder(names, ipas)

all_embeddings = collection.get(include=["embeddings"])["embeddings"]
# print("All embeddings:", all_embeddings)

# Querry:-

In [98]:
query_ipa = "ˈɑːdi"
results = querrier(query_ipa)
print(results['documents'])
print(results['metadatas'])
print(results['distances'])

['Aadi', 'Adi']
[{'age': '69', 'ipa': 'ɑdi'}, {'age': '69', 'ipa': 'ɑdi'}]
[0.4444443881511688, 0.4444443881511688]


In [9]:
# List of 20 Indian names
indian_names = [
    "Aarav", "Vivaan", "Aditya", "Vihaan", "Arjun",
    "Sai", "Reyansh", "Ayaan", "Krishna", "Ishaan",
    "Shaurya", "Atharv", "Dhruv", "Kabir", "Rudra",
    "Aarush", "Anay", "Om", "Parth", "Rishi"
]

# Loop through each name, convert to IPA, and query the ChromaDB collection
for name in indian_names:
    ipa = english_to_ipa(name)
    print(f"The IPA representation of '{name}' is: {ipa}")
    results = querrier(ipa)
    print(f"Results for '{name}':")
    print(results['documents'])
    # print(results['metadatas'])
    print("distance:-",results['distances'])
indian_names = [
    "Aarav", "Vivaan", "Aditya", "Vihaan", "Arjun",
    "Sai", "Reyansh", "Ayaan", "Krishna", "Ishaan",
    "Shaurya", "Atharv", "Dhruv", "Kabir", "Rudra",
    "Aarush", "Anay", "Om", "Parth", "Rishi"
]


The IPA representation of 'Aarav' is: æɹɑv
Results for 'Aarav':
['Aarav']
distance:- [0.0]
The IPA representation of 'Vivaan' is: vɪvɑn
Results for 'Vivaan':
['Vivaan']
distance:- [0.0]
The IPA representation of 'Aditya' is: ɑdɪtjə
Results for 'Aditya':
['Aditya', 'Aaditya', 'Aadittya']
distance:- [0.0, 0.1388888955116272, 0.1388888955116272]
The IPA representation of 'Vihaan' is: vihɑn
Results for 'Vihaan':
['Vihaan', 'Vyhaan']
distance:- [0.0, 0.1600000113248825]
The IPA representation of 'Arjun' is: ɑɹd͡ʒən
Results for 'Arjun':
['Arjun', 'Arjan']
distance:- [0.0, 0.0]
The IPA representation of 'Sai' is: saj
Results for 'Sai':
['Cy', 'Sai']
distance:- [0.0, 0.0]
The IPA representation of 'Reyansh' is: ɹejənʃ
Results for 'Reyansh':
['Rayansh', 'Reyansh', 'Reyaansh']
distance:- [0.0, 0.0, 0.0]
The IPA representation of 'Ayaan' is: ajɑn
Results for 'Ayaan':
['Ayaan']
distance:- [0.0]
The IPA representation of 'Krishna' is: kɹɪʃnə
Results for 'Krishna':
['Krishna', 'Chrishna']
distance:-

In [32]:
def detect_language(text):
    """Detects if the input text is Hindi or English."""
    if any('\u0900' <= char <= '\u097F' for char in text):
        return 'hindi'
    else:
        return 'english'

In [48]:
# prompt: so write code to input a name in english from the user, use the espeak -ng to convert it to ipa and then querry it
name=input()
print(language)
language = detect_language(name)

# Convert the name to IPA based on the detected language
if language == 'hindi':
    ipa = hindi_to_ipa(name)
    cutoff = 10
else:
    ipa = english_to_ipa(name)
    cutoff = 0.7

print(f"The IPA representation of '{name}' is: {ipa}")

# Query the ChromaDB collection
results = querrier(ipa,cutoff=cutoff)
print(f"The IPA representation of '{name}' is: {ipa}")

# Query the ChromaDB collection
results = querrier(ipa)
print(results['documents'])
print(results['metadatas'])
print(results['distances'])

hindi
The IPA representation of 'गौतमी' is: ɡɔːtmiː
The IPA representation of 'गौतमी' is: ɡɔːtmiː
[]
[]
[]


In [50]:
# Get user input
# example :- Enter a name in Hindi:अधि
name = input("Enter a name in Hindi: ")

# Convert the name to IPA
ipa = hindi_to_ipa(name)

print(f"The IPA representation of '{name}' is: {ipa}")

# Query the ChromaDB collection
results = chromaquerrier(ipa)
print(results['documents'])
print(results['metadatas'])
print(results['distances'])

The IPA representation of 'गौतमी' is: ɡɔːtmiː
[['Agampreet', 'Muttuk Kumaran', 'Kumbhkarna', 'Kruban', 'Pragnika']]
[[{'age': 41, 'ipa': 'əɡəmpɹit'}, {'age': 70, 'ipa': 'mʌtək kumɹ̩æn'}, {'age': 22, 'ipa': 'kʌmbkɑɹnə'}, {'age': 37, 'ipa': 'kɹʌbən'}, {'age': 42, 'ipa': 'pɹæɡnɪkə'}]]
[[0.8537500500679016, 0.9203306436538696, 0.9204938411712646, 0.9377778768539429, 0.9412500262260437]]


In [None]:
# Get user input
# example :- Enter a name in Hindi:अधि
name = input("Enter a name in English: ")

# Convert the name to IPA
ipa = hindi_to_ipa(name)

print(f"The IPA representation of '{name}' is: {ipa}")

# Query the ChromaDB collection
results = chromaquerrier(ipa)
print(results['documents'])
print(results['metadatas'])
print(results['distances'])