### Chromadb setup

Stockage des embeddings : 
- Enregistrement des vecteurs et de leurs métadonnées dans la base vectorielle ChromaDB :
    - Une collection pour les données d’entraînement
    - Une collection pour les données de test.

In [16]:
import chromadb
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [17]:
# load embeddings
embeddings = np.load("../data/embedding/embeddings.npy")

embeddings.shape

(14273, 384)

In [18]:
# Load original df
df = pd.read_csv("../data/processed/data.csv")
df = df.reset_index(drop=True)


prepare data

In [19]:
# Get train/test indices
train_idx, test_idx = train_test_split(
    range(len(df)),
    test_size=0.2, 
    random_state=42, 
    stratify=df['airline_sentiment']
)


# Add train data

train_df = df.iloc[train_idx].reset_index(drop=True)
train_embeddings = embeddings[train_idx]

# Add test data

test_df = df.iloc[test_idx].reset_index(drop=True)
test_embeddings = embeddings[test_idx]

print(f"Train: {len(train_df)}, Test: {len(test_df)}")


Train: 11418, Test: 2855


inserting

In [None]:

# chroma db setup
os.makedirs("../chromadb",exist_ok=True)
client = chromadb.PersistentClient("../chromadb")



# Create separate collections
train_collection = client.get_or_create_collection(name="airline_sentiment_train")
test_collection = client.get_or_create_collection(name="airline_sentiment_test")

BATCH_SIZE = 5000


# insert train data
if train_collection.count() == 0:

    for batch_start in range(0, len(train_df), BATCH_SIZE):

        batch_end = min(batch_start + BATCH_SIZE, len(train_df))
        
        # Get batch data
        batch_df = train_df.iloc[batch_start:batch_end]
        batch_embeddings = train_embeddings[batch_start:batch_end]

        try:
            train_collection.add(
                embeddings=batch_embeddings.tolist(),
                documents=batch_df['clean_text'].tolist(),
                metadatas=[
                    {
                        "label": str(row['airline_sentiment']),
                        "airline": str(row['airline'])
                    }
                    for _, row in batch_df.iterrows()
                ],
                ids=[f"train_{i}" for i in range(batch_start, batch_end)]
            )
        except Exception as e:
            print(f"Error inserting train: {e}")
            raise
else:
    print("data aleardy exists in train")

# instert test data
if test_collection.count() == 0:

    for batch_start in range(0, len(test_df), BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, len(test_df))
            
        # Get batch data
        batch_df = test_df.iloc[batch_start:batch_end]
        batch_embeddings = test_embeddings[batch_start:batch_end]
        
        try:
            test_collection.add(
                embeddings=batch_embeddings.tolist(),
                documents=batch_df['clean_text'].tolist(),
                metadatas=[
                    {
                        "label": str(row['airline_sentiment']),
                        "airline": str(row['airline'])
                    }
                    for _, row in batch_df.iterrows()
                ],
                ids=[f"test_{i}" for i in range(batch_start, batch_end)]
            )
        except Exception as e:
            print(f"Error inserting train data: {e}")
            raise
else:
    print("data aleardy exists in test")


print(f"Train collection: {train_collection.count()} documents")
print(f"Test collection:  {test_collection.count()} documents")
print(f"Total:            {train_collection.count() + test_collection.count()} documents")

Train collection: 11418 documents
Test collection:  2855 documents
Total:            14273 documents


In [23]:
print(client.list_collections())

collection = client.get_collection("airline_sentiment_train")

# Check collection info
print(collection.count())  # number of items in the collection


[Collection(name=airline_sentiment_train), Collection(name=airline_sentiment_test)]
11418


In [24]:
collection.get(include=["documents", "metadatas"], limit=5)


{'ids': ['train_0', 'train_1', 'train_2', 'train_3', 'train_4'],
 'embeddings': None,
 'documents': ['you are the Official airlines of DivadaPouch aka ThePoopQueen',
  'just Cancelled Flighted my flight and told me to call to rebook. Been on hold for 48 minutes at 4 am and still waiting',
  "Hi, Virgin! I'm on hold for 40-50 minutes -- are there any earlier flights from LA to NYC tonight; earlier than 11:50pm?",
  'any ways to get through the 50 minute wait to book a flight?',
  "but you guys switched me and didn't inform me of the chathes"],
 'uris': None,
 'included': ['documents', 'metadatas'],
 'data': None,
 'metadatas': [{'label': 'positive', 'airline': 'Southwest'},
  {'label': 'negative', 'airline': 'US Airways'},
  {'airline': 'Virgin America', 'label': 'negative'},
  {'airline': 'American', 'label': 'negative'},
  {'label': 'negative', 'airline': 'US Airways'}]}

In [25]:
train_collection = client.get_collection("airline_sentiment_train")



# Query similar tweets
results = train_collection.query(
    query_texts=["Flight was delayed for hours"],
    n_results=5,
    include=["documents", "metadatas", "distances"]
)

print("Similar tweets:")
for doc, metadata, distance in zip(
    results['documents'][0], 
    results['metadatas'][0], 
    results['distances'][0]
):
    print(f"\nText: {doc}")
    print(f"Label: {metadata['label']}")
    print(f"Distance: {distance:.4f}")

Similar tweets:

Text: Gerne :)
Label: neutral
Distance: 8.2260

Text: thanks for the response. I know it's not your fault... But Im in ORD in T5 and hungry if you want to stop by
Label: negative
Distance: 9.1247

Text: Please try it yourself - call 1-800-433-7300 and see what happens... then you'll understand. allrepresentativesbusy nooption
Label: negative
Distance: 9.2172

Text: ok!!! That's super helpful. Thank you. I'll reach out if I have any other questions.
Label: positive
Distance: 9.2285

Text: are u paying incedentals? noworstairline
Label: neutral
Distance: 9.3978
