In [None]:

import re
from pathlib import Path

import joblib
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split


In [None]:
# Load the airline-sentiment dataset via the `datasets` library and convert
# its 'train' split into a pandas DataFrame for the rest of the notebook.
ds = load_dataset('7Xan7der7/us_airline_sentiment')
df = ds['train'].to_pandas()
# Preview the first rows.
df.head()

In [None]:
# Number of rows per sentiment label (class balance check).
df['airline_sentiment'].value_counts()


In [None]:
# Keep only the text and its sentiment label. The explicit .copy() makes `df`
# an independent frame, so the later column assignment (df['text'] = ...)
# does not trigger pandas' SettingWithCopyWarning.
df = df[['text', 'airline_sentiment']].copy()
# Summary statistics for both remaining columns.
df.describe(include='all')

In [None]:
# Number of distinct sentiment labels.
df['airline_sentiment'].nunique()

In [None]:
def clean_text(text):
    """Normalise a raw text before de-duplication and embedding.

    Steps, in order:
      1. remove URLs (anything starting with ``http`` or ``www.`` up to the
         next whitespace),
      2. remove ``@mentions``,
      3. replace every remaining non-alphanumeric character with a space,
      4. collapse whitespace runs, lowercase and trim.

    Args:
        text: Raw text. Any non-``str`` value (``None``, ``NaN``, numbers)
            is treated as missing.

    Returns:
        The cleaned, lowercased string; ``''`` when the input is missing.
    """
    if not isinstance(text, str):
        return ''
    # The previous pattern `http+` only matched "htt" followed by one or more
    # "p", so the remainder of every URL leaked into the text as junk tokens.
    text = re.sub(r'(?:http|www\.)\S+', ' ', text, flags=re.IGNORECASE)
    # Likewise `@+` only stripped the "@" sign itself; drop the whole handle.
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9]', ' ', text)
    # Collapse the gaps left by the removals so texts that differ only in
    # punctuation/spacing compare equal during de-duplication.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Normalise every text with clean_text().
df['text'] = df['text'].apply(clean_text)
# clean_text() maps missing / non-string values to '' (never NaN), so a
# dropna() on this column removes nothing; filter the empty strings instead.
df = df[df['text'] != '']
# Keep the first occurrence of each distinct cleaned text.
df = df.drop_duplicates(subset=['text'])

# DataFrame.to_csv does not create missing parent directories.
processed_path = Path('./data/processed/batch_processed.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)

df.describe(include='all')


In [None]:
import nlpaug
import nlpaug.augmenter.word as naw

import nltk
# Quietly fetch the NLTK resources requested by this notebook for the
# WordNet-based augmenters below.
for _nltk_resource in ('averaged_perceptron_tagger_eng', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

# WordNet synonym replacement, bounded by aug_min=1 / aug_max=3.
aug_synonym = naw.SynonymAug(
    aug_src='wordnet',
    aug_min=1,
    aug_max=3,
)

# NOTE(review): despite the name, this is also a SynonymAug (no insertion);
# it differs from aug_synonym only in aug_max=2 and aug_p=0.3.
aug_insert = naw.SynonymAug(
    aug_src='wordnet',
    aug_min=1,
    aug_max=2,
    aug_p=0.3,
)


In [None]:

def augment_text(text, augmenter, num_augmentations=1):
    """Generate augmented variants of ``text``.

    Args:
        text: Source string.
        augmenter: Object exposing ``augment(text)``. The call may return a
            single string or a list of strings; both shapes are accepted.
        num_augmentations: How many times ``augmenter.augment`` is invoked.

    Returns:
        A flat list of non-empty strings that differ from ``text``. The list
        is empty when every attempt failed or produced no change.
    """
    augmented_texts = []
    for _ in range(num_augmentations):
        try:
            result = augmenter.augment(text)
            if result is None:
                continue
            # Normalise to a list so a list-shaped result is not compared to
            # (and stored alongside) plain strings.
            candidates = [result] if isinstance(result, str) else list(result)
        except Exception:
            # Best effort: skip this attempt. Unlike a bare `except:`, this
            # still lets KeyboardInterrupt / SystemExit stop a long run.
            continue
        augmented_texts.extend(
            candidate
            for candidate in candidates
            if isinstance(candidate, str) and candidate and candidate != text
        )
    return augmented_texts

def _augment_minority_class(class_df, needed):
    """Build up to ``needed`` synthetic rows for one sentiment class.

    Rows are sampled with replacement from ``class_df`` and alternately passed
    through ``aug_insert`` (odd positions) and ``aug_synonym`` (even
    positions). Attempts that fail or leave the text unchanged are skipped,
    so fewer than ``needed`` records may be returned.

    Args:
        class_df: Rows of a single class with ``text`` and
            ``airline_sentiment`` columns.
        needed: Number of source rows to sample; values <= 0 yield no rows.

    Returns:
        List of ``{'airline_sentiment', 'text'}`` dicts holding the
        AUGMENTED text.
    """
    if needed <= 0 or class_df.empty:
        return []

    records = []
    samples = class_df.sample(n=needed, replace=True, random_state=42)
    for idx, row in enumerate(samples.itertuples(), 1):
        if idx % 500 == 0:
            print(f"   Processed {idx}/{needed}...")

        augmenter = aug_synonym if idx % 2 == 0 else aug_insert
        aug_texts = augment_text(row.text, augmenter, num_augmentations=1)
        if not aug_texts:
            continue

        new_text = aug_texts[0]
        # Tolerate a list-shaped entry in case augment_text forwards the
        # augmenter's raw list result.
        if isinstance(new_text, (list, tuple)):
            new_text = new_text[0] if new_text else ''
        if not new_text or new_text == row.text:
            continue

        # Store the augmented text. Previously the original row.text was
        # appended here (with a stray leading space for the neutral class),
        # so the "augmented" rows were plain duplicates of existing ones.
        records.append({
            'airline_sentiment': row.airline_sentiment,
            'text': new_text,
        })
    return records


print("=" * 60)
print("AUGMENTING MINORITY CLASSES")
print("=" * 60)

negative_df = df[df['airline_sentiment'] == 'negative']
neutral_df = df[df['airline_sentiment'] == 'neutral']
positive_df = df[df['airline_sentiment'] == 'positive']

# The negative class is used as the balancing target.
max_count = len(negative_df)

# Clamp at zero: a class that is already at least as large as the target needs
# no augmentation, and DataFrame.sample() rejects a negative n.
neutral_needed = max(max_count - len(neutral_df), 0)
positive_needed = max(max_count - len(positive_df), 0)

print(f"\nAugmentation targets:")
print(f"  • Neutral class:  {len(neutral_df):,} → {max_count:,} (need {neutral_needed:,} more)")
print(f"  • Positive class: {len(positive_df):,} → {max_count:,} (need {positive_needed:,} more)")

augmented_data = []

print(f"\nAugmenting neutral class...")
neutral_records = _augment_minority_class(neutral_df, neutral_needed)
augmented_data.extend(neutral_records)
print(f"Neutral class augmented: {len(neutral_records)} new samples")

print(f"\nAugmenting positive class...")
positive_records = _augment_minority_class(positive_df, positive_needed)
augmented_data.extend(positive_records)
print(f"Positive class augmented: {len(positive_records)} new samples")

# Explicit columns keep the frame well-formed even when nothing was generated.
augmented_df = pd.DataFrame(augmented_data, columns=['airline_sentiment', 'text'])

In [None]:
# Stack the generated rows under the existing ones with a fresh 0..n-1 index.
df_balanced = pd.concat([df, augmented_df], ignore_index=True)

# Shuffle all rows with a fixed seed, then renumber the index from 0.
df = (
    df_balanced
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Summary of the combined (original + generated) frame after shuffling.
df.describe()


In [None]:
# Sentence-embedding model used to vectorise the texts.
embed_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Positional 0..n-1 index so row i of `embeddings` corresponds to df.iloc[i].
df = df.reset_index(drop=True)
# encode() is documented for a str or a list of str. Passing a plain list
# (instead of a pandas Series, whose integer indexing depends on its index
# labels) keeps the call independent of the frame's index and matches how
# the texts are handed to the vector store later on.
embeddings = embed_model.encode(
    df['text'].astype(str).tolist(),
    show_progress_bar=True,
    convert_to_numpy=True,
)

In [None]:
# Display the embeddings returned by encode() (requested as a NumPy array
# via convert_to_numpy=True).
embeddings

In [None]:
import math
import chromadb

chroma_client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when the collection already exists in the running kernel's client.
collection = chroma_client.get_or_create_collection(
    name="aerostream_reviews",
)

documents = df["text"].astype(str).tolist()
sentiments = df["airline_sentiment"].tolist()

# One embedding per document is required for the batched slices below to
# stay aligned; fail early if an upstream cell was re-run out of order.
assert len(embeddings) == len(documents), (
    f"{len(embeddings)} embeddings for {len(documents)} documents"
)

# review_id is the row position, which is also used (as a string) for the id.
metadatas = [
    {
        "review_id": position,
        "airline_sentiment": sentiment,
    }
    for position, sentiment in enumerate(sentiments)
]

ids = [str(i) for i in range(len(documents))]

# Insert in fixed-size batches rather than in a single call.
BATCH_SIZE = 5000
num_batches = math.ceil(len(ids) / BATCH_SIZE)

for batch_idx in range(num_batches):
    start = batch_idx * BATCH_SIZE
    end = start + BATCH_SIZE

    # upsert (rather than add) so that re-running the cell overwrites the
    # entries stored under the same ids instead of colliding with them.
    collection.upsert(
        ids=ids[start:end],
        embeddings=embeddings[start:end].tolist(),
        documents=documents[start:end],
        metadatas=metadatas[start:end],
    )

    print(f"Inserted batch {batch_idx + 1}/{num_batches}")

print(f"\nStored {len(ids)} vectors successfully in ChromaDB")


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report , f1_score
from sklearn.metrics import confusion_matrix 


# Read back everything stored in the collection. The split below does not use
# this result; it works on the in-memory `embeddings` and `df` instead.
all_items = collection.get(include=['embeddings', 'metadatas', 'documents'])

# Features are the sentence embeddings, targets are the sentiment labels.
X = embeddings
y = df['airline_sentiment'].values

# Stratified 80/20 split with a fixed seed. df.index is split alongside X and
# y so every train/test sample can be traced back to its row in `df`.
(
    X_train, X_test,
    y_train, y_test,
    idx_train, idx_test,
) = train_test_split(
    X,
    y,
    df.index,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


def evaluate_model(name, model):
    """Print train/test metrics for an already-fitted classifier.

    Relies on the notebook-level ``X_train`` / ``y_train`` / ``X_test`` /
    ``y_test`` split; the model is not fitted here.

    Args:
        name: Label shown in the header line.
        model: Fitted estimator exposing ``score(X, y)`` and ``predict(X)``.

    Returns:
        None. Prints the train accuracy, test accuracy, macro-averaged F1,
        the train-minus-test accuracy gap, the classification report and the
        confusion matrix.
    """
    print(f"=== {name} ===")

    accuracy_on_train = model.score(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy_on_test = accuracy_score(y_test, predictions)
    macro_f1 = f1_score(y_test, predictions, average="macro")

    # Scalar metrics, printed as "<label> <value>" in this order.
    summary = (
        ("Train accuracy:", accuracy_on_train),
        ("Test accuracy :", accuracy_on_test),
        ("F1-score :", macro_f1),
        ("Gap:", accuracy_on_train - accuracy_on_test),
    )
    for label, value in summary:
        print(label, value)

    # Per-class report first, then the confusion matrix.
    for build_report in (classification_report, confusion_matrix):
        print(build_report(y_test, predictions))

