In [1]:

import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import re
from sklearn.model_selection import train_test_split
import joblib



In [2]:
ds = load_dataset('7Xan7der7/us_airline_sentiment')
df = ds['train'].to_pandas()
df.head()

us_airline_sentiment.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14640 [00:00<?, ? examples/s]

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,5.70306e+17,neutral,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,,2/24/2015 11:35,,Eastern Time (US & Canada)
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,,2/24/2015 11:15,,Pacific Time (US & Canada)
2,5.70301e+17,neutral,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/2015 11:15,Lets Play,Central Time (US & Canada)
3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,0,@VirginAmerica it's really aggressive to blast...,,2/24/2015 11:15,,Pacific Time (US & Canada)
4,5.70301e+17,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,0,@VirginAmerica and it's a really big bad thing...,,2/24/2015 11:14,,Pacific Time (US & Canada)


In [3]:
df['airline_sentiment'].value_counts()


airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

In [4]:
df = df[['text','airline_sentiment']]
df.describe(include='all')

Unnamed: 0,text,airline_sentiment
count,14640,14640
unique,14427,3
top,@united thanks,negative
freq,6,9178


In [5]:
df['airline_sentiment'].nunique()

3

In [6]:
def clean_text(text):
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http+', '', text)
    text = re.sub(r'@+', '', text)
    text = re.sub(r'[^A-Za-z0-9]', ' ', text)
    text = text.lower().strip()
    return text

df['text'] = df['text'].apply(clean_text)
df = df.drop_duplicates(subset=['text'])
df = df.dropna(subset=['text'])
# df.to_csv('./data/processed/batch_processed.csv', index=False)

df.describe(include='all')


Unnamed: 0,text,airline_sentiment
count,14360,14360
unique,14360,3
top,virginamerica what dhepburn said,negative
freq,1,9079


In [7]:
import nlpaug
import nlpaug.augmenter.word as naw

import nltk
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
aug_synonym = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=3)

aug_insert = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=2, aug_p=0.3)


In [8]:

def augment_text(text, augmenter, num_augmentations=1):
    augmented_texts = []
    for _ in range(num_augmentations):
        try:
            aug_text = augmenter.augment(text)
            if aug_text and aug_text != text:
                augmented_texts.append(aug_text)
        except:
            continue
    return augmented_texts

print("=" * 60)
print("AUGMENTING MINORITY CLASSES")
print("=" * 60)

negative_df = df[df['airline_sentiment'] == 'negative']
neutral_df = df[df['airline_sentiment'] == 'neutral']
positive_df = df[df['airline_sentiment'] == 'positive']

max_count = len(negative_df)

neutral_needed = max_count - len(neutral_df)
positive_needed = max_count - len(positive_df)

print(f"\nAugmentation targets:")
print(f"  • Neutral class:  {len(neutral_df):,} → {max_count:,} (need {neutral_needed:,} more)")
print(f"  • Positive class: {len(positive_df):,} → {max_count:,} (need {positive_needed:,} more)")

augmented_data = []

print(f"\nAugmenting neutral class...")
neutral_samples = neutral_df.sample(n=neutral_needed, replace=True, random_state=42)
for idx, row in enumerate(neutral_samples.itertuples(), 1):
    if idx % 500 == 0:
        print(f"   Processed {idx}/{neutral_needed}...")
    
    augmenter = aug_synonym if idx % 2 == 0 else aug_insert
    aug_texts = augment_text(row.text, augmenter, num_augmentations=1)
    
    if aug_texts:
        augmented_data.append({
            'airline_sentiment': row.airline_sentiment,
            'text': aug_texts[0]
        })

print(f"Neutral class augmented: {len([d for d in augmented_data if d['airline_sentiment'] == 'neutral'])} new samples")

print(f"\nAugmenting positive class...")
positive_samples = positive_df.sample(n=positive_needed, replace=True, random_state=42)
for idx, row in enumerate(positive_samples.itertuples(), 1):
    if idx % 500 == 0:
        print(f"   Processed {idx}/{positive_needed}...")
    
    augmenter = aug_synonym if idx % 2 == 0 else aug_insert
    aug_texts = augment_text(row.text, augmenter, num_augmentations=1)
    
    if aug_texts:
        augmented_data.append({
            'airline_sentiment': row.airline_sentiment,
            'text': aug_texts[0]
        })

print(f"Positive class augmented: {len([d for d in augmented_data if d['airline_sentiment'] == 'positive'])} new samples")

augmented_df = pd.DataFrame(augmented_data)

AUGMENTING MINORITY CLASSES

Augmentation targets:
  • Neutral class:  3,043 → 9,079 (need 6,036 more)
  • Positive class: 2,238 → 9,079 (need 6,841 more)

Augmenting neutral class...
   Processed 500/6036...
   Processed 1000/6036...
   Processed 1500/6036...
   Processed 2000/6036...
   Processed 2500/6036...
   Processed 3000/6036...
   Processed 3500/6036...
   Processed 4000/6036...
   Processed 4500/6036...
   Processed 5000/6036...
   Processed 5500/6036...
   Processed 6000/6036...
Neutral class augmented: 6036 new samples

Augmenting positive class...
   Processed 500/6841...
   Processed 1000/6841...
   Processed 1500/6841...
   Processed 2000/6841...
   Processed 2500/6841...
   Processed 3000/6841...
   Processed 3500/6841...
   Processed 4000/6841...
   Processed 4500/6841...
   Processed 5000/6841...
   Processed 5500/6841...
   Processed 6000/6841...
   Processed 6500/6841...
Positive class augmented: 6841 new samples


In [9]:
df_balanced = pd.concat([df, augmented_df], ignore_index=True)

df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
df.describe()


Unnamed: 0,text,airline_sentiment
count,27237,27237
unique,26348,3
top,[southwestair thanks],negative
freq,9,9079


In [11]:
# df.to_csv('./data/processed/dataframe_balanced.csv', index=False)
df = pd.read_csv('./data/processed/dataframe_balanced.csv')
df.duplicated().sum()

np.int64(897)

In [12]:
df = df.drop_duplicates()

In [13]:
df['airline_sentiment'].value_counts()


airline_sentiment
negative    9079
neutral     8753
positive    8508
Name: count, dtype: int64

In [14]:
neg_df = df[df['airline_sentiment'] == 'negative']
neu_df = df[df['airline_sentiment'] == 'neutral']
pos_df = df[df['airline_sentiment'] == 'positive']

neg_df_reduced = neg_df.sample(
    n=len(pos_df),
    random_state=42
)

df = pd.concat(
    [neg_df_reduced, neu_df, pos_df],
    ignore_index=True
).sample(frac=1, random_state=42)

In [15]:
df['airline_sentiment'].value_counts()


airline_sentiment
neutral     8753
negative    8508
positive    8508
Name: count, dtype: int64

In [16]:
# embed_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embed_model = SentenceTransformer('cardiffnlp/twitter-roberta-base-sentiment')

df = df.reset_index(drop=True)
embeddings = embed_model.encode(df['text'], show_progress_bar=True, convert_to_numpy=True,batch_size=32)

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: ed465b38-0f70-43ab-b2e4-af21c6b31fcf)')' thrown while requesting HEAD https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
No sentence-transformers model found with name cardiffnlp/twitter-roberta-base-sentiment. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Batches:   0%|          | 0/806 [00:00<?, ?it/s]

In [17]:
df.head()

Unnamed: 0,text,airline_sentiment
0,united we would how do i contact you to disc...,negative
1,['united any chance you are allowing reflight ...,neutral
2,['united if you d love to see more girls be in...,neutral
3,usairways what does your reservation is out ...,negative
4,jetblue if there s a schedule change on my tix...,neutral


In [18]:
embeddings

array([[-1.6160055e-01, -5.9598249e-01,  1.2330471e-01, ...,
        -8.7793320e-01,  8.6030159e-03, -2.4165902e-01],
       [-1.7571187e-01, -8.7361604e-01,  1.8219958e-01, ...,
        -1.7844989e+00, -2.8510524e-02, -2.9796681e-01],
       [ 5.2962351e-01,  4.5887071e-01, -2.6892433e-01, ...,
        -5.0736237e-01, -3.2030919e-01,  9.7607052e-01],
       ...,
       [-9.3582802e-04, -6.7999470e-01, -8.8480771e-02, ...,
        -1.5023693e+00, -4.1139656e-01,  2.6550552e-01],
       [-2.7207145e-02, -2.4444950e-01, -1.1655974e-01, ...,
        -1.7925044e+00, -4.1917968e-01,  5.5006921e-01],
       [ 1.3636443e-01,  1.0526717e+00, -3.9309824e-01, ...,
         9.4752079e-01, -5.2063471e-01,  1.6376647e+00]],
      shape=(25769, 768), dtype=float32)

In [19]:
import math
import chromadb

chroma_client = chromadb.Client()


collection = chroma_client.create_collection(
    name="aerostream_reviews",
    
)

documents = df["text"].astype(str).tolist()

metadatas = [
    {
        "review_id": int(i),
        "airline_sentiment": row["airline_sentiment"]
    }
    for i, row in df.reset_index().iterrows()
]

ids = [str(i) for i in range(len(documents))]

BATCH_SIZE = 5000
num_batches = math.ceil(len(ids) / BATCH_SIZE)

for batch_idx in range(num_batches):
    start = batch_idx * BATCH_SIZE
    end = start + BATCH_SIZE

    collection.add(
        ids=ids[start:end],
        embeddings=embeddings[start:end].tolist(),
        documents=documents[start:end],
        metadatas=metadatas[start:end],
    )

    print(f"Inserted batch {batch_idx + 1}/{num_batches}")

print(f"\nStored {len(ids)} vectors successfully in ChromaDB")


Inserted batch 1/6
Inserted batch 2/6
Inserted batch 3/6
Inserted batch 4/6
Inserted batch 5/6
Inserted batch 6/6

Stored 25769 vectors successfully in ChromaDB


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report , f1_score
from sklearn.metrics import confusion_matrix 


all_items = collection.get(include=['embeddings','metadatas','documents'])
X = embeddings  
y = df['airline_sentiment'].values
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y, df.index, test_size=0.2, random_state=42, stratify=y)


def evaluate_model(name , model):
    print(f"=== {name} ===") 

    train_acc = model.score(X_train, y_train) 
    test_pred = model.predict(X_test) 
    test_acc = accuracy_score(y_test, test_pred) 
    
    f1 = f1_score(y_test, test_pred, average="macro") 

    print("Train accuracy:", train_acc) 
    print("Test accuracy :", test_acc) 
    print("F1-score :", f1) 
    print("Gap:", train_acc - test_acc) 
    print(classification_report(y_test, test_pred)) 

    print(confusion_matrix(y_test, test_pred) )



In [22]:
from sklearn.multiclass import OneVsRestClassifier

logreg = OneVsRestClassifier(LogisticRegression(max_iter=3000))


logreg.fit(X_train, y_train)


evaluate_model("logreg", logreg)

=== logreg ===
Train accuracy: 0.876546204220228
Test accuracy : 0.8608847497089639
F1-score : 0.8612000462226422
Gap: 0.015661454511264106
              precision    recall  f1-score   support

    negative       0.90      0.90      0.90      1702
     neutral       0.82      0.82      0.82      1751
    positive       0.86      0.86      0.86      1701

    accuracy                           0.86      5154
   macro avg       0.86      0.86      0.86      5154
weighted avg       0.86      0.86      0.86      5154

[[1535  117   50]
 [ 137 1433  181]
 [  40  192 1469]]


In [23]:
from sklearn.calibration import CalibratedClassifierCV

base_svm = LinearSVC(C=0.3,max_iter=3000, random_state=42)
svm = CalibratedClassifierCV(base_svm, method="sigmoid", cv=5)
svm.fit(X_train, y_train)


evaluate_model("svm", svm)

=== svm ===
Train accuracy: 0.8769827795294688
Test accuracy : 0.8608847497089639
F1-score : 0.8611714936204287
Gap: 0.0160980298205049
              precision    recall  f1-score   support

    negative       0.89      0.90      0.90      1702
     neutral       0.82      0.82      0.82      1751
    positive       0.87      0.86      0.87      1701

    accuracy                           0.86      5154
   macro avg       0.86      0.86      0.86      5154
weighted avg       0.86      0.86      0.86      5154

[[1538  117   47]
 [ 144 1432  175]
 [  41  193 1467]]


In [None]:
rf = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
)

rf.fit(X_train, y_train)

evaluate_model("rf", rf)

=== rf ===
Train accuracy: 0.9999029832646131
Test accuracy : 0.8554520760574311
F1-score : 0.8560525647583437
Gap: 0.144450907207182
              precision    recall  f1-score   support

    negative       0.90      0.86      0.88      1702
     neutral       0.81      0.84      0.83      1751
    positive       0.86      0.87      0.87      1701

    accuracy                           0.86      5154
   macro avg       0.86      0.86      0.86      5154
weighted avg       0.86      0.86      0.86      5154

[[1456  161   85]
 [ 125 1474  152]
 [  42  180 1479]]


In [24]:
joblib.dump(svm, './models/svm_sentiment_model.pkl')
print('Model Saved best')

Model Saved best


In [None]:

test_pred = svm.predict(X_test) 

    
metrics = {
    "accuracy": accuracy_score(y_test, test_pred),
    "f1-score": f1_score(y_test, test_pred, average="macro"),
    "report": classification_report(y_test, test_pred, output_dict=True)
}

joblib.dump(metrics, "./models/svm_metrics.pkl")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier

classes = np.unique(y_train)
y_train_bin = label_binarize(y_train, classes=classes)
y_test_bin = label_binarize(y_test, classes=classes)

svm = OneVsRestClassifier(LinearSVC(max_iter=3000))
svm.fit(X_train, y_train_bin)

y_score = svm.decision_function(X_test)

plt.figure(figsize=(8, 6))
for i, class_name in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f"{class_name} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], 'k--', lw=2)  
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves for LinearSVC (One-vs-Rest)")
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.show()
