In [10]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
        accuracy_score, precision_score,
        recall_score, f1_score
    )

In [12]:
class SentimentAnalysisFramework:
    def __init__(self, dataset_name):
        self.dataset_name = dataset_name
        self.dataset = load_dataset(dataset_name)

    def preprocess_text(self, text):
        """
        Comprehensive text preprocessing
        """
        import re
        import nltk
        from nltk.corpus import stopwords
        from nltk.tokenize import word_tokenize

        # Download necessary NLTK resources
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)

        # Convert to lowercase
        text = text.lower()

        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

        return ' '.join(tokens)

    def feature_extraction(self):
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        from gensim.models import Word2Vec

        # Convert to pandas
        train_df = self.dataset['train'].to_pandas()
        test_df = self.dataset['test'].to_pandas()

        # Standardize column names
        if 'text' not in train_df.columns:
            if 'content' in train_df.columns:
                train_df.rename(columns={'content': 'text'}, inplace=True)
                test_df.rename(columns={'content': 'text'}, inplace=True)
            elif 'sentence' in train_df.columns:
                train_df.rename(columns={'sentence': 'text'}, inplace=True)
                test_df.rename(columns={'sentence': 'text'}, inplace=True)

        # Preprocessing
        train_df['cleaned_text'] = train_df['text'].apply(self.preprocess_text)
        test_df['cleaned_text'] = test_df['text'].apply(self.preprocess_text)

        # Bag of Words
        bow_vectorizer = CountVectorizer(max_features=5000)
        bow_train = bow_vectorizer.fit_transform(train_df['cleaned_text'])
        bow_test = bow_vectorizer.transform(test_df['cleaned_text'])

        # TF-IDF
        tfidf_vectorizer = TfidfVectorizer(max_features=5000)
        tfidf_train = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])
        tfidf_test = tfidf_vectorizer.transform(test_df['cleaned_text'])

        # Word2Vec
        word2vec = self._word2vec_features(train_df['cleaned_text'], test_df['cleaned_text'])

        features = {
            'Bag of Words': {
                'train': bow_train,
                'test': bow_test
            },
            'TF-IDF': {
                'train': tfidf_train,
                'test': tfidf_test
            },
            'Word2Vec': word2vec
        }

        return {
            'features': features,
            'labels': {
                'train': train_df['label'],
                'test': test_df['label']
            },
            'original_texts': {
                'train': train_df['text'],
                'test': test_df['text']
            }
        }

    def _word2vec_features(self, train_texts, test_texts):
        """
        Word2Vec feature extraction
        """
        import numpy as np
        from gensim.models import Word2Vec

        # Tokenize texts
        train_tokens = [text.split() for text in train_texts]
        test_tokens = [text.split() for text in test_texts]

        # Train Word2Vec model
        w2v_model = Word2Vec(sentences=train_tokens, vector_size=100, window=5, min_count=1, workers=4)

        # Create document vectors
        def document_vector(tokens):
            vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
            return np.mean(vectors, axis=0) if vectors else np.zeros(100)

        train_vectors = np.array([document_vector(tokens) for tokens in train_tokens])
        test_vectors = np.array([document_vector(tokens) for tokens in test_tokens])

        return {
            'train': train_vectors,
            'test': test_vectors
        }


In [18]:


# Model Evaluation and Comparison
class SentimentModelComparison:
    def __init__(self, features, labels, original_texts):
        self.features = features
        self.labels = labels
        self.original_texts = original_texts
    
    

    def traditional_ml_models(self):
        """
        Evaluate traditional machine learning models across feature types.
        Skips models incompatible with certain feature formats (e.g. MultinomialNB with Word2Vec).
        """
        

        # Define models to evaluate
        models = {
            'Logistic Regression': LogisticRegression(max_iter=1000),
            'Decision Tree': DecisionTreeClassifier(),
            'Naive Bayes': MultinomialNB()
        }

        results = {}

        # Loop through each feature extraction method
        for feature_name, feature_data in self.features.items():
            X_train = feature_data['train']
            X_test = feature_data['test']
            y_train = self.labels['train']
            y_test = self.labels['test']

            feature_results = {}

            for model_name, model in models.items():
                # Skip incompatible combinations
                if feature_name == 'Word2Vec' and model_name == 'Naive Bayes':
                    print(f"⚠️ Skipping {model_name} on {feature_name} (contains negative values).")
                    continue

                # Train and evaluate model
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                feature_results[model_name] = {
                    'Accuracy': accuracy_score(y_test, y_pred),
                    'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
                    'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
                    'F1 Score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
                }

            results[feature_name] = feature_results

        return results

    
    def vader_sentiment_analysis(self):
        """
        Evaluate VADER sentiment analysis
        """
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        from sklearn.metrics import accuracy_score
        
        # Initialize VADER sentiment analyzer
        sid = SentimentIntensityAnalyzer()
        
        # Predict sentiment using VADER
        def vader_predict(text):
            # Get polarity scores
            scores = sid.polarity_scores(text)
            
            # Classify based on compound score
            # Positive: compound score > 0
            # Negative: compound score < 0
            return 1 if scores['compound'] > 0 else 0
        
        # Apply VADER to test texts
        y_pred = [vader_predict(text) for text in self.original_texts['test']]
        y_true = self.labels['test']
        
        # Compute metrics
        return {
            'VADER': {
                'Accuracy': accuracy_score(y_true, y_pred),
                # Note: Other metrics might be challenging with VADER's simple classification
            }
        }
    
   
   


    def visualize_results(self, traditional_results, transformer_results, vader_results):
        """
        Visualize model performance
        """
        # Prepare data for visualization
        model_performances = {}
        
        # Process traditional ML results
        for feature, models in traditional_results.items():
            for model, metrics in models.items():
                key = f"{model} ({feature})"
                model_performances[key] = metrics['Accuracy']
        
        # Process transformer results
        for model, metrics in transformer_results.items():
            model_performances[model] = metrics['Accuracy']
        
        # Process VADER results
        for model, metrics in vader_results.items():
            model_performances[model] = metrics['Accuracy']
        
        # Create bar plot
        plt.figure(figsize=(14, 7))
        bars = plt.bar(model_performances.keys(), model_performances.values())
        plt.title('Sentiment Analysis Model Performance Comparison')
        plt.xlabel('Models')
        plt.ylabel('Accuracy')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        
        # Add value labels on top of each bar
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.4f}',
                     ha='center', va='bottom')
        
        plt.savefig('model_performance.png')
        plt.close()

In [23]:



def main():
    # Datasets to analyze
    datasets = ['imdb', 'yelp_polarity', 'amazon_polarity']
    
    # Comprehensive results storage
    comprehensive_results = {}
    
    for dataset_name in datasets:
        print(f"\nAnalyzing {dataset_name} dataset:")
        
        # Preprocessing and Feature Extraction
        analysis_framework = SentimentAnalysisFramework(dataset_name)
        extracted_features = analysis_framework.feature_extraction()
        
        # Model Comparison
        model_comparison = SentimentModelComparison(
            extracted_features['features'], 
            extracted_features['labels'],
            extracted_features['original_texts']
        )
        
        # Evaluate Traditional ML Models
        traditional_results = model_comparison.traditional_ml_models()
        
        # # Evaluate Transformer Models
        # transformer_results = model_comparison.transformer_models()
        
        # # Evaluate VADER
        # vader_results = model_comparison.vader_sentiment_analysis()
        
        # # Visualize Results
        # model_comparison.visualize_results(traditional_results)
        
        # Store results
        comprehensive_results[dataset_name] = {
            'Traditional Models': traditional_results
            # 'Transformer Models': transformer_results,
            # 'VADER': vader_results
        }
        
        # Print Results
        print("\nTraditional ML Models Results:")
        for feature, models in traditional_results.items():
            print(f"\n{feature} Feature Extraction:")
            for model, metrics in models.items():
                print(f"{model}:")
                for metric, value in metrics.items():
                    print(f"  {metric}: {value:.4f}")
        
        # print("\nTransformer Models Results:")
        # for model, metrics in transformer_results.items():
        #     print(f"{model}:")
        #     for metric, value in metrics.items():
        #         print(f"  {metric}: {value:.4f}")
        
        # print("\nVADER Results:")
        # for model, metrics in vader_results.items():
        #     print(f"{model}:")
        #     for metric, value in metrics.items():
        #             print(f"  {metric}: {value:.4f}")

if __name__ == "__main__":
    main()


Analyzing imdb dataset:
⚠️ Skipping Naive Bayes on Word2Vec (contains negative values).

Traditional ML Models Results:

Bag of Words Feature Extraction:
Logistic Regression:
  Accuracy: 0.8483
  Precision: 0.8484
  Recall: 0.8483
  F1 Score: 0.8483
Decision Tree:
  Accuracy: 0.7103
  Precision: 0.7103
  Recall: 0.7103
  F1 Score: 0.7103
Naive Bayes:
  Accuracy: 0.8381
  Precision: 0.8389
  Recall: 0.8381
  F1 Score: 0.8380

TF-IDF Feature Extraction:
Logistic Regression:
  Accuracy: 0.8800
  Precision: 0.8801
  Recall: 0.8800
  F1 Score: 0.8800
Decision Tree:
  Accuracy: 0.7079
  Precision: 0.7081
  Recall: 0.7079
  F1 Score: 0.7078
Naive Bayes:
  Accuracy: 0.8424
  Precision: 0.8428
  Recall: 0.8424
  F1 Score: 0.8424

Word2Vec Feature Extraction:
Logistic Regression:
  Accuracy: 0.8047
  Precision: 0.8048
  Recall: 0.8047
  F1 Score: 0.8047
Decision Tree:
  Accuracy: 0.6626
  Precision: 0.6627
  Recall: 0.6626
  F1 Score: 0.6626

Analyzing yelp_polarity dataset:
⚠️ Skipping Naive B

KeyError: 'text'


Analyzing imdb dataset:


NameError: name 'SentimentAnalysisFramework' is not defined

### Imdb dataset

In [40]:
imdb_dataset_name =  'imdb'   #['imdb', 'yelp_polarity', 'amazon_polarity']
comprehensive_results = {}

In [41]:
print(f"\nAnalyzing {imdb_dataset_name} dataset:")


Analyzing imdb dataset:


In [42]:
# Preprocessing and Feature Extraction
imdb_analysis_framework = SentimentAnalysisFramework(imdb_dataset_name)
imdb_extracted_features = analysis_framework.feature_extraction()

In [43]:
 # Model Comparison
imdb_model_comparison = SentimentModelComparison(
            extracted_features['features'], 
            extracted_features['labels'],
            extracted_features['original_texts']
        )

In [44]:
# Evaluate Traditional ML Models
imdb_traditional_results = imdb_model_comparison.traditional_ml_models()

⚠️ Skipping Naive Bayes on Word2Vec (contains negative values).


In [45]:
imdb_traditional_results

{'Bag of Words': {'Logistic Regression': {'Accuracy': 0.84832,
   'Precision': 0.848351042815489,
   'Recall': 0.84832,
   'F1 Score': 0.8483166207370035},
  'Decision Tree': {'Accuracy': 0.71088,
   'Precision': 0.7108914239040762,
   'Recall': 0.71088,
   'F1 Score': 0.7108760845682874},
  'Naive Bayes': {'Accuracy': 0.83812,
   'Precision': 0.8388571123523861,
   'Recall': 0.83812,
   'F1 Score': 0.8380319181289936}},
 'TF-IDF': {'Logistic Regression': {'Accuracy': 0.88004,
   'Precision': 0.8800502765594781,
   'Recall': 0.88004,
   'F1 Score': 0.8800391890649181},
  'Decision Tree': {'Accuracy': 0.7112,
   'Precision': 0.7113995015994101,
   'Recall': 0.7112,
   'F1 Score': 0.7111318474523167},
  'Naive Bayes': {'Accuracy': 0.84244,
   'Precision': 0.8427998575784915,
   'Recall': 0.84244,
   'F1 Score': 0.842398639098845}},
 'Word2Vec': {'Logistic Regression': {'Accuracy': 0.803,
   'Precision': 0.8030332822666046,
   'Recall': 0.803,
   'F1 Score': 0.8029945907042737},
  'Decisi

In [6]:
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import pandas as pd

# Load models and tokenizers
models = {
    "philipobiorah/bert-imdb-model": (
        BertForSequenceClassification.from_pretrained("philipobiorah/bert-imdb-model"),
        BertTokenizer.from_pretrained("bert-base-uncased"),
    ),
    "DistilBERT-SST-2": (
        AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english"),
        AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english"),
    ),
    "RoBERTa-Sentiment": (
        AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment"),
        AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment"),
    )
}

# Evaluation datasets
datasets_info = {
    "SST-2": ("glue", "sst2", "sentence", "label"),
    "Yelp": ("yelp_polarity", None, "text", "label"),
    "Amazon": ("amazon_polarity", None, "content", "label"),
    "IMDB": ("imdb", None, "text", "label")
}

# Predict function
def predict(texts, model, tokenizer):
    preds = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        preds.append(pred)
    return preds

# Evaluate models on datasets
results = []
for dataset_name, (dataset_key, subset, text_key, label_key) in datasets_info.items():
    print(f"\nEvaluating on {dataset_name} dataset...")
    dataset = load_dataset(dataset_key, subset)
    texts = dataset["test"][text_key][:100]  # small batch for speed
    labels = dataset["test"][label_key][:100]

    for model_name, (model, tokenizer) in models.items():
        print(f"  Running {model_name}...")
        model.eval()
        pred_labels = predict(texts, model, tokenizer)
        acc = accuracy_score(labels, pred_labels)
        results.append({
            "Model": model_name,
            "Dataset": dataset_name,
            "Accuracy": round(acc, 4)
        })

# Display results in plain text
df = pd.DataFrame(results)
df_pivot = df.pivot(index="Model", columns="Dataset", values="Accuracy").reset_index()

print("\n=== Model Accuracy Comparison ===")
print(df_pivot.to_string(index=False))


  return torch.load(checkpoint_file, map_location=map_location)



Evaluating on SST-2 dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on Yelp dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on Amazon dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on IMDB dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

=== Model Accuracy Comparison ===
                        Model  Amazon  IMDB  SST-2  Yelp
             DistilBERT-SST-2    0.85  0.89    0.0  0.85
            RoBERTa-Sentiment    0.47  0.79    0.0  0.42
philipobiorah/bert-imdb-model    0.89  0.96    0.0  0.89


In [7]:
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import pandas as pd

# Load models and tokenizers
models = {
    "philipobiorah/bert-imdb-model": (
        BertForSequenceClassification.from_pretrained("philipobiorah/bert-imdb-model"),
        BertTokenizer.from_pretrained("bert-base-uncased"),
    ),
    "DistilBERT-SST-2": (
        AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english"),
        AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english"),
    ),
    "RoBERTa-Sentiment": (
        AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment"),
        AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment"),
    )
}

# Evaluation datasets
datasets_info = {
    "SST-2": ("glue", "sst2", "sentence", "label"),
    "Yelp": ("yelp_polarity", None, "text", "label"),
    "Amazon": ("amazon_polarity", None, "content", "label"),
    "IMDB": ("imdb", None, "text", "label")
}

# Predict function
def predict(texts, model, tokenizer):
    preds = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        preds.append(pred)
    return preds

# Evaluate models on datasets
results = []
for dataset_name, (dataset_key, subset, text_key, label_key) in datasets_info.items():
    print(f"\nEvaluating on {dataset_name} dataset...")
    dataset = load_dataset(dataset_key, subset)

    # Use 'validation' split for SST-2; others use 'test'
    split = "validation" if dataset_name == "SST-2" else "test"
    texts = dataset[split][text_key][:100]  # small batch for speed
    labels = dataset[split][label_key][:100]

    for model_name, (model, tokenizer) in models.items():
        print(f"  Running {model_name}...")
        model.eval()
        pred_labels = predict(texts, model, tokenizer)
        acc = accuracy_score(labels, pred_labels)
        results.append({
            "Model": model_name,
            "Dataset": dataset_name,
            "Accuracy": round(acc, 4)
        })

# Display results in plain text
df = pd.DataFrame(results)
df_pivot = df.pivot(index="Model", columns="Dataset", values="Accuracy").reset_index()

print("\n=== Model Accuracy Comparison ===")
print(df_pivot.to_string(index=False))


  return torch.load(checkpoint_file, map_location=map_location)



Evaluating on SST-2 dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on Yelp dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on Amazon dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on IMDB dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

=== Model Accuracy Comparison ===
                        Model  Amazon  IMDB  SST-2  Yelp
             DistilBERT-SST-2    0.85  0.89   0.94  0.85
            RoBERTa-Sentiment    0.47  0.79   0.40  0.42
philipobiorah/bert-imdb-model    0.89  0.96   0.89  0.89


In [9]:
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Load models and tokenizers
models = {
    "philipobiorah/bert-imdb-model": (
        BertForSequenceClassification.from_pretrained("philipobiorah/bert-imdb-model"),
        BertTokenizer.from_pretrained("bert-base-uncased"),
    ),
    "DistilBERT-SST-2": (
        AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english"),
        AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english"),
    ),
    "RoBERTa-Sentiment": (
        AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment"),
        AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment"),
    )
}

# Evaluation datasets
datasets_info = {
    "SST-2": ("glue", "sst2", "sentence", "label"),
    "Yelp": ("yelp_polarity", None, "text", "label"),
    "Amazon": ("amazon_polarity", None, "content", "label"),
    "IMDB": ("imdb", None, "text", "label")
}

# Predict function
def predict(texts, model, tokenizer):
    preds = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        preds.append(pred)
    return preds

# Evaluate models on datasets
results = []
for dataset_name, (dataset_key, subset, text_key, label_key) in datasets_info.items():
    print(f"\nEvaluating on {dataset_name} dataset...")
    dataset = load_dataset(dataset_key, subset)

    # Use 'validation' split for SST-2; others use 'test'
    split = "validation" if dataset_name == "SST-2" else "test"
    texts = dataset[split][text_key][:100]  # small batch for speed
    labels = dataset[split][label_key][:100]

    for model_name, (model, tokenizer) in models.items():
        print(f"  Running {model_name}...")
        model.eval()
        pred_labels = predict(texts, model, tokenizer)
        acc = accuracy_score(labels, pred_labels)
        precision = precision_score(labels, pred_labels, average='weighted', zero_division=0)
        recall = recall_score(labels, pred_labels, average='weighted', zero_division=0)
        f1 = f1_score(labels, pred_labels, average='weighted', zero_division=0)
        results.append({
            "Model": model_name,
            "Dataset": dataset_name,
            "Accuracy": round(acc, 4),
            "Precision": round(precision, 4),
            "Recall": round(recall, 4),
            "F1-Score": round(f1, 4)
        })

# Display results in plain text
df = pd.DataFrame(results)
print("\n=== Model Evaluation Metrics ===")
print(df.to_string(index=False))


  return torch.load(checkpoint_file, map_location=map_location)



Evaluating on SST-2 dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on Yelp dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on Amazon dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

Evaluating on IMDB dataset...
  Running philipobiorah/bert-imdb-model...
  Running DistilBERT-SST-2...
  Running RoBERTa-Sentiment...

=== Model Evaluation Metrics ===
                        Model Dataset  Accuracy  Precision  Recall  F1-Score
philipobiorah/bert-imdb-model   SST-2      0.89     0.8973    0.89    0.8891
             DistilBERT-SST-2   SST-2      0.94     0.9406    0.94    0.9399
            RoBERTa-Sentiment   SST-2      0.40     0.6337    0.40    0.4885
philipobiorah/bert-imdb-model    Yelp      0.89     0.8983    0.89    0.8898
             DistilBERT-SST-2    Yelp 

In [15]:
amazon_polarity_dataset_name =  'amazon_polarity'   #['imdb', 'yelp_polarity', 'amazon_polarity']
amazon_polarity_comprehensive_results = {}
print(f"\nAnalyzing {amazon_polarity_dataset_name} dataset:")

# Preprocessing and Feature Extraction
amazon_polarity_analysis_framework = SentimentAnalysisFramework(amazon_polarity_dataset_name)
amazon_polarity_extracted_features = amazon_polarity_analysis_framework.feature_extraction()


 # Model Comparison
amazon_polarity_model_comparison = SentimentModelComparison(
            amazon_polarity_extracted_features['features'], 
            amazon_polarity_extracted_features['labels'],
            amazon_polarity_extracted_features['original_texts']
        )
# Evaluate Traditional ML Models
amazon_polarity_traditional_results = amazon_polarity_model_comparison.traditional_ml_models()


Analyzing amazon_polarity dataset:
⚠️ Skipping Naive Bayes on Word2Vec (contains negative values).


In [16]:
amazon_polarity_traditional_results

{'Bag of Words': {'Logistic Regression': {'Accuracy': 0.86613,
   'Precision': 0.8662564377289097,
   'Recall': 0.86613,
   'F1 Score': 0.8661184454729299},
  'Decision Tree': {'Accuracy': 0.754625,
   'Precision': 0.7546334188174096,
   'Recall': 0.754625,
   'F1 Score': 0.7546229718055013},
  'Naive Bayes': {'Accuracy': 0.819685,
   'Precision': 0.8196932501318902,
   'Recall': 0.819685,
   'F1 Score': 0.8196838366722405}},
 'TF-IDF': {'Logistic Regression': {'Accuracy': 0.86661,
   'Precision': 0.8666284297145678,
   'Recall': 0.86661,
   'F1 Score': 0.8666083236634686},
  'Decision Tree': {'Accuracy': 0.7478975,
   'Precision': 0.7478977864768795,
   'Recall': 0.7478975,
   'F1 Score': 0.7478974271659911},
  'Naive Bayes': {'Accuracy': 0.82035,
   'Precision': 0.8203514262365846,
   'Recall': 0.82035,
   'F1 Score': 0.8203498000448362}},
 'Word2Vec': {'Logistic Regression': {'Accuracy': 0.8475075,
   'Precision': 0.8475086701398817,
   'Recall': 0.8475075,
   'F1 Score': 0.84750737

In [29]:
!pip install vaderSentiment



In [30]:

    # Datasets to analyze
datasets = ['imdb', 'yelp_polarity', 'amazon_polarity']
    
    # Comprehensive results storage
comprehensive_results = {}
    
for dataset_name in datasets:
        print(f"\nAnalyzing {dataset_name} dataset:")
        
        # Preprocessing and Feature Extraction
        analysis_framework = SentimentAnalysisFramework(dataset_name)
        extracted = analysis_framework.feature_extraction()
        
        # Model Comparison
        model_comparison = SentimentModelComparison(
            extracted['features'], 
            extracted['labels'],
            extracted['original_texts']
        )
        
        # # Traditional ML Models
        # traditional_results = model_comparison.traditional_ml_models()
        
        # VADER Sentiment Analysis
        vader_results = model_comparison.vader_sentiment_analysis()

        # (Optional) Visualize
        # model_comparison.visualize_results(traditional_results, {}, vader_results)
        
        # Store results
        comprehensive_results[dataset_name] = {
            # 'Traditional Models': traditional_results,
            'VADER': vader_results
        }
        
        # # Print Traditional ML results
        # print("\nTraditional ML Models Results:")
        # for feature, models in traditional_results.items():
        #     print(f"\n{feature} Feature Extraction:")
        #     for model, metrics in models.items():
        #         print(f"{model}:")
        #         for metric, value in metrics.items():
        #             print(f"  {metric}: {value:.4f}")

        # Print VADER results
        print("\nVADER Results:")
        for model, metrics in vader_results.items():
            print(f"{model}:")
            for metric, value in metrics.items():
                print(f"  {metric}: {value:.4f}")

        


Analyzing imdb dataset:

VADER Results:
VADER:
  Accuracy: 0.6974

Analyzing yelp_polarity dataset:

VADER Results:
VADER:
  Accuracy: 0.7134

Analyzing amazon_polarity dataset:

VADER Results:
VADER:
  Accuracy: 0.6966


In [28]:
vader_results

{'VADER': {'Accuracy': 0.6974}}