# Import the libraries

In [None]:
import logging
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from textblob import TextBlob
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import joblib
import unittest
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [None]:
# Route INFO-and-above records to a log file, stamped with time and level
logging.basicConfig(
    filename='sentiment_analysis.log',
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
)

# Function to mount Google Drive and load dataset
def load_data_from_drive(file_path: str, encoding: str = 'ISO-8859-1') -> pd.DataFrame:
    """Mount Google Drive when running in Colab, then read a CSV file.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding handed to ``pd.read_csv``.

    Returns:
        The parsed CSV as a DataFrame.

    NOTE(review): ``pd.read_csv`` infers a header row by default. If the
    dataset has no header line, its first record becomes the column names
    and is lost as data -- confirm against the actual file.
    """
    logging.info(f'Loading data from {file_path}')
    # The Colab helper is imported here because nothing at file level brings
    # `drive` into scope; outside Colab the mount step is skipped so the
    # function can still read a locally available file.
    try:
        from google.colab import drive
    except ImportError:
        logging.warning('google.colab is not available; skipping Google Drive mount')
    else:
        drive.mount('/content/drive')
    df = pd.read_csv(file_path, encoding=encoding)
    logging.info('Data loaded successfully')
    return df



# Data Preprocessing

In [None]:
# Function to preprocess the dataset
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the two columns and drop duplicate and incomplete rows.

    The input frame is left untouched; all changes are made on a copy.

    Args:
        df: Raw two-column frame (label first, text second).

    Returns:
        A new DataFrame with columns ``sentiment`` and ``news``, without
        duplicated rows and without rows containing missing values.

    Raises:
        ValueError: Raised by pandas when ``df`` does not have exactly two
            columns.
    """
    logging.info('Preprocessing data')
    # Copy first: assigning to `.columns` would otherwise rename the
    # caller's DataFrame as a hidden side effect.
    cleaned = df.copy()
    cleaned.columns = ['sentiment', 'news']
    cleaned = cleaned.drop_duplicates().dropna()
    logging.info('Data preprocessing completed')
    return cleaned


# VADER

In [None]:
# Function to perform VADER sentiment analysis
def vader_sentiment_analysis(df: pd.DataFrame) -> pd.DataFrame:
    """Score every ``news`` entry with VADER and label it.

    Adds two columns to ``df`` in place and returns the same frame:
    ``vader_compound`` (the compound score) and ``vader_sentiment``
    (the label produced by ``classify_vader``).
    """
    logging.info('Performing VADER sentiment analysis')
    nltk.download('vader_lexicon')
    analyzer = SentimentIntensityAnalyzer()

    def compound_of(text):
        # Only the aggregated "compound" value of the score dict is kept.
        return analyzer.polarity_scores(text)['compound']

    df['vader_compound'] = df['news'].apply(compound_of)
    compound_scores = df['vader_compound']
    df['vader_sentiment'] = compound_scores.apply(classify_vader)
    logging.info('VADER sentiment analysis completed')
    return df



In [None]:

# Function to classify VADER compound scores
def classify_vader(compound_score: float) -> str:
    """Map a VADER compound score to a sentiment label.

    Scores of 0.05 and above are ``'positive'``, scores of -0.05 and below
    are ``'negative'``, and everything in between is ``'neutral'``.
    """
    if compound_score >= 0.05:
        return 'positive'
    if compound_score <= -0.05:
        return 'negative'
    return 'neutral'



# TextBlob

In [None]:
# Function to perform TextBlob sentiment analysis
def textblob_sentiment_analysis(df: pd.DataFrame) -> pd.DataFrame:
    """Score every ``news`` entry with TextBlob and label it.

    Adds two columns to ``df`` in place and returns the same frame:
    ``textblob_sentiment`` (the polarity score) and
    ``textblob_sentiment_category`` (the label produced by
    ``classify_textblob``).
    """
    logging.info('Performing TextBlob sentiment analysis')

    def polarity_of(text):
        return TextBlob(text).sentiment.polarity

    df['textblob_sentiment'] = df['news'].apply(polarity_of)
    polarity_scores = df['textblob_sentiment']
    df['textblob_sentiment_category'] = polarity_scores.apply(classify_textblob)
    logging.info('TextBlob sentiment analysis completed')
    return df



In [None]:
# Function to classify TextBlob polarity scores
def classify_textblob(polarity_score: float) -> str:
    """Map a TextBlob polarity score to a sentiment label.

    Strictly positive scores are ``'positive'``, strictly negative scores
    are ``'negative'``, and anything else is ``'neutral'``.
    """
    if polarity_score > 0:
        return 'positive'
    if polarity_score < 0:
        return 'negative'
    return 'neutral'



# Confusion matrix

In [None]:
# Function to visualize the confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels, model_name):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    ``labels`` fixes both the row/column order of the matrix and the tick
    labels; ``model_name`` is shown in the figure title.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
    )
    # seaborn returns the Axes it drew on, so label that Axes directly.
    axes = sns.heatmap(matrix, **heatmap_options)
    axes.set_xlabel('Predicted')
    axes.set_ylabel('True')
    axes.set_title(f'Confusion Matrix - {model_name}')
    plt.show()


# Sentiment distribution

In [None]:
# Function to visualize the distribution of sentiments
def plot_sentiment_distribution(df: pd.DataFrame, sentiment_column: str, model_name: str):
    """Show a bar chart counting the labels found in ``sentiment_column``.

    Bars are always ordered negative, neutral, positive; ``model_name`` is
    shown in the figure title.
    """
    category_order = ['negative', 'neutral', 'positive']
    plt.figure(figsize=(6, 4))
    axes = sns.countplot(
        data=df,
        x=sentiment_column,
        order=category_order,
        palette='viridis',
    )
    axes.set_title(f'Sentiment Distribution - {model_name}')
    plt.show()


In [None]:
def evaluate_sentiment_analysis(df, model_name, prediction_column):
    """Log and print accuracy and a classification report for one model.

    Args:
        df: Frame holding the true labels in ``sentiment`` and the
            predictions in ``prediction_column``. It is not modified.
        model_name: Name used in the log and console output.
        prediction_column: Column with the predictions, either as string
            labels or as the numeric codes 0/1/2
            (negative/neutral/positive).
    """
    logging.info(f'Evaluating sentiment analysis performance for {model_name}')

    class_names = ['negative', 'neutral', 'positive']
    y_true = df['sentiment']
    y_pred = df[prediction_column]

    # Decode numeric class ids into labels on a local Series so the caller's
    # frame is never rewritten. Checking for a numeric dtype (rather than
    # "not object") keeps string or categorical label columns untouched.
    if pd.api.types.is_numeric_dtype(y_pred):
        y_pred = y_pred.map({0: 'negative', 1: 'neutral', 2: 'positive'})  # Adjust mapping if needed

    accuracy = accuracy_score(y_true, y_pred)
    # Passing `labels` pins each report row to its name; relying on
    # `target_names` alone breaks when a class is absent from the data.
    classification_report_output = classification_report(
        y_true,
        y_pred,
        labels=class_names,
        target_names=class_names,
    )

    # Log and print the results
    logging.info(f'Accuracy for {model_name}: {accuracy}')
    logging.info(f'Classification Report for {model_name}:\n{classification_report_output}')
    print(f'--- {model_name} Performance ---')
    print(f'Accuracy: {accuracy}')
    print(f'Classification Report:\n{classification_report_output}')



# Training and evaluation models

In [None]:
# Function to train and evaluate machine learning models using Grid Search
def train_and_evaluate_models(df: pd.DataFrame) -> None:
    """Grid-search three classifiers on TF-IDF features and report held-out scores.

    The data is split 80/20 before any fitting. The TF-IDF vocabulary and
    every model are fitted on the training part only, and the reported
    metrics come from the held-out 20%, so they reflect generalisation
    rather than memorised training rows.

    The estimator with the highest cross-validated score is written to
    ``best_model.pkl`` and the fitted vectorizer to ``tfidf_vectorizer.pkl``.

    Args:
        df: Frame with the text in ``news`` and the labels
            ``negative``/``neutral``/``positive`` in ``sentiment``.

    Raises:
        ValueError: If ``sentiment`` contains a value outside the three
            known labels.
    """
    logging.info('Starting machine learning model training and evaluation')

    label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
    id_to_label = {code: label for label, code in label_to_id.items()}

    y = df['sentiment'].map(label_to_id)
    unmapped = y.isna()
    if unmapped.any():
        unknown = sorted(set(df.loc[unmapped, 'sentiment'].astype(str)))
        raise ValueError(f'Unexpected sentiment labels: {unknown}')
    y = y.astype(int)

    # Split the raw text first so the test rows cannot influence the
    # TF-IDF vocabulary or IDF weights.
    texts_train, texts_test, y_train, y_test = train_test_split(
        df['news'], y, test_size=0.2, random_state=42
    )

    # Step 1: TF-IDF Vectorization (kept sparse; all three models accept it)
    tfidf = TfidfVectorizer(max_features=5000)
    X_train = tfidf.fit_transform(texts_train)
    X_test = tfidf.transform(texts_test)

    # Define models and parameters for Grid Search.
    # max_iter is a convergence budget, not a hyperparameter worth tuning:
    # it is set high enough once instead of being searched over.
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'SVM': SVC(),
        'Random Forest': RandomForestClassifier(random_state=42),
    }

    params = {
        'Logistic Regression': {'C': [0.1, 1, 10]},
        'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None]},
    }

    best_models = {}
    cv_scores = {}
    true_test_labels = y_test.map(id_to_label).to_numpy()

    for name, model in models.items():
        logging.info(f'Performing Grid Search for {name}')
        clf = GridSearchCV(model, params[name], cv=5)
        clf.fit(X_train, y_train)
        best_models[name] = clf.best_estimator_
        cv_scores[name] = clf.best_score_
        # Predict only on rows the model has never seen.
        preds = clf.predict(X_test)
        logging.info(f'Best parameters for {name}: {clf.best_params_}')
        print(f'---{name}---')
        print(f'Best parameters: {clf.best_params_}')
        held_out = pd.DataFrame({'sentiment': true_test_labels, 'predicted': preds})
        evaluate_sentiment_analysis(held_out, model_name=name, prediction_column='predicted')

    # Save the best model and the TF-IDF vectorizer.
    # Selection uses the cross-validated score, not the test score, so the
    # held-out metrics above stay an unbiased estimate.
    logging.info('Saving the best model and TF-IDF vectorizer')
    best_name = max(cv_scores, key=cv_scores.get)
    logging.info(f'Selected {best_name} (mean CV score {cv_scores[best_name]})')
    best_model = best_models[best_name]
    joblib.dump(best_model, 'best_model.pkl')
    joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
    logging.info('Model training and evaluation completed')

# Performance analysis

In [None]:
# Function to compare the performance of VADER, TextBlob, and ML models
def compare_models(df: pd.DataFrame) -> None:
    """Report the lexicon-based models first, then train and report the ML models.

    ``df`` must already carry the ``vader_sentiment`` and
    ``textblob_sentiment_category`` prediction columns.
    """
    lexicon_runs = (
        ('VADER', 'vader_sentiment'),
        ('TextBlob', 'textblob_sentiment_category'),
    )
    for label, column in lexicon_runs:
        evaluate_sentiment_analysis(df, model_name=label, prediction_column=column)

    # The ML models are trained and evaluated in one step.
    train_and_evaluate_models(df)


# Unit tests

In [None]:
# Unit tests for the sentiment analysis functions
class TestSentimentAnalysis(unittest.TestCase):
    """Unit tests for the score classifiers and the two lexicon pipelines."""

    def setUp(self):
        # Sample data for testing
        self.df = pd.DataFrame({
            'sentiment': ['positive', 'negative', 'neutral'],
            'news': [
                'This is a great day for the market!',
                'The market is crashing, it is a bad day.',
                'The market is stable with no major changes.'
            ]
        })

    def test_classify_vader(self):
        # Includes the exact +/-0.05 thresholds, which are inclusive, and
        # values just inside the neutral band.
        cases = [
            (0.1, 'positive'),
            (-0.1, 'negative'),
            (0, 'neutral'),
            (0.05, 'positive'),
            (-0.05, 'negative'),
            (0.049, 'neutral'),
            (-0.049, 'neutral'),
        ]
        for score, expected in cases:
            with self.subTest(score=score):
                self.assertEqual(classify_vader(score), expected)

    def test_classify_textblob(self):
        # Any non-zero polarity, however small, must leave the neutral class.
        cases = [
            (0.1, 'positive'),
            (-0.1, 'negative'),
            (0, 'neutral'),
            (1e-9, 'positive'),
            (-1e-9, 'negative'),
            (1.0, 'positive'),
            (-1.0, 'negative'),
        ]
        for score, expected in cases:
            with self.subTest(score=score):
                self.assertEqual(classify_textblob(score), expected)

    def test_vader_sentiment_analysis(self):
        # A copy is passed because the function adds columns in place.
        df_result = vader_sentiment_analysis(self.df.copy())
        self.assertIn('vader_compound', df_result.columns)
        self.assertIn('vader_sentiment', df_result.columns)
        self.assertEqual(df_result.shape[0], 3)  # Ensure the number of rows is unchanged
        self.assertTrue(all(df_result['vader_sentiment'].isin(['positive', 'negative', 'neutral'])))

    def test_textblob_sentiment_analysis(self):
        # A copy is passed because the function adds columns in place.
        df_result = textblob_sentiment_analysis(self.df.copy())
        self.assertIn('textblob_sentiment', df_result.columns)
        self.assertIn('textblob_sentiment_category', df_result.columns)
        self.assertEqual(df_result.shape[0], 3)  # Ensure the number of rows is unchanged
        self.assertTrue(all(df_result['textblob_sentiment_category'].isin(['positive', 'negative', 'neutral'])))


# Main

In [None]:
# Main function to run all steps
def main(file_path: str = '/content/drive/MyDrive/all-data.csv') -> None:
    """Run the whole pipeline: load, clean, score with both lexicons, compare.

    Args:
        file_path: Location of the CSV dataset. Defaults to the Google
            Drive path used by the notebook, so ``main()`` behaves as before.
    """
    logging.info('Starting the sentiment analysis pipeline')

    df = load_data_from_drive(file_path)
    df = preprocess_data(df)

    # VADER Analysis
    df = vader_sentiment_analysis(df)

    # TextBlob Analysis
    df = textblob_sentiment_analysis(df)

    # Compare Models (also trains and evaluates the ML classifiers)
    compare_models(df)

    logging.info('Sentiment analysis pipeline completed successfully')

# Run the main function
if __name__ == "__main__":
    # Run the main pipeline
    main()

    # Run unit tests after the pipeline has finished.
    # argv=[''] stops unittest from parsing the host process's command-line
    # arguments, and exit=False stops it from calling sys.exit() when the
    # tests are done.
    logging.info('Running unit tests')
    unittest.main(argv=[''], verbosity=2, exit=False)

Mounted at /content/drive


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


--- VADER Performance ---
Accuracy: 0.5435007232899359
Classification Report:
              precision    recall  f1-score   support

    negative       0.41      0.30      0.34       604
     neutral       0.74      0.52      0.61      2872
    positive       0.40      0.71      0.51      1363

    accuracy                           0.54      4839
   macro avg       0.52      0.51      0.49      4839
weighted avg       0.60      0.54      0.55      4839

--- TextBlob Performance ---
Accuracy: 0.4908038851002273
Classification Report:
              precision    recall  f1-score   support

    negative       0.30      0.38      0.34       604
     neutral       0.63      0.54      0.58      2872
    positive       0.36      0.43      0.39      1363

    accuracy                           0.49      4839
   macro avg       0.43      0.45      0.44      4839
weighted avg       0.52      0.49      0.50      4839



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

---Logistic Regression---
Best parameters: {'C': 10, 'max_iter': 100}
--- Logistic Regression Performance ---
Accuracy: 0.9497830130192189
Classification Report:
              precision    recall  f1-score   support

    negative       0.97      0.90      0.93       604
     neutral       0.95      0.98      0.96      2872
    positive       0.95      0.91      0.93      1363

    accuracy                           0.95      4839
   macro avg       0.95      0.93      0.94      4839
weighted avg       0.95      0.95      0.95      4839

---SVM---
Best parameters: {'C': 10, 'kernel': 'rbf'}
--- SVM Performance ---
Accuracy: 0.9532961355651994
Classification Report:
              precision    recall  f1-score   support

    negative       0.98      0.90      0.94       604
     neutral       0.94      0.99      0.96      2872
    positive       0.97      0.91      0.93      1363

    accuracy                           0.95      4839
   macro avg       0.96      0.93      0.95      4839
w


----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK
