## Sentiment Analysis

In [28]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import re

# Download VADER lexicon (if not already downloaded)
nltk.download('vader_lexicon')

# Function to apply VADER sentiment analysis
def analyze_sentiment_vader(text):
    """
    Applies the VADER algorithm to a given text and returns the sentiment label and compound score.
    """
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(text)
    compound_score = vs['compound']

    if compound_score >= 0.05:
        sentiment_label = "Positive"
    elif compound_score <= -0.05:
        sentiment_label = "Negative"
    else:
        sentiment_label = "Neutral"

    return sentiment_label, compound_score

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [33]:
# Load the dataset (replace 'tweets-data.csv' with your actual file path if different)
# Make sure 'tweets-data.csv' is in the same directory as your notebook or provide the full path.
try:
    df = pd.read_csv('tweets-data.csv')
except FileNotFoundError:
    print("Error: 'tweets-data.csv' not found. Please ensure the file is in the correct directory.")
    # Create a dummy DataFrame for demonstration if the file is not found
    data = {'text': [
        "This is a great movie!",
        "I hate this product, it's terrible.",
        "The weather is just okay today.",
        "Feeling very happy and excited about this new opportunity.",
        "Absolutely disgusted by the service here.",
        "It's raining outside.",
        "Wonderful news!",
        "Such a frustrating experience.",
        "Neutral statement about an event.",
        "Best day ever!"
    ]}
    df = pd.DataFrame(data)
    print("Using a dummy DataFrame for demonstration.")

# Take a sample of 500 rows if the DataFrame has more than 500 rows
if len(df) > 500:
    df_sample = df.sample(n=500, random_state=42).copy() # Use .copy() to avoid SettingWithCopyWarning
else:
    df_sample = df.copy() # Use .copy() even if not sampling to ensure independent copy

# Clean the tweet text
def clean_tweet_text(text):
    """
    Cleans the tweet text by removing URLs, mentions, hashtags, and special characters.
    It also ensures that tokens are joined with a single space.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'@\w+', '', text) # Remove mentions
    text = re.sub(r'#\w+', '', text) # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove special characters, keep letters and spaces
    text = ' '.join(text.split()) # Replace multiple spaces with a single space
    return text.strip() # Remove leading/trailing whitespace

df_sample['cleaned_text'] = df_sample['Tweets'].apply(clean_tweet_text)

# Apply the sentiment analysis function to the cleaned text
df_sample[['sentiment_label', 'sentiment_score']] = df_sample['cleaned_text'].apply(
    lambda x: pd.Series(analyze_sentiment_vader(x))
)

# Display the first few rows with the new columns
print("\nDataFrame with Sentiment Analysis Results:")
print(df_sample[['Tweets', 'cleaned_text', 'sentiment_label', 'sentiment_score']].head())

# Optional: Display value counts for sentiment labels
print("\nSentiment Label Distribution:")
print(df_sample['sentiment_label'].value_counts())


DataFrame with Sentiment Analysis Results:
                                                 Tweets  \
2899  Le #DessinDePresse de Sanaga : ls sont morts c...   
594   #Russia #Wagner #RussiaCivilWar https://t.co/P...   
2870  Exclusive content -https://t.co/oEiSIIB2Z1\n.\...   
52    Auch heute geht die politische Nachricht des T...   
1391  @crazyclipsonly Same type that would take a ho...   

                                           cleaned_text sentiment_label  \
2899  Le de Sanaga ls sont morts comme ils ont vcu R...         Neutral   
594                                                             Neutral   
2870                                  Exclusive content        Positive   
52    Auch heute geht die politische Nachricht des T...        Negative   
1391  Same type that would take a homemade PlayStati...         Neutral   

      sentiment_score  
2899           0.0000  
594            0.0000  
2870           0.1280  
52            -0.5994  
1391           0.0000  

Senti

In [37]:
import pandas as pd
from transformers import pipeline
import re
import os

# Install transformers if you haven't already
# !pip install transformers

# Load the dataset (replace 'tweets-data.csv' with your actual file path if different)
try:
    df = pd.read_csv('tweets-data.csv')
except FileNotFoundError:
    print("Error: 'tweets-data.csv' not found. Please ensure the file is in the correct directory.")
    # Create a dummy DataFrame for demonstration if the file is not found
    data = {'text': [
        "This is a great movie!",
        "I hate this product, it's terrible.",
        "The weather is just okay today.",
        "Feeling very happy and excited about this new opportunity.",
        "Absolutely disgusted by the service here.",
        "It's raining outside.",
        "Wonderful news!",
        "Such a frustrating experience.",
        "Neutral statement about an event.",
        "Best day ever!"
    ]}
    df = pd.DataFrame(data)
    print("Using a dummy DataFrame for demonstration.")

# Take a sample of 500 rows if the DataFrame has more than 500 rows
if len(df) > 500:
    df_sample = df.sample(n=500, random_state=42).copy() # Use .copy() to avoid SettingWithCopyWarning
else:
    df_sample = df.copy() # Use .copy() even if not sampling to ensure independent copy

# --- Debugging Step 1: Check the size of df_sample ---
print(f"Size of df_sample before cleaning: {len(df_sample)} rows")


# Clean the tweet text
def clean_tweet_text(text):
    """
    Cleans the tweet text by removing URLs, mentions, hashtags, and special characters.
    It also ensures that tokens are joined with a single space.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'@\w+', '', text) # Remove mentions
    text = re.sub(r'#\w+', '', text) # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove special characters, keep letters and spaces
    text = ' '.join(text.split()) # Replace multiple spaces with a single space
    return text.strip() # Remove leading/trailing whitespace

df_sample['cleaned_text'] = df_sample['Tweets'].apply(clean_tweet_text)

# --- Debugging Step 2: Check for empty strings after cleaning ---
empty_cleaned_texts = df_sample['cleaned_text'].apply(lambda x: x == "").sum()
print(f"Number of empty cleaned texts: {empty_cleaned_texts}")


# Initialize the sentiment analysis pipeline
print("Initializing sentiment analysis pipeline (this may download a model if it's the first time)...")
try:
    sentiment_pipeline = pipeline("sentiment-analysis")
    print("Sentiment analysis pipeline initialized.")
except Exception as e:
    print(f"Error initializing pipeline: {e}")
    print("Please ensure you have an active internet connection for the first run to download the model.")
    exit()

print("Performing sentiment analysis on tweets one by one...")
sentiment_labels = []
sentiment_scores = []

# Iterate through each cleaned text
for index, cleaned_text in df_sample['cleaned_text'].items():
    try:
        # Pass each text as a list with one element to the pipeline
        # The pipeline expects an iterable of strings
        results = sentiment_pipeline([cleaned_text])
        # The result is a list of dictionaries, so we take the first element
        result = results[0]
        sentiment_labels.append(result['label'])
        sentiment_scores.append(result['score'])
    except Exception as e:
        print(f"Error processing text at index {index} ('{cleaned_text}'): {e}")
        # Append placeholder results to maintain length consistency
        sentiment_labels.append('ERROR')
        sentiment_scores.append(0.0)

print("Sentiment analysis complete.")

# --- Debugging Step 3: Critical Check for Mismatched Lengths ---
if len(sentiment_labels) != len(df_sample):
    print(f"Mismatch detected! Length of sentiment_labels ({len(sentiment_labels)}) does not match df_sample ({len(df_sample)})")
    # This should ideally not happen when processing one by one and appending placeholders.
    # If it does, there's a more fundamental issue with how results are being collected.
    # For robust handling, if this check still fails:
    diff = len(df_sample) - len(sentiment_labels)
    if diff > 0:
        print(f"Padding sentiment results with {diff} neutral entries.")
        sentiment_labels.extend(['Neutral'] * diff)
        sentiment_scores.extend([0.0] * diff)
    elif diff < 0:
        print(f"Truncating sentiment results by {-diff} entries.")
        sentiment_labels = sentiment_labels[:len(df_sample)]
        sentiment_scores = sentiment_scores[:len(df_sample)]


# Add new columns to the DataFrame
df_sample['sentiment_label_ml'] = sentiment_labels
df_sample['sentiment_score_ml'] = sentiment_scores

# Display the first few rows with the new columns
print("\nDataFrame with Machine Learning Sentiment Analysis Results:")
print(df_sample[['Tweets', 'cleaned_text', 'sentiment_label_ml', 'sentiment_score_ml']].head())

# Optional: Display value counts for sentiment labels
print("\nSentiment Label Distribution (ML Model):")
print(df_sample['sentiment_label_ml'].value_counts())

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Size of df_sample before cleaning: 500 rows
Number of empty cleaned texts: 74
Initializing sentiment analysis pipeline (this may download a model if it's the first time)...


Device set to use cpu


Sentiment analysis pipeline initialized.
Performing sentiment analysis on tweets one by one...


Token indices sequence length is longer than the specified maximum sequence length for this model (934 > 512). Running this sequence through the model will result in indexing errors


Error processing text at index 188 ('La Rebelin del Grupo Wagner Un Desafo Crtico para el Gobierno Ruso La rebelin del Grupo Wagner ha sacudido el establishment poltico ruso Este grupo un contratista militar privado con estrechos vnculos con el Kremlin ha sido acusado de llevar a cabo numerosas atrocidades en Ucrania y Siria La rebelin donde un grupo de combatientes de Wagner se amotinaron contra el comando ruso oficial ha planteado serios interrogantes sobre la lealtad del grupo hacia el gobierno ruso La rebelin comenz el de junio de cuando un grupo de combatientes de Wagner se neg a obedecer rdenes de llevar a cabo una misin en Ucrania Los combatientes al parecer descontentos con su remuneracin y condiciones tomaron como rehenes a sus comandantes y exigieron ser llevados de regreso a Rusia Finalmente la rebelin fue sofocada por fuerzas especiales rusas aunque se hablo de una negociacin pero lo cierto es que ha dejado un interrogante en el panorama poltico ruso La rebelin ha planteado