# In [4]:
import dask
import numpy as np
import pandas as pd

# In [None]:
import time
import dask.dataframe as dd
from nltk.sentiment import SentimentIntensityAnalyzer
from dask.distributed import Client, progress
from dask.diagnostics import ProgressBar, ResourceProfiler, Profiler, CacheProfiler

# Module-level sentiment analyzer, created once and reused by every
# analyze_sentiment() call instead of being rebuilt per row.
# NOTE(review): presumably needs the NLTK "vader_lexicon" resource to be
# downloaded beforehand -- confirm in the target environment.
sia = SentimentIntensityAnalyzer()

# Function to apply sentiment analysis
def analyze_sentiment(text):
    """Return the compound polarity score of ``text``.

    Args:
        text: The document to score. Expected to be a ``str``; any other
            value (``None``, ``NaN`` and similar missing-value markers found
            in pandas object columns) is treated as missing.

    Returns:
        The ``'compound'`` entry of ``sia.polarity_scores(text)``, or
        ``float('nan')`` when ``text`` is not a string.
    """
    # A single missing cell would otherwise raise inside the analyzer and
    # abort the whole partition; NaN keeps the column float-typed.
    if not isinstance(text, str):
        return float("nan")
    return sia.polarity_scores(text)['compound']

# Function to process the DataFrame
def process_data(df, npartitions=None):
    """Score the ``Body`` column of ``df`` with Dask and time the run.

    Args:
        df: pandas DataFrame containing a ``Body`` column; each cell is
            passed to ``analyze_sentiment``.
        npartitions: Number of Dask partitions to split ``df`` into. When
            ``None`` (the default), the CPU count reported by
            ``os.cpu_count()`` is used so the work can actually be spread
            out.

    Returns:
        A ``(result, total_time)`` tuple. ``result`` is the computed pandas
        DataFrame with an added ``sentiment`` column; ``total_time`` is the
        elapsed time in seconds from building the Dask DataFrame until the
        persisted result has finished computing.
    """
    # Function-scope imports keep this block self-contained.
    import os
    from dask.distributed import wait

    # dd.from_pandas needs an explicit partitioning; forwarding None either
    # fails or collapses to a single partition depending on the Dask version.
    if npartitions is None:
        npartitions = os.cpu_count() or 1

    start_time = time.perf_counter()

    # Convert DataFrame to Dask DataFrame
    ddf = dd.from_pandas(df, npartitions=npartitions)

    # Apply sentiment analysis; the meta name stays 'Body' because the mapped
    # series is derived from ddf['Body'] before being assigned as a new column.
    ddf['sentiment'] = ddf['Body'].map(analyze_sentiment, meta=('Body', 'float64'))

    # Use Dask diagnostics for monitoring
    # NOTE(review): the dask.diagnostics callbacks target the single-machine
    # schedulers; under a distributed Client they likely record nothing --
    # confirm whether the dashboard should be used instead.
    with ProgressBar(), ResourceProfiler() as rprof, Profiler() as prof, CacheProfiler() as cprof:
        # Start the computation and keep the result in (cluster) memory
        result = ddf.persist()
        progress(result)  # Optional: to monitor progress
        # persist() may return before the work is done, and progress() does
        # not necessarily block; wait so the timing covers the computation.
        wait(result)

    total_time = time.perf_counter() - start_time

    # Print performance metrics
    print(f"Total processing time: {total_time} seconds")
    rprof.visualize()  # Visualize resource usage
    prof.visualize()   # Visualize the profiling of task execution
    cprof.visualize()  # Visualize cache profiling

    return result.compute(), total_time

# Example usage of the script
if __name__ == "__main__":
    # Your DataFrame (assuming it's already loaded)
    # df_posts = ...
    # NOTE: df_posts is not defined in the visible code; it must exist before
    # this block runs.

    # Configure Dask client. Using it as a context manager guarantees the
    # client is closed even if process_data raises.
    with Client(n_workers=4, threads_per_worker=1, memory_limit='2GB') as client:
        print("Running with Dask parallel processing...")
        parallel_result, parallel_time = process_data(df_posts)
        print(f"Dask parallel processing time: {parallel_time} seconds")