In [4]:
import dask
import numpy as np
import pandas as pd

In [None]:
import time
import dask.dataframe as dd
from nltk.sentiment import SentimentIntensityAnalyzer
from dask.distributed import Client

# Function to apply sentiment analysis
def analyze_sentiment(text):
    """Return the VADER compound sentiment score for *text*.

    Args:
        text: The text to score; it is passed unchanged to
            ``SentimentIntensityAnalyzer.polarity_scores``.

    Returns:
        The value stored under the ``'compound'`` key of the polarity scores.
    """
    # Build the analyzer once and reuse it. Constructing a new
    # SentimentIntensityAnalyzer for every row repeats its setup work on each
    # call, which dominates the runtime and distorts the benchmark.
    sia = getattr(analyze_sentiment, "_sia", None)
    if sia is None:
        sia = analyze_sentiment._sia = SentimentIntensityAnalyzer()
    return sia.polarity_scores(text)['compound']

# Function to process the DataFrame
def process_data(df, npartitions=None):
    """Score the ``'Body'`` column of *df* with dask and time the run.

    Args:
        df: pandas DataFrame containing a ``'Body'`` column of texts.
        npartitions: Number of dask partitions to split *df* into. ``None``
            (the default) means a single partition.

    Returns:
        A ``(result, elapsed)`` tuple: the computed pandas DataFrame with an
        added ``'sentiment'`` column, and the elapsed time in seconds
        (conversion to dask, graph construction and compute included).
    """
    # dd.from_pandas in older dask releases raises ValueError when neither
    # npartitions nor chunksize is given, so make the default explicit.
    if npartitions is None:
        npartitions = 1

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    # Convert DataFrame to Dask DataFrame
    ddf = dd.from_pandas(df, npartitions=npartitions)
    # Apply sentiment analysis; meta declares the output as a float64 series.
    ddf['sentiment'] = ddf['Body'].map(
        analyze_sentiment, meta=('sentiment', 'float64')
    )

    # Compute result (this is where the work actually runs)
    result = ddf.compute()

    return result, time.perf_counter() - start_time

# Your DataFrame (assuming it's already loaded)
# df_posts = ...

# Baseline: a single run with process_data's default partitioning; no Client
# is created in this cell before this call.
print("Running sequentially...")
seq_result, seq_time = process_data(df_posts)
print(f"Sequential processing time: {seq_time} seconds")

# Scaling runs: for each core count, start a Client with that many workers
# (one thread each) and split the data into the same number of partitions.
core_counts = list(range(2, 11, 2))
for cores in core_counts:
    print(f"Running with {cores} cores...")
    worker_options = {"n_workers": cores, "threads_per_worker": 1}
    with Client(**worker_options) as client:
        _, parallel_time = process_data(df_posts, npartitions=cores)
        # Efficiency = baseline time / (core count * parallel time),
        # i.e. the speedup over the baseline divided by the core count.
        efficiency = seq_time / (cores * parallel_time)
        print(f"Time with {cores} cores: {parallel_time} seconds, Efficiency: {efficiency}")