In [None]:
from dask.distributed import Client
from nltk.sentiment import SentimentIntensityAnalyzer

import dask.dataframe as dd
import psutil  # For memory monitoring
import pandas as pd
import pickle as pkl
import time

In [None]:
# Load the three pickled post shards. read_pickle executes whatever the file
# encodes, so only point these paths at files you produced or trust.
df_posts1, df_posts2, df_posts3 = (
    pd.read_pickle('./pickle_dataframes/posts1.pkl'),
    pd.read_pickle('./pickle_dataframes/posts2.pkl'),
    pd.read_pickle('./pickle_dataframes/posts3.pkl'),
)

# Stack the shards; ignore_index=True gives the combined frame a fresh
# 0..n-1 index in the same step.
df_posts = pd.concat([df_posts1, df_posts2, df_posts3], ignore_index=True)

# Keep only rows with PostTypeId == 1 (presumably questions -- confirm against
# the dataset schema). The surviving rows keep their pre-filter index labels.
df_posts = df_posts.loc[df_posts['PostTypeId'].eq(1)]

# Optional: uncomment to work on a 25% random sample while iterating.
#df_posts = df_posts.sample(frac=0.25)

### Estimate memory usage

In [None]:
# Build a single module-level SentimentIntensityAnalyzer so that
# analyze_sentiment reuses it instead of constructing one per text.
# NOTE(review): construction presumably needs the NLTK `vader_lexicon`
# resource to be installed already -- confirm on a fresh environment.
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the compound polarity score of one text.

    Parameters
    ----------
    text : str
        The text to score. Any non-string value (for example ``None`` or a
        float NaN from a missing cell) is treated as "no text".

    Returns
    -------
    float
        The ``'compound'`` entry of ``sia.polarity_scores(text)``, or NaN
        when ``text`` is not a string.
    """
    # A missing body must not abort the whole partition: the analyzer only
    # accepts strings, so map everything else to NaN, which still fits the
    # float64 dtype declared for the resulting column.
    if not isinstance(text, str):
        return float('nan')
    return sia.polarity_scores(text)['compound']

def process_data(df, npartitions=None):
    """Score every ``Body`` text through Dask and time the run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Body`` column with the texts to score.
    npartitions : int, optional
        Number of Dask partitions to split ``df`` into. ``None`` (the
        default) means a single partition.

    Returns
    -------
    tuple
        ``(result, elapsed_seconds, memory_used_bytes)`` where ``result`` is
        the computed pandas DataFrame with an added ``sentiment`` column,
        ``elapsed_seconds`` covers partitioning plus computation, and
        ``memory_used_bytes`` is ``psutil.virtual_memory().used`` sampled
        right after the computation.
    """
    # perf_counter is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()

    # A single partition yields a single task, i.e. no parallelism.
    if npartitions is None:
        npartitions = 1

    # Convert DataFrame to Dask DataFrame
    ddf = dd.from_pandas(df, npartitions=npartitions)
    # meta declares the output series' name and dtype up front so Dask does
    # not have to infer them by calling the scorer on dummy data.
    ddf['sentiment'] = ddf['Body'].map(analyze_sentiment, meta=('sentiment', 'float64'))

    # compute() is called without an explicit scheduler, so it uses whatever
    # scheduler is active in the caller's context.
    result = ddf.compute()
    elapsed = time.perf_counter() - start_time

    # Sampled after the timer stops so the probe is not part of the timing.
    # virtual_memory() reports system-wide figures, not only this process.
    memory_used = psutil.virtual_memory().used

    return result, elapsed, memory_used

In [None]:
# Sequential baseline: process_data defaults to one partition, so the whole
# frame is scored as a single task.
print("Running sequentially...")
seq_result, seq_time, _ = process_data(df_posts)
print(f"Sequential processing time: {seq_time} seconds")

# Parallel runs: for each core count, start a local cluster, split the data
# into that many partitions, and compare against the sequential baseline.
core_counts = [2, 4, 6, 8, 10]
for cores in core_counts:
    print(f"Running with {cores} cores...")
    # One single-threaded worker process per "core". The efficiency formula
    # below divides by `cores` and the data is split into `cores` partitions,
    # so the cluster must expose exactly `cores` execution slots. With two
    # threads per worker, two partitions could land in the same process and
    # share it, which skews the measurement.
    with Client(n_workers=cores, threads_per_worker=1) as client:
        _, parallel_time, mem_usage = process_data(df_posts, npartitions=cores)
        # Parallel efficiency = speedup / cores; 1.0 would be ideal scaling.
        efficiency = seq_time / (cores * parallel_time)
        print(f"Time with {cores} cores: {parallel_time} seconds, Efficiency: {efficiency}, Memory used: {mem_usage} bytes")