In [14]:
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from nltk.sentiment import SentimentIntensityAnalyzer

import dask.dataframe as dd
import psutil
import pandas as pd
import pickle as pkl
import time

In [15]:
# Load the pickled comment shards and stitch them into one frame with a fresh 0..n-1 index.
# NOTE(review): read_pickle unpickles arbitrary objects -- only point it at trusted files.
df_comments1 = pd.read_pickle('./pickle_dataframes/comments1.pkl')
df_comments2 = pd.read_pickle('./pickle_dataframes/comments2.pkl')
df_comments = pd.concat([df_comments1, df_comments2]).reset_index(drop=True)

# Same treatment for the three post shards.
df_posts1 = pd.read_pickle('./pickle_dataframes/posts1.pkl')
df_posts2 = pd.read_pickle('./pickle_dataframes/posts2.pkl')
df_posts3 = pd.read_pickle('./pickle_dataframes/posts3.pkl')
df_posts = pd.concat([df_posts1, df_posts2, df_posts3]).reset_index(drop=True)

# Tables stored as a single pickle each.
df_postlinks = pd.read_pickle('./pickle_dataframes/posts_links.pkl')
df_tags = pd.read_pickle('./pickle_dataframes/tags.pkl')
df_users = pd.read_pickle('./pickle_dataframes/users.pkl')

# Keep only posts whose PostTypeId is 1
# (presumably questions in the source schema -- TODO confirm).
df_posts = df_posts[df_posts['PostTypeId'].eq(1)]
# Optional subsample for quicker experiments:
#df_posts = df_posts.sample(frac=0.25)

# Discard rows whose user column holds -1
# (presumably a placeholder for a missing/unknown user -- TODO confirm).
# The index is not reset again after these filters.
df_comments = df_comments[df_comments['UserId'].ne(-1)]
df_posts = df_posts[df_posts['OwnerUserId'].ne(-1)]

### Benchmark processing time, parallel efficiency, and memory usage

In [7]:
# Create the SentimentIntensityAnalyzer once; every call to analyze_sentiment reuses it.
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the compound polarity score of ``text``.

    Args:
        text: The text to score. Expected to be a ``str``.

    Returns:
        The ``'compound'`` entry of ``sia.polarity_scores(text)``, or ``nan``
        when ``text`` is not a string (for example a missing value in a
        DataFrame column).
    """
    # A non-string cell (NaN/None) would make polarity_scores raise and abort
    # the whole mapped computation; report it as "no score" instead.
    if not isinstance(text, str):
        return float('nan')
    return sia.polarity_scores(text)['compound']

def process_data(df, npartitions=None):
    """Score ``df['Body']`` with ``analyze_sentiment`` via Dask and time the run.

    Args:
        df: pandas DataFrame that contains a ``'Body'`` column.
        npartitions: Number of Dask partitions to split ``df`` into.
            ``None`` (the default) means a single partition, which is what
            the sequential baseline uses.

    Returns:
        A ``(result, elapsed_seconds, memory_used)`` tuple:

        * ``result`` -- the computed pandas DataFrame with an added
          ``'sentiment'`` column.
        * ``elapsed_seconds`` -- time from before the pandas-to-Dask
          conversion until ``compute()`` returns.
        * ``memory_used`` -- ``psutil.virtual_memory().used`` sampled once
          after the computation. This is a system-wide snapshot, not a peak
          value and not specific to this process or its workers.
    """
    # perf_counter is monotonic, so the interval cannot be distorted by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # If npartitions is not specified, default to 1 (suitable for sequential processing)
    if npartitions is None:
        npartitions = 1

    # Convert DataFrame to Dask DataFrame
    ddf = dd.from_pandas(df, npartitions=npartitions)
    # Apply sentiment analysis; meta declares the output as a float64 series
    ddf['sentiment'] = ddf['Body'].map(analyze_sentiment, meta=('Body', 'float64'))

    # Materialize the result, then stop the clock before probing memory so the
    # probe itself is not part of the measured time.
    result = ddf.compute()
    elapsed = time.perf_counter() - start_time

    memory_usage = psutil.virtual_memory()
    return result, elapsed, memory_usage.used

In [9]:
# Baseline run: process_data with its default partition count and no distributed Client.
print("Running sequentially...")
seq_result, seq_time, _ = process_data(df_posts)
print(f"Sequential processing time: {seq_time} seconds")

# Scaling runs: for each worker count, start a local Client, split the data into
# as many partitions as workers, and compare against the baseline time.
core_counts = [2, 4, 6, 7, 8, 10]
for cores in core_counts:
    print(f"Running with {cores} cores...")
    # threads_per_worker can be tuned as needed; the Client is closed when the block exits.
    with Client(threads_per_worker=2, n_workers=cores) as client:
        _, parallel_time, mem_usage = process_data(df_posts, npartitions=cores)
        # Efficiency = baseline time divided by (workers x parallel time).
        worker_seconds = cores * parallel_time
        efficiency = seq_time / worker_seconds
        print(f"Time with {cores} cores: {parallel_time} seconds, Efficiency: {efficiency}, Memory used: {mem_usage} bytes")

Running sequentially...
Sequential processing time: 23.122904062271118 seconds
Running with 2 cores...


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Time with 2 cores: 15.003249406814575 seconds, Efficiency: 0.770596536633209, Memory used: 13127147520 bytes
Running with 4 cores...


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Time with 4 cores: 9.33157753944397 seconds, Efficiency: 0.6194800387322538, Memory used: 13367934976 bytes
Running with 6 cores...


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Time with 6 cores: 8.02925729751587 seconds, Efficiency: 0.4799718331238639, Memory used: 13698764800 bytes
Running with 7 cores...


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Time with 7 cores: 7.701604604721069 seconds, Efficiency: 0.42890698476924854, Memory used: 13861740544 bytes
Running with 8 cores...


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Time with 8 cores: 7.600461483001709 seconds, Efficiency: 0.3802878304492606, Memory used: 14376480768 bytes
Running with 10 cores...


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Time with 10 cores: 7.4803972244262695 seconds, Efficiency: 0.30911331803030817, Memory used: 14571917312 bytes


### Sentiment Analysis Time

In [None]:
# Convert the pandas DataFrames to Dask DataFrames (10 partitions each; adjust as needed)
df_comments_dask = dd.from_pandas(df_comments, npartitions=10)
df_posts_dask = dd.from_pandas(df_posts, npartitions=10)

# Apply sentiment analysis.
# `meta` is passed explicitly, as process_data does, so Dask knows each result is a
# float64 series without having to infer it by running analyze_sentiment on dummy data.
df_comments_dask['sentiment'] = df_comments_dask['Text'].map(analyze_sentiment, meta=('Text', 'float64'))
df_posts_dask['body_sentiment'] = df_posts_dask['Body'].map(analyze_sentiment, meta=('Body', 'float64'))
df_posts_dask['title_sentiment'] = df_posts_dask['Title'].map(analyze_sentiment, meta=('Title', 'float64'))

# Compute the results with a progress bar; each compute() returns a pandas DataFrame
with ProgressBar():
    df_comments_result = df_comments_dask.compute()
    df_posts_result = df_posts_dask.compute()