In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("NaiveBayes_Example")
    .getOrCreate()
)

# Storage account and container that the wasbs:// base URL below points at.
workspace_default_storage_account = "projectgstoragedfb938a3e"
workspace_default_container = "azureml-blobstore-becc8696-e562-432e-af12-8a5e3e1f9b0f"

# Base URL ends with a slash so file names can be appended directly.
workspace_wasbs_base_url = (
    f"wasbs://{workspace_default_container}"
    f"@{workspace_default_storage_account}.blob.core.windows.net/"
)

StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 45, 6, Finished, Available, Finished)

In [2]:
from pyspark.sql.functions import lit

# Read the two subreddit sentiment datasets from the workspace blob container.
cancer_path = f"{workspace_wasbs_base_url}cancer_subreddit_sentiment.parquet"
cancer_df = spark.read.parquet(cancer_path)

not_cancer_path = f"{workspace_wasbs_base_url}not_cancer_subreddit_sentiment.parquet"
not_cancer = spark.read.parquet(not_cancer_path)

# Tag every row with the dataset it came from; this is the label column.
cancer_df = cancer_df.withColumn("source", lit("cancer"))
not_cancer = not_cancer.withColumn("source", lit("non_cancer"))

# DataFrame.union pairs columns by POSITION, not by name, so combining the full
# frames is only correct when both parquet files share the exact same column
# order. Project each side down to the needed columns first and combine by name
# so the result does not depend on the files' schemas lining up.
label_columns = ["text", "source"]
df = cancer_df.select(*label_columns).unionByName(not_cancer.select(*label_columns))

df.show()

StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 45, 7, Finished, Available, Finished)

+--------------------+------+
|                text|source|
+--------------------+------+
|Check out Northsi...|cancer|
|I had something s...|cancer|
|That's an insulti...|cancer|
|Yeah sorry, it wa...|cancer|
|I see my colorect...|cancer|
|The couple of ran...|cancer|
|I’ve encountered ...|cancer|
|I 100% agree with...|cancer|
|You should not ha...|cancer|
|**Your post has b...|cancer|
|Completely agree ...|cancer|
|Butt's have oil s...|cancer|
|I just found I ha...|cancer|
|in the mid 2000s ...|cancer|
|If you know all y...|cancer|
|Ultimately, us nu...|cancer|
|I had one the siz...|cancer|
|yeah I see that n...|cancer|
|This has been a r...|cancer|
|Day to day is dif...|cancer|
+--------------------+------+
only showing top 20 rows



In [3]:
import pandas as pd
import nrclex
import nltk
from nltk.tokenize import word_tokenize
from multiprocessing import Pool

# Fetch the NLTK 'punkt' tokenizer resource.
nltk.download("punkt")

# Shared module-level NRC lexicon analyzer; get_sentiment() reads this global.
# NOTE(review): built without arguments -- confirm the installed nrclex release
# supports a no-argument constructor.
nrc_lex = nrclex.NRCLex()

# Score one piece of text with the shared NRC lexicon analyzer.
def get_sentiment(text):
    """Return the positive/negative/neutral scores for ``text``.

    Args:
        text: The value to score. Anything that is not a ``str`` (for example
            ``None`` or a number) is not analyzed.

    Returns:
        dict: Keys ``"positive"``, ``"negative"`` and ``"neutral"``. Each value
        is taken from the analyzer's result, defaulting to 0 when the key is
        absent. Non-string input yields all zeros.
    """
    if not isinstance(text, str):
        return {"positive": 0, "negative": 0, "neutral": 0}

    # The result is stored on the shared module-level analyzer before reading it.
    # NOTE(review): confirm the installed nrclex release exposes `analyze` and
    # reports a "neutral" key; any key it does not report comes back as 0 here.
    nrc_lex.affect_frequencies = nrc_lex.analyze(text)
    frequencies = nrc_lex.affect_frequencies
    return {
        label: frequencies.get(label, 0)
        for label in ("positive", "negative", "neutral")
    }

# Apply NRC lexicon sentiment scoring to every row of a DataFrame chunk.
def process_chunk(chunk, text_col="text"):
    """Return a copy of ``chunk`` with positive/negative/neutral score columns.

    Args:
        chunk (pd.DataFrame): Rows to score. It is not modified; chunks are
            usually ``.iloc`` slices of a larger frame, and writing columns
            into such a slice triggers pandas' SettingWithCopy warning.
        text_col (str): Name of the column holding the text. Defaults to
            ``"text"``, the column the notebook's data actually carries. If
            that column is absent but the old placeholder name
            ``"text_column"`` is present, the placeholder is used instead so
            existing callers keep working.

    Returns:
        pd.DataFrame: A new frame with the original columns plus
        ``positive``, ``negative`` and ``neutral``, one value per input row.

    Raises:
        KeyError: If neither ``text_col`` nor the fallback column exists.
    """
    if text_col not in chunk.columns and "text_column" in chunk.columns:
        text_col = "text_column"

    sentiment_keys = ["positive", "negative", "neutral"]
    # Build all score rows in one pass; this replaces the much slower
    # per-row ``.apply(pd.Series)`` expansion.
    scores = pd.DataFrame(
        [get_sentiment(value) for value in chunk[text_col]],
        columns=sentiment_keys,
    )
    # Assign by position (to_numpy) so the chunk's own index is preserved
    # regardless of what index labels the slice carries.
    return chunk.assign(**{key: scores[key].to_numpy() for key in sentiment_keys})

# Step 1: Get the data as a pandas DataFrame.
# The scoring helpers and the chunking below use pandas APIs (len, .iloc), so a
# Spark DataFrame left in `df` by an earlier cell must be collected to the driver
# first. Only the columns used downstream are collected to limit driver memory.
if hasattr(df, "toPandas"):
    sentiment_input = df.select("text", "source").toPandas()
else:
    sentiment_input = df

# Step 2: Split the DataFrame into at most `num_chunks` row slices.
num_chunks = 4  # Number of worker processes / target number of chunks
# Ceiling division keeps the chunk count <= num_chunks; max(1, ...) avoids a
# zero step in range() when there are fewer rows than workers.
chunk_size = max(1, -(-len(sentiment_input) // num_chunks))
chunks = [
    sentiment_input.iloc[start:start + chunk_size]
    for start in range(0, len(sentiment_input), chunk_size)
]

# Step 3: Score the chunks in parallel and stitch them back together.
if chunks:
    with Pool(processes=min(num_chunks, len(chunks))) as pool:
        result_chunks = pool.map(process_chunk, chunks)
    final_df = pd.concat(result_chunks, ignore_index=True)
else:
    # No rows: pd.concat([]) would raise, so score the empty frame directly to
    # still obtain the sentiment columns in the output.
    result_chunks = []
    final_df = process_chunk(sentiment_input)

# Step 4: Save the scored DataFrame to a new Parquet file.
final_df.to_parquet('sentiment_analysis_parallel_output.parquet')

# Step 5: Show the first few rows as a sanity check.
final_df.head()


StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 45, 8, Finished, Available, Finished)

ModuleNotFoundError: No module named 'nrclex'