In [None]:
import sparknlp
import pyspark
from pyspark.sql import SparkSession

print("Spark NLP version: ", sparknlp.version())

## You need to add the spark-nlp jar to the spark session

spark = SparkSession.builder \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1") \
    .getOrCreate()

In [1]:
workspace_default_storage_account = "projectgstoragedfb938a3e"
workspace_default_container = "azureml-blobstore-becc8696-e562-432e-af12-8a5e3e1f9b0f"
workspace_wasbs_base_url = f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/"

StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 46, 6, Finished, Available, Finished)

In [2]:
from pyspark.sql.functions import lit

cancer_path = f"{workspace_wasbs_base_url}cancer_subreddit_sentiment.parquet"
cancer_df = spark.read.parquet(cancer_path)

not_cancer_path = f"{workspace_wasbs_base_url}not_cancer_subreddit_sentiment.parquet"
not_cancer = spark.read.parquet(not_cancer_path)

cancer_df = cancer_df.withColumn("source", lit("cancer"))

not_cancer = not_cancer.withColumn("source", lit("non_cancer"))

df = cancer_df.union(not_cancer)
df = df.select('text', 'source')

df.show()

StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 46, 7, Finished, Available, Finished)

+--------------------+------+
|                text|source|
+--------------------+------+
|Check out Northsi...|cancer|
|I had something s...|cancer|
|That's an insulti...|cancer|
|Yeah sorry, it wa...|cancer|
|I see my colorect...|cancer|
|The couple of ran...|cancer|
|I’ve encountered ...|cancer|
|I 100% agree with...|cancer|
|You should not ha...|cancer|
|**Your post has b...|cancer|
|Completely agree ...|cancer|
|Butt's have oil s...|cancer|
|I just found I ha...|cancer|
|in the mid 2000s ...|cancer|
|If you know all y...|cancer|
|Ultimately, us nu...|cancer|
|I had one the siz...|cancer|
|yeah I see that n...|cancer|
|This has been a r...|cancer|
|Day to day is dif...|cancer|
+--------------------+------+
only showing top 20 rows



In [3]:
%pip install nrclex

StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 46, 12, Finished, Available, Finished)

Collecting nrclex
  Downloading NRCLex-4.0-py3-none-any.whl (4.4 kB)
  Downloading NRCLex-3.0.0.tar.gz (396 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.4/396.4 KB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: nrclex
  Building wheel for nrclex (setup.py) ... [?25l- \ | done
[?25h  Created wheel for nrclex: filename=NRCLex-3.0.0-py3-none-any.whl size=43311 sha256=7fc78cec826b6b074a89b75b73dbb1c40bc0608836d8729c5daadcfae654b51f
  Stored in directory: /home/trusted-service-user/.cache/pip/wheels/d2/10/44/6abfb1234298806a145fd6bcaec8cbc712e88dd1cd6cb242fa
Successfully built nrclex
Installing collected packages: nrclex
Successfully installed nrclex-3.0.0
You should consider upgrading via the '/nfs4/pyenv-07a6af9c-1663-4ec1-9220-bf0c45c7f436/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packag




In [4]:
import pandas as pd
import nrclex
import nltk
from nltk.tokenize import word_tokenize
from multiprocessing import Pool

# Download NLTK data for tokenization
nltk.download('punkt')

# Initialize NRC lexicon
nrc_lex = nrclex.NRCLex()

# Define a function to get sentiment from NRC lexicon
def get_sentiment(text):
    if isinstance(text, str):
        nrc_lex.affect_frequencies = nrc_lex.analyze(text)
        # Extracting positive, negative, and neutral sentiment
        sentiment = {
            "positive": nrc_lex.affect_frequencies.get("positive", 0),
            "negative": nrc_lex.affect_frequencies.get("negative", 0),
            "neutral": nrc_lex.affect_frequencies.get("neutral", 0)
        }
        return sentiment
    return {"positive": 0, "negative": 0, "neutral": 0}

# Function to apply NRC lexicon sentiment analysis on a DataFrame chunk
def process_chunk(chunk):
    chunk[['positive', 'negative', 'neutral']] = chunk['text_column'].apply(get_sentiment).apply(pd.Series)
    return chunk

# Step 1: Load the Parquet file into a Pandas DataFrame
#parquet_path = 'your_parquet_file.parquet'
#df = pd.read_parquet(parquet_path)

# Step 2: Split the DataFrame into chunks (for example, split into 4 chunks)
num_chunks = 4  # Number of chunks to split the DataFrame
chunk_size = len(df) // num_chunks
chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

# Step 3: Set up the multiprocessing pool and apply the function to each chunk
with Pool(processes=num_chunks) as pool:
    result_chunks = pool.map(process_chunk, chunks)

# Step 4: Combine the processed chunks back into a single DataFrame
final_df = pd.concat(result_chunks, ignore_index=True)

# Step 5: Save the updated DataFrame to a new Parquet file
final_df.to_parquet('sentiment_analysis_parallel_output.parquet')

# Optional: Print the first few rows to check the results
print(final_df.head())


StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 46, 18, Finished, Available, Finished)

[nltk_data] Downloading package punkt to /home/trusted-service-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


TypeError: NRCLex.__init__() missing 1 required positional argument: 'text'