# Review the Results of the NLP Pretrained Model

This model is very basic, but this folder shows you how to create an NLP pipeline with a pretrained model.

The .py script saved its results as Parquet files stored in our container. This notebook reads them back so we can review them.

In [None]:
# Evaluate the `spark` name so the notebook displays it. Presumably this is the
# SparkSession provided by the kernel (the cells below call `spark.read`) — confirm
# it exists before running the rest of the notebook.
spark

In [None]:
from pyspark.sql.functions import col, udf, size
from pyspark.sql.types import FloatType

In [None]:
# Storage account and container names used to build every wasbs:// path in this notebook.
workspace_default_storage_account = "projectgstoragedfb938a3e"
workspace_default_container = "azureml-blobstore-becc8696-e562-432e-af12-8a5e3e1f9b0f"
# Base URL (ends with "/") that the parquet file names below are appended to.
workspace_wasbs_base_url = f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/"

# Full path of the sample-submissions result parquet
nlp_sample_test_path = f"{workspace_wasbs_base_url}nlp_result_sample_submissions.parquet"

# Read the Parquet file back into a dataframe
nlp_sample_test = spark.read.parquet(nlp_sample_test_path)

# Show first 5 rows
nlp_sample_test.show(5)

In [None]:
# Path of the cancer-subreddit sentiment parquet, built on the shared base URL.
cancer_path = f"{workspace_wasbs_base_url}cancer_subreddit_sentiment.parquet"
# Read the Parquet file back into a dataframe
cancer_df = spark.read.parquet(cancer_path)

# Show first 5 rows, then print the column names and types
cancer_df.show(5)
cancer_df.printSchema()

In [None]:
from pyspark.sql import functions as F

# Explode the `sentiment` column so each of its elements gets its own row,
# then keep the text next to that element's `result` field.
exploded_sentiment = F.explode(F.col("sentiment"))
df_flat = (
    cancer_df
    .withColumn("sentiment_result", exploded_sentiment)
    .select("text", "sentiment_result.result")
)

# Display the flattened rows without truncating long values
df_flat.show(truncate=False)

In [None]:
# Preview the flattened frame, then report its dimensions as rows x columns.
df_flat.show(5)
comments_row_count, comment_col_count = df_flat.count(), len(df_flat.columns)
print(f"shape of the comments dataframe is {comments_row_count:,}x{comment_col_count}")

In [None]:
# Print the column names and types of cancer_df.
cancer_df.printSchema()

In [None]:
# Keep the `text` column together with the `result` field nested inside `sentiment`.
# NOTE(review): the selected column is expected to be named `result` and to hold one
# label per element of `sentiment` — confirm against the printed schema.
cancer_sentiment_df = cancer_df.select("text","sentiment.result")


# Show first 5 rows
cancer_sentiment_df.show(5)


In [None]:
def calculate_sentiment_score(results):
    """Return the mean sentiment weight of a list of sentiment labels.

    Each label is mapped to a weight ("positive" -> 1, "negative" -> -1,
    "neutral" -> 0); labels that are not in the table count as 0. The
    weights are averaged, so the score lies between -1.0 and 1.0.

    Args:
        results: An iterable of label strings, or None.

    Returns:
        The average weight as a float. An empty or None input yields 0.0.
    """
    # A null array column reaches a Python UDF as None; treat it like "no labels"
    # instead of failing on iteration.
    if not results:
        # Return a float (not the int 0): the function is registered as a Spark UDF
        # with FloatType, and a Python int result would not match that declared type.
        return 0.0

    # Sentiment weights
    sentiment_weights = {"positive": 1, "negative": -1, "neutral": 0}
    # Map the results to scores, default to 0 if key not found
    scores = [sentiment_weights.get(sent, 0) for sent in results]
    # Calculate normalized score (true division always produces a float)
    return sum(scores) / len(scores)

# Wrap the Python scorer as a Spark UDF whose declared return type is float.
sentiment_score_udf = udf(calculate_sentiment_score, FloatType())

# Score each row's `result` values and store the outcome as `weighted_score`.
cancer_score_expr = sentiment_score_udf(col("result"))
cancer_sentiment_df_with_scores = cancer_sentiment_df.withColumn("weighted_score", cancer_score_expr)

# Display the scored rows without truncating long values
cancer_sentiment_df_with_scores.show(truncate=False)


In [None]:
from pyspark.sql.functions import when

# Turn the numeric score into a label: above zero is positive, below zero is
# negative, and everything else (including exactly zero) is neutral.
score_col = col("weighted_score")
label_col = (
    when(score_col > 0, "positive")
    .when(score_col < 0, "negative")
    .otherwise("neutral")
)

# NOTE: this rebinds `cancer_df` from the raw parquet frame to the labelled frame;
# later cells read `sentiment_label` from it.
cancer_df = cancer_sentiment_df_with_scores.withColumn("sentiment_label", label_col)

cancer_df.select("text", "weighted_score", "sentiment_label").show(truncate=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Count occurrences of each sentiment label in the labelled cancer data.
# `cancer_df` is the frame that carries `sentiment_label` at this point in the
# notebook; the name `sentiment_df_with_labels` is never defined here and would
# raise a NameError.
sentiment_counts = cancer_df.groupBy("sentiment_label").count()

# Step 2: Convert the result to a pandas dataframe (for plotting)
sentiment_counts_pd = sentiment_counts.toPandas()

# Step 3: Create the plot (one bar per sentiment label)
plt.figure(figsize=(8, 6))
sns.barplot(x='sentiment_label', y='count', data=sentiment_counts_pd, palette='viridis')

# Add labels and title
plt.xlabel('Sentiment Label')
plt.ylabel('Count')
plt.title('Sentiment Distribution (Positive, Negative, Neutral)')

# Show the plot
plt.show()


In [None]:
# Path of the non-cancer-subreddit sentiment parquet, built on the shared base URL.
output_path = f"{workspace_wasbs_base_url}not_cancer_subreddit_sentiment.parquet"
# Read the Parquet file back into a dataframe
df_read_back = spark.read.parquet(output_path)

# Show first 5 rows, then print the column names and types
df_read_back.show(5)
df_read_back.printSchema()

In [None]:
# Keep the `text` column together with the `result` field nested inside `sentiment`,
# mirroring the selection made for the cancer data.
sentiment_df = df_read_back.select("text","sentiment.result")

In [None]:
# Apply the sentiment-score UDF to each row's `result` values and store the
# outcome as `weighted_score`.
non_cancer_score_expr = sentiment_score_udf(col("result"))
sentiment_df_with_scores = sentiment_df.withColumn("weighted_score", non_cancer_score_expr)

# Display the scored rows without truncating long values
sentiment_df_with_scores.show(truncate=False)

In [None]:


# Turn the numeric score into a label: above zero is positive, below zero is
# negative, and everything else (including exactly zero) is neutral.
non_cancer_score_col = col("weighted_score")
non_cancer_label_col = (
    when(non_cancer_score_col > 0, "positive")
    .when(non_cancer_score_col < 0, "negative")
    .otherwise("neutral")
)
non_cancer_df = sentiment_df_with_scores.withColumn("sentiment_label", non_cancer_label_col)

# Display text, score and label side by side without truncation
non_cancer_df.select("text", "weighted_score", "sentiment_label").show(truncate=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Count occurrences of each sentiment label in the labelled non-cancer data.
# `non_cancer_df` is the frame that carries `sentiment_label` for this dataset; the
# name `sentiment_df_with_labels` is never defined in this notebook and would raise
# a NameError.
sentiment_counts = non_cancer_df.groupBy("sentiment_label").count()

# Step 2: Convert the result to a pandas dataframe (for plotting)
sentiment_counts_pd = sentiment_counts.toPandas()

# Step 3: Create the plot (one bar per sentiment label)
plt.figure(figsize=(8, 6))
sns.barplot(x='sentiment_label', y='count', data=sentiment_counts_pd, palette='viridis')

# Add labels and title
plt.xlabel('Sentiment Label')
plt.ylabel('Count')
plt.title('Sentiment Distribution (Positive, Negative, Neutral)')

# Show the plot
plt.show()

In [None]:
# Cancer data: rows per sentiment label, brought to pandas for plotting and analysis
cancer_sentiment_counts = cancer_df.groupBy("sentiment_label").count()
cancer_sentiment_counts_pd = cancer_sentiment_counts.toPandas()

# Non-cancer data: same aggregation and conversion
non_cancer_sentiment_counts = non_cancer_df.groupBy("sentiment_label").count()
non_cancer_sentiment_counts_pd = non_cancer_sentiment_counts.toPandas()

# Outer-join the two count tables on the label so a label missing from one side
# is still kept; the shared `count` column becomes `count_cancer` / `count_non_cancer`.
merged_label_counts = cancer_sentiment_counts_pd.merge(
    non_cancer_sentiment_counts_pd,
    how='outer',
    on='sentiment_label',
    suffixes=('_cancer', '_non_cancer'),
)
# A label absent from one dataset has no count there; record it as 0.
sentiment_comparison_df = merged_label_counts.fillna(0)

sentiment_comparison_df

In [None]:
# Observed counts for the Chi-square test: one row per sentiment label,
# one column per dataset (cancer, non-cancer).
count_columns = ['count_cancer', 'count_non_cancer']
contingency_table = sentiment_comparison_df[count_columns].values
contingency_table

In [None]:
from scipy.stats import chi2_contingency

# Run the Chi-square test on the observed-count table and unpack its four outputs.
chi2_result = chi2_contingency(contingency_table)
chi2_stat, p_value, dof, expected = chi2_result

# Report the outputs, one line per value (the expected table starts on its own line).
report_lines = [
    f"Chi-square statistic: {chi2_stat}",
    f"P-value: {p_value}",
    f"Degrees of freedom: {dof}",
    f"Expected frequencies: \n{expected}",
]
print("\n".join(report_lines))

In [None]:
# Plotting sentiment distribution for the cancer and non-cancer data side by side.
# Two separate barplot calls on the same axes draw their bars at the same x
# positions, so one series hides the other. Reshaping to long form and passing
# `hue` lets seaborn place the two bars of each label next to each other.
comparison_long = sentiment_comparison_df.melt(
    id_vars='sentiment_label',
    value_vars=['count_cancer', 'count_non_cancer'],
    var_name='group',
    value_name='count',
)
# Replace the column names with the legend labels used for each series
comparison_long['group'] = comparison_long['group'].map({
    'count_cancer': 'Cancer Patients',
    'count_non_cancer': 'Non-Cancer Patients',
})

plt.figure(figsize=(10, 6))

# One bar per (sentiment label, group) pair; colours match the original series colours
sns.barplot(
    x='sentiment_label', y='count', hue='group', data=comparison_long,
    palette={'Cancer Patients': 'blue', 'Non-Cancer Patients': 'red'},
)

# Add labels and title
plt.xlabel('Sentiment Label')
plt.ylabel('Count')
plt.title('Sentiment Distribution: Cancer vs Non-Cancer Patients')
plt.legend()

# Show plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# `dodge=True` only separates bars that belong to different `hue` levels, so two
# independent barplot calls still draw on top of each other. Reshape the wide
# comparison table to long form and let a single call dodge the two groups.
comparison_long = sentiment_comparison_df.melt(
    id_vars='sentiment_label',
    value_vars=['count_cancer', 'count_non_cancer'],
    var_name='group',
    value_name='count',
)
# Replace the column names with the legend labels used for each series
comparison_long['group'] = comparison_long['group'].map({
    'count_cancer': 'Cancer Patients',
    'count_non_cancer': 'Non-Cancer Patients',
})

# Set figure size
plt.figure(figsize=(10, 6))

# Grouped bars: one bar per (sentiment label, group) pair, slightly transparent
sns.barplot(
    x='sentiment_label', y='count', hue='group', data=comparison_long,
    palette={'Cancer Patients': 'blue', 'Non-Cancer Patients': 'red'},
    alpha=0.7, dodge=True,
)

# Add labels and title
plt.xlabel('Sentiment Label')
plt.ylabel('Count')
plt.title('Sentiment Distribution: Cancer vs Non-Cancer Patients')

# Add the legend
plt.legend()

# Show plot
plt.show()
