In [1]:
%load_ext lab_black

Step: Load data

In [2]:
import pandas as pd
import os
import numpy as np
import nltk
from pyprojroot import here

from skimpy import clean_columns
from data_cleaning.fun_hot_encode_limit import fun_hot_encode_limit

# Resolve the project's data folder and switch into it, so that the relative
# file name below (and any later relative path) resolves against that folder.
path_data = here("./data")
os.chdir(path_data)

# Read the raw CSV and pass it straight through skimpy's clean_columns.
data_ra = clean_columns(pd.read_csv("ra_data.csv"))

# Fetch the VADER lexicon needed for the sentiment scoring further down.
nltk.download("vader_lexicon")

# Convert the "date" column to pandas datetimes.
data_ra = data_ra.assign(date=pd.to_datetime(data_ra["date"]))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  data_ra["date"] = pd.to_datetime(data_ra["date"])


In [3]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Score every review with VADER. Each entry of "sentiment_scores" is the
# dictionary returned by polarity_scores, with the keys 'compound', 'neg',
# 'neu' and 'pos'.
data_ra["sentiment_scores"] = data_ra["reviews"].apply(sia.polarity_scores)

# Keep the 'compound' value in its own column as the overall sentiment of a
# review: it is the sum of the valence scores of each word in the lexicon,
# adjusted according to the rules and normalized to lie between -1 (most
# extreme negative) and +1 (most extreme positive).
data_ra["compound_score"] = data_ra["sentiment_scores"].map(
    lambda scores: scores["compound"]
)

In [4]:
# Continue on a separate copy of the frame so the columns added in the
# following cells do not end up on data_ra.
data_ra_sentiment = data_ra.copy(deep=True)

In [38]:
# Print the lowest compound score, then display the summary statistics of the
# whole column as the cell output.
compound_scores = data_ra_sentiment["compound_score"]
print(compound_scores.min())
compound_scores.describe()

-0.9803


count    185.000000
mean      -0.013512
std        0.609303
min       -0.980300
25%       -0.539900
50%        0.000000
75%        0.542300
max        0.991500
Name: compound_score, dtype: float64

In [37]:
from scipy import stats

# One-sample t-test of the compound sentiment scores in data_ra_sentiment
# against a population mean of 0.
ttest_result = stats.ttest_1samp(data_ra_sentiment["compound_score"], popmean=0)
t_statistic, p_value = ttest_result.statistic, ttest_result.pvalue

# Report both values rounded to two decimals.
print("t statistic:", t_statistic.round(2))
print("p value:", p_value.round(2))

t statistic: -0.3
p value: 0.76


In [55]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources this cell needs: the tokenizer models, the
# stopword corpus and the WordNet data used by the lemmatizer.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# preprocess_text reads the notebook-level name stop_words. It used to be
# defined only in a later cell, so this cell raised a NameError on a fresh
# kernel (Restart & Run All). Define the English stopword set here.
stop_words = set(stopwords.words("english"))

lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Lowercase and tokenize a text, drop stopwords and lemmatize the rest.

    Uses the notebook-level ``word_tokenize``, ``stop_words`` and
    ``lemmatizer`` objects, looked up when the function is called.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lemmatized tokens that are not in ``stop_words``, joined by
        single spaces.
    """
    kept_lemmas = []
    for token in word_tokenize(text.lower()):
        # Stopwords are filtered on the raw (un-lemmatized) token.
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)


# Clean every review with preprocess_text and store the result as a new
# column. The values are assigned as a plain array, i.e. by position.
cleaned_reviews = data_ra_sentiment["reviews"].map(preprocess_text)
data_ra_sentiment["reviews_clean"] = cleaned_reviews.values

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Preview only the first rows instead of rendering the entire frame.
data_ra_sentiment.head()

In [42]:
from transformers import pipeline

# Initialize the zero-shot classification pipeline.
# The checkpoint is named explicitly: it is the model the pipeline reported
# falling back to when none was supplied. Pinning it keeps the scores from
# changing if the library default changes, and avoids the
# "No model was supplied" warning.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define the candidate labels
candidate_labels = ["pain management"]


def classify_review(review):
    """Score one review against the notebook-level ``candidate_labels``.

    Parameters
    ----------
    review : str
        Review text handed to the notebook-level ``classifier``.

    Returns
    -------
    dict
        Maps each entry of the classifier result's ``"labels"`` to the entry
        of ``"scores"`` at the same position.
    """
    prediction = classifier(review, candidate_labels)
    return {
        name: score
        for name, score in zip(prediction["labels"], prediction["scores"])
    }


# Run classify_review over every review; each cell of "emotion_scores" then
# holds the {label: score} dictionary it returned for that review.
review_scores = data_ra_sentiment["reviews"].map(classify_review)
data_ra_sentiment["emotion_scores"] = review_scores

# Spread the dictionaries out into one column per candidate label
# (None where a label is missing from a dictionary).
for label in candidate_labels:
    data_ra_sentiment[label] = data_ra_sentiment["emotion_scores"].map(
        lambda label_scores: label_scores.get(label)
    )

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [50]:
# Run clean_columns again; presumably this is what turns the label column
# added by the classifier cell ("pain management") into the pain_management
# name queried below -- verify against skimpy's naming rules.
data_ra_sentiment = clean_columns(data_ra_sentiment)

# Number of reviews scoring at least 0.75 for the label, and the total.
len_pain = len(data_ra_sentiment.query("pain_management >= .75"))
len_all = len(data_ra_sentiment)

# Despite its name this is a fraction (len_pain / len_all), rounded to two
# decimals, not a value on a 0-100 scale.
percent_pain = round(len_pain / len_all, 2)

# One-row summary table, shown as the cell output.
data_pain = pd.DataFrame(
    [{"len_pain": len_pain, "len_all": len_all, "percent_pain": percent_pain}],
    index=[0],
)

data_pain

Unnamed: 0,len_pain,len_all,percent_pain
0,92,185,0.5


In [57]:
# Keep only the reviews scoring at least 0.75 in the pain_management column,
# renumbered from 0.
is_pain_review = data_ra_sentiment["pain_management"] >= 0.75
data_ra_pain = data_ra_sentiment.loc[is_pain_review].reset_index(drop=True)

In [65]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bertopic import BERTopic

# Download the necessary NLTK data
for resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(resource)

# English stopwords plus the custom words that should also be removed.
custom_stop_words = ["humira", "syringe", "kit", "pen", "injector"]
stop_words = set(stopwords.words("english")).union(custom_stop_words)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()


# NOTE(review): this re-defines the preprocess_text from an earlier cell with
# the same steps; it picks up whatever stop_words and lemmatizer are bound to
# at call time.
def preprocess_text(text):
    """Return ``text`` lowercased, tokenized, without stopwords and lemmatized.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        Space-joined lemmas of the tokens that are not in the notebook-level
        ``stop_words``.
    """
    tokens = word_tokenize(text.lower())
    lemmas = (lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words)
    return " ".join(lemmas)


# Run the already-cleaned pain-management reviews through preprocess_text
# once more (stop_words now also holds the custom words) and keep the
# underlying array of strings.
reviews = data_ra_pain["reviews_clean"].map(preprocess_text).values

# Build the BERTopic model and fit it on the reviews. Only the first return
# value of fit_transform is kept under a meaningful name.
# NOTE(review): no random seed is set here, so the topics may differ between
# runs -- confirm whether that matters for this analysis.
topic_model = BERTopic(language="english")
fit_output = topic_model.fit_transform(reviews)
topics, _ = fit_output

# Overview table of the fitted topics.
topic_info = topic_model.get_topic_info()

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [67]:
# Save the topic overview to disk. to_csv needs a real target path: the empty
# string that was passed here is not a valid file name, so nothing could be
# written. The relative name resolves against the current working directory.
topic_info.to_csv("topic_info.csv")

0    [since january 2009 taking without mtx taking ...
1    [diagnosed 4 year ago started methotrexate emb...
2    ['ve using 3 year . made remarkable improvemen...
Name: Representative_Docs, dtype: object