In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
from nltk.corpus import stopwords
string.punctuation
from datetime import timedelta
import numpy as np
import nltk
from wordcloud import WordCloud
nltk.download('vader_lexicon')


# Download the stopwords resource
nltk.download('stopwords')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\zalian2\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zalian2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
dir_ = "C:/Users/zalian2/OneDrive - University of Illinois - Urbana/UIUC/Spring 2024/ACE 592 SAE/ACE_592_Project/Data/"


In [3]:
## Reading in the raw data
df = pd.read_csv(dir_+"raw_drug_reviews.csv")
# df = pd.read_csv(dir_+"drug_reviews_cleaned.csv")


In [4]:
## Some rows had strange characters. Creating this function to clean the text
import re
def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,!?\'":;()]', '', text)

    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    cleaned_text = cleaned_text.strip()
    
    return cleaned_text

In [5]:
df['cleaned_review'] = df['Text'].apply(clean_text)

In [6]:
df.to_csv(dir_ + "updated_drug_reviews_cleaned.csv", index=False)

In [7]:
## I want to 'trim' the the text columns. For example if the value for the drug column is 'Tirzepatide ', I want it to be 'Tirzepatide'
df[['Text', 'Drug', 'cleaned_review']] = df[['Text', 'Drug', 'cleaned_review']].apply(lambda x: x.str.strip())
df = df.rename(columns={'cleaned_review': 'Comment'})
# Make the text lowercase
df['lower_text'] =  df['Comment'].str.lower()
remv_punc = str.maketrans('','',string.punctuation + '“' +"‘"+'”')
# Remove the punctuation
df['lower_text_NoPunct'] = df['lower_text'].str.translate(remv_punc)

# Download the stopwords, but add another list with "amp", "", and white space " "
sw_list = stopwords.words('english') + ['amp',''," "]
# split on whitespace to get separate words
df['words'] = [x.split(" ") for x in df['lower_text_NoPunct']]
# Take out stopwords
# Convert the text to a set, subtract the set of stopwords, turn into list
df['words'] = [list(set(x) - set(sw_list)) for x in df['words']]
# Creating an instance of the CountVectorizer with a stop word list.
vct = CountVectorizer(stop_words=sw_list)
# Gives me an output of word counts 
X = vct.fit_transform(list(df['lower_text_NoPunct']))
# make X a dataframe
word_counts = pd.DataFrame(X.sum(axis=0))
# Assign to columns a list of the feature names from .get_feature_names_out()
word_counts.columns = vct.get_feature_names_out()
# Transpose, so that word labels are rows instead of columns
word_counts = word_counts.T
words =  word_counts[0].index
# word_counts.reset_index(inplace=True)
# word_counts.rename(columns = {'index' : 'Words', 0 : 'Count'}, inplace=True)
# word_counts = word_counts.pivot_table(index=None, columns='Words', values='Count', aggfunc='first', fill_value=0)


In [8]:
## word pairs
## Creating another column that produces a list of words for each review 
df['words_orig'] = [x.split(" ") for x in df['lower_text_NoPunct']]
## The function below creates a list of all consecutive words in a string. I found it on Stackoverflow
from itertools import islice

## k is the number of consecutive elements
def consecutive_k_elements_join(lst, k):
    return [' '.join(x) for x in zip(*(islice(lst, i, None) for i in range(k)))]
## Applying the function to all rows
df['words_pairs'] = df.apply(lambda x: consecutive_k_elements_join(x['words_orig'],2), axis=1)
## Then, I want to add the word-pairs as columns in our 'data' dataset
## Adding an argument to the CountVectorizer function to get count of pairwise words
vct = CountVectorizer(stop_words=sw_list, ngram_range=(2, 2))
# Gives me an output of word counts 
X2 = vct.fit_transform(df['lower_text_NoPunct'])
# make X a dataframe
wordpairs_counts = pd.DataFrame(X2.sum(axis=0))
# Assign to columns a list of the feature names from .get_feature_names_out()
wordpairs_counts.columns = vct.get_feature_names_out()
# Transpose, so that word labels are rows instead of columns
wordpairs_counts = wordpairs_counts.T
wordpairs =  wordpairs_counts[0].index

words = words.to_list() + wordpairs.to_list() 

data = pd.DataFrame(X.toarray(), \
                 columns=word_counts.index)

## Concatenating the columns of the word pairs
data_pair=pd.concat([data, pd.DataFrame(X2.toarray(),columns=wordpairs_counts.index)], axis=1)
# merge pair words with words
data = pd.concat([data,data_pair],axis=1)

# data['Rating'] = df['Rating'].to_list()
# data['Drug'] = df['Drug'].to_list()


In [9]:
data

Unnamed: 0,01032020,011723,02,025,025mg,03,05,050,05mg,06,...,zofran stomach,zofran told,zofran took,zofran tuesday,zombie described,zombie hours,zombie looked,zombie worst,zone 1st,zone glucometer
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
## Creating a column for sentiment scores in the original dataset
sid = SentimentIntensityAnalyzer()
df['sentiment'] = [sid.polarity_scores(x)['compound'] for x in df.Comment]


In [11]:
df

Unnamed: 0,Text,Rating,Drug,Comment,lower_text,lower_text_NoPunct,words,words_orig,words_pairs,sentiment
0,I started ozempic 9 months ago. At the time I ...,10.0,Ozempic,I started ozempic 9 months ago. At the time I ...,i started ozempic 9 months ago. at the time i ...,i started ozempic 9 months ago at the time i w...,"[havent, started, 56, suddenly, pleased, ozemp...","[i, started, ozempic, 9, months, ago, at, the,...","[i started, started ozempic, ozempic 9, 9 mont...",0.9453
1,I did one injection and have been ill since.\n...,1.0,Ozempic,I did one injection and have been ill since. H...,i did one injection and have been ill since. h...,i did one injection and have been ill since ha...,"[eat, vomiting, since, dizziness, meals, 5, wo...","[i, did, one, injection, and, have, been, ill,...","[i did, did one, one injection, injection and,...",-0.7227
2,I’ve been Ozempic for 4 weeks. I am so glad t...,10.0,Ozempic,Ive been Ozempic for 4 weeks. I am so glad tha...,ive been ozempic for 4 weeks. i am so glad tha...,ive been ozempic for 4 weeks i am so glad that...,"[told, eat, body, 18, weight, sick, smaller, p...","[ive, been, ozempic, for, 4, weeks, i, am, so,...","[ive been, been ozempic, ozempic for, for 4, 4...",0.7257
3,"Started on Ozempic Sept 2020, starting weight ...",10.0,Ozempic,"Started on Ozempic Sept 2020, starting weight ...","started on ozempic sept 2020, starting weight ...",started on ozempic sept 2020 starting weight 3...,"[eat, although, hba1c, could, cramps, started,...","[started, on, ozempic, sept, 2020, starting, w...","[started on, on ozempic, ozempic sept, sept 20...",-0.9359
4,I took one Ozempic injection at .25 and ended ...,1.0,Ozempic,I took one Ozempic injection at .25 and ended ...,i took one ozempic injection at .25 and ended ...,i took one ozempic injection at 25 and ended u...,"[unstoppable, soon, please, vomiting, 2, dont,...","[i, took, one, ozempic, injection, at, 25, and...","[i took, took one, one ozempic, ozempic inject...",0.6931
...,...,...,...,...,...,...,...,...,...,...
745,"I have been on Jardiance for 2 years, as my do...",1.0,Jardiance,"I have been on Jardiance for 2 years, as my do...","i have been on jardiance for 2 years, as my do...",i have been on jardiance for 2 years as my doc...,"[years, 2, dont, weight, 63, 8, 58, done, past...","[i, have, been, on, jardiance, for, 2, years, ...","[i have, have been, been on, on jardiance, jar...",0.7535
746,I get it everyone is different but I will shar...,1.0,Jardiance,I get it everyone is different but I will shar...,i get it everyone is different but i will shar...,i get it everyone is different but i will shar...,"[gave, body, pre, minute, causing, blood, suga...","[i, get, it, everyone, is, different, but, i, ...","[i get, get it, it everyone, everyone is, is d...",0.2475
747,Disgusting experience. Urinating one and a hal...,3.0,Jardiance,Disgusting experience. Urinating one and a hal...,disgusting experience. urinating one and a hal...,disgusting experience urinating one and a half...,"[2, dont, ear, half, sore, drug, eye, pain, he...","[disgusting, experience, urinating, one, and, ...","[disgusting experience, experience urinating, ...",-0.8558
748,I've been on Jardiance in combination with met...,,Jardiance,I've been on Jardiance in combination with met...,i've been on jardiance in combination with met...,ive been on jardiance in combination with metf...,"[years, besides, 63, lost, infections, also, i...","[ive, been, on, jardiance, in, combination, wi...","[ive been, been on, on jardiance, jardiance in...",-0.4394
