In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
drug_reviews_drugs_com = fetch_ucirepo(id=462) 
  
# data (as pandas dataframes) 
X = drug_reviews_drugs_com.data.features 
y = drug_reviews_drugs_com.data.targets 
  
# metadata 
print(drug_reviews_drugs_com.metadata) 
  
# variable information 
print(drug_reviews_drugs_com.variables) 


{'uci_id': 462, 'name': 'Drug Reviews (Drugs.com)', 'repository_url': 'https://archive.ics.uci.edu/dataset/462/drug+review+dataset+drugs+com', 'data_url': 'https://archive.ics.uci.edu/static/public/462/data.csv', 'abstract': 'The dataset provides patient reviews on specific drugs along with related conditions and a 10 star patient rating reflecting overall patient satisfaction.', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression', 'Clustering'], 'characteristics': ['Multivariate', 'Text'], 'num_instances': 215063, 'num_features': 6, 'feature_types': ['Integer'], 'demographics': [], 'target_col': None, 'index_col': ['id'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2018, 'last_updated': 'Wed Apr 03 2024', 'dataset_doi': '10.24432/C5SK5S', 'creators': ['Surya Kallumadi', 'Felix Grer'], 'intro_paper': {'title': 'Aspect-Based Sentiment Analysis of Drug Reviews Applying Cross-Domain and Cross-Data Learning', 'authors': 'F. Grä

In [23]:
import nltk
import numpy as np
import re

In [22]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
X.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [12]:
review = X['review']

In [13]:
review = review.to_numpy()

In [14]:
print(review)

['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"'
 '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \nWe have tried many different medications and so far this is the most effective."'
 '"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone g

In [17]:
type(review)

numpy.ndarray

In [19]:
review = str(review)

In [21]:
review_sents = nltk.sent_tokenize(review)

In [26]:
corpus = []
for i in range(len(review_sents)):
    review_sent = re.sub("[^a-zA-Z0-9]", " ", review_sents[i])
    review_sent = review_sent.lower()
    corpus.append(review_sent)

In [27]:
corpus

['   it has no side effect  i take it in combination of bystolic 5 mg and fish oil      my son is halfway through his fourth week of intuniv ',
 'we became concerned when he began this last week  when he started taking the highest dose he will be on ',
 'for two days  he could hardly get out of bed  was very cranky  and slept for nearly 8 hours on a drive home from school vacation  very unusual for him  ',
 'i called his doctor on monday morning and she said to stick it out a few days ',
 'see how he did at school  and with getting up in the morning ',
 'the last two days have been problem free ',
 'he is much more agreeable than ever ',
 'he is less emotional  a good thing   less cranky ',
 'he is remembering all the things he should ',
 'overall his behavior is better ',
 ' nwe have tried many different medications and so far this is the most effective   ',
 '  i used to take another oral contraceptive  which had 21 pill cycle  and was very happy  very light periods  max 5 days  no o

In [29]:
stemmer = PorterStemmer()
for i in corpus:
    words = nltk.word_tokenize(i)
    for word in words:
        if word not in set(stopwords.words('english')):
            print(stemmer.stem(word))

side
effect
take
combin
bystol
5
mg
fish
oil
son
halfway
fourth
week
intuniv
becam
concern
began
last
week
start
take
highest
dose
two
day
could
hardli
get
bed
cranki
slept
nearli
8
hour
drive
home
school
vacat
unusu
call
doctor
monday
morn
said
stick
day
see
school
get
morn
last
two
day
problem
free
much
agreeabl
ever
less
emot
good
thing
less
cranki
rememb
thing
overal
behavior
better
nwe
tri
mani
differ
medic
far
effect
use
take
anoth
oral
contracept
21
pill
cycl
happi
light
period
max
5
day
side
effect
contain
hormon
gestoden
avail
us
switch
lybrel
ingredi
similar
pill
end
start
lybrel
immedi
first
day
period
instruct
said
period
last
two
week
take
second
pack
two
week
third
pack
thing
got
even
wors
third
period
last
two
week
039
end
third
week
still
daili
brown
discharg
nthe
posit
side
039
side
effect
idea
period
free
tempt
ala
039
marri
34
year
old
kid
take
pill
hassl
decid
get
mirena
pain
insert
cramp
rest
day
first
6
week
spot
period
stop
still
got
cramp
everi
month
never
need


In [30]:
lemmatizer = WordNetLemmatizer()
for i in corpus:
    words = nltk.word_tokenize(i)
    for word in words:
        if word not in (stopwords.words('english')):
            print(lemmatizer.lemmatize(word))

side
effect
take
combination
bystolic
5
mg
fish
oil
son
halfway
fourth
week
intuniv
became
concerned
began
last
week
started
taking
highest
dose
two
day
could
hardly
get
bed
cranky
slept
nearly
8
hour
drive
home
school
vacation
unusual
called
doctor
monday
morning
said
stick
day
see
school
getting
morning
last
two
day
problem
free
much
agreeable
ever
le
emotional
good
thing
le
cranky
remembering
thing
overall
behavior
better
nwe
tried
many
different
medication
far
effective
used
take
another
oral
contraceptive
21
pill
cycle
happy
light
period
max
5
day
side
effect
contained
hormone
gestodene
available
u
switched
lybrel
ingredient
similar
pill
ended
started
lybrel
immediately
first
day
period
instruction
said
period
lasted
two
week
taking
second
pack
two
week
third
pack
thing
got
even
worse
third
period
lasted
two
week
039
end
third
week
still
daily
brown
discharge
nthe
positive
side
039
side
effect
idea
period
free
tempting
ala
039
married
34
year
old
kid
taking
pill
hassle
decided
get

In [32]:
# BAg of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2, 3))

In [33]:
cv.fit_transform(corpus)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 800 stored elements and shape (34, 775)>

In [34]:
cv.vocabulary_

{'it has': 323,
 'has no': 229,
 'no side': 413,
 'side effect': 547,
 'effect take': 155,
 'take it': 587,
 'it in': 325,
 'in combination': 288,
 'combination of': 104,
 'of bystolic': 433,
 'bystolic mg': 98,
 'mg and': 371,
 'and fish': 28,
 'fish oil': 182,
 'oil my': 448,
 'my son': 394,
 'son is': 561,
 'is halfway': 302,
 'halfway through': 223,
 'through his': 668,
 'his fourth': 272,
 'fourth week': 195,
 'week of': 734,
 'of intuniv': 435,
 'it has no': 324,
 'has no side': 230,
 'no side effect': 414,
 'side effect take': 548,
 'effect take it': 156,
 'take it in': 588,
 'it in combination': 326,
 'in combination of': 289,
 'combination of bystolic': 105,
 'of bystolic mg': 434,
 'bystolic mg and': 99,
 'mg and fish': 372,
 'and fish oil': 29,
 'fish oil my': 183,
 'oil my son': 449,
 'my son is': 395,
 'son is halfway': 562,
 'is halfway through': 303,
 'halfway through his': 224,
 'through his fourth': 669,
 'his fourth week': 273,
 'fourth week of': 196,
 'week of intuni

In [35]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(2, 3))

In [40]:
x = tfidf.fit_transform(corpus)

In [37]:
corpus[0]

'   it has no side effect  i take it in combination of bystolic 5 mg and fish oil      my son is halfway through his fourth week of intuniv '

In [41]:
x[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.14586499, 0.14586499,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  