# Data Wrangling

## Import and Preview Data

In [2]:
# load the tab-separated Drugs.com training reviews and preview the first ten rows
import pandas as pd

reviews = pd.read_csv('drugsComTrain_raw.tsv', delimiter='\t')

reviews.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,"November 28, 2015",43
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1.0,"March 7, 2017",5
7,102654,Aripiprazole,Bipolar Disorde,"""Abilify changed my life. There is hope. I was...",10.0,"March 14, 2015",32
8,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1.0,"August 9, 2016",11
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8.0,"December 8, 2016",1


In [3]:
# persist the freshly loaded frame to 'drugsCom_reviews' (relative to the working directory)
reviews.to_pickle(path='drugsCom_reviews')

In [4]:
# view total rows and columns: `.shape` is a (n_rows, n_columns) tuple
reviews.shape

(161297, 7)

In [5]:
# print the number of missing values in the review text and in the rating, one per line
for column_name in ('review', 'rating'):
    print(reviews[column_name].isnull().sum())

0
0


In [6]:
# check for correct data types
# A bare `reviews.dtypes` before the print was a no-op (only a cell's last
# expression is displayed), so it is dropped. `sep='\n'` puts the label on its
# own line instead of fusing it with the first row of the Series repr.
print('Data Types', reviews.dtypes, sep='\n')

Data Types Unnamed: 0       int64
drugName        object
condition       object
review          object
rating         float64
date            object
usefulCount      int64
dtype: object


In [7]:
# tally how many reviews carry each rating value, ordered by rating
rating_counts = reviews['rating'].value_counts().sort_index(ascending=True)
rating_counts

1.0     21619
2.0      6931
3.0      6513
4.0      5012
5.0      8013
6.0      6343
7.0      9456
8.0     18890
9.0     27531
10.0    50989
Name: rating, dtype: int64

In [8]:
# share of reviews per rating value, expressed in percent and ordered by rating
rating_pct = reviews['rating'].value_counts(normalize=True).sort_index().mul(100)
rating_pct

1.0     13.403225
2.0      4.297042
3.0      4.037893
4.0      3.107311
5.0      4.967854
6.0      3.932497
7.0      5.862477
8.0     11.711315
9.0     17.068513
10.0    31.611871
Name: rating, dtype: float64

## Sample Reviews

In [9]:
# first ten reviews whose rating equals 1
rated_1 = reviews.loc[reviews['rating'].eq(1)]
rated_1.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1.0,"March 7, 2017",5
8,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1.0,"August 9, 2016",11
24,213649,Tioconazole,Vaginal Yeast Infection,"""Do not use the cream that comes with this. It...",1.0,"April 17, 2017",7
33,214453,Tioconazole,Vaginal Yeast Infection,"""The burning is out of control about 20 minute...",1.0,"September 11, 2015",2
36,125343,Dulcolax,Constipation,"""SO MUCH PAIN! \r\nIn the last 2 years I have ...",1.0,"February 13, 2016",10
51,135645,Intuniv,ADHD,"""Intuniv did not work for my son; he was bounc...",1.0,"July 21, 2011",23
53,96906,Qvar,"Asthma, Maintenance","""I got heart palpitations, really bad - like a...",1.0,"November 15, 2015",16
54,215018,Opdivo,Non-Small Cell Lung Cance,"""My mother died from lung cancer. Her last hop...",1.0,"March 2, 2017",6
56,60455,Pyridium,Dysuria,"""I&#039;ve been having UTIs for 7 years, my mo...",1.0,"October 13, 2016",8
63,107449,Implanon,Birth Control,"""I have been on this for 8 months and sad to s...",1.0,"September 1, 2011",2


In [10]:
# first ten reviews whose rating equals 2
rated_2 = reviews.loc[reviews['rating'].eq(2)]
rated_2.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,"November 28, 2015",43
38,60678,MoviPrep,Bowel Preparation,"""I have taken this at least 5-6 times for the ...",2.0,"June 29, 2017",0
59,106703,Implanon,Birth Control,"""Never again! After being on depo I was suppos...",2.0,"August 20, 2015",1
68,202903,Gildess Fe 1 / 20,Birth Control,"""it caused me to gain 30 pounds""",2.0,"September 21, 2016",2
77,9116,Lo Loestrin Fe,Birth Control,"""I am always bleeding between periods, I&rsquo...",2.0,"October 26, 2017",5
95,45237,Fluoxetine,Major Depressive Disorde,"""I started Prozac as one of my first anti depr...",2.0,"January 12, 2016",18
139,130867,Levonorgestrel,Birth Control,"""I&#039;m 27 w/ two kids. After my second chil...",2.0,"June 26, 2011",2
174,218736,Opana ER,Pain,"""This medicine did nothing at all for my pain....",2.0,"August 1, 2009",12
189,118552,Amoxicillin / clarithromycin / lansoprazole,Helicobacter Pylori Infection,"""I had severe vomiting and diarhoea for 3 days...",2.0,"February 18, 2017",4
205,197894,Metoprolol,High Blood Pressure,"""I have been on metoprolol er 12.5mg since Sep...",2.0,"November 3, 2016",19


In [11]:
# first ten reviews whose rating equals 3
rated_3 = reviews.loc[reviews['rating'].eq(3)]
rated_3.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
14,98494,Nexplanon,Birth Control,"""Started Nexplanon 2 months ago because I have...",3.0,"August 7, 2014",10
61,192806,Drospirenone / ethinyl estradiol,Birth Control,"""I was put on Yasmin for 6 months to regulate ...",3.0,"April 11, 2011",8
90,43085,Sronyx,Birth Control,"""This medicine is absolutely terrible. After t...",3.0,"April 28, 2016",3
105,220696,Loestrin 24 Fe,2</span> users found this comment helpful.,"""I&#039;m 16 and I have been on Loestrin 24 f...",3.0,"November 3, 2010",2
123,144777,Etonogestrel,Birth Control,"""I was put on this birth control when I was 15...",3.0,"January 27, 2016",0
137,63453,Epiduo,Acne,"""My experience with this product is a lot diff...",3.0,"December 12, 2015",1
144,165523,Levonorgestrel,Birth Control,"""Had the Liletta inserted in November. The pro...",3.0,"May 24, 2017",4
186,44150,Ethinyl estradiol / norgestimate,Birth Control,"""I don&#039;t think I noticed this at first or...",3.0,"December 3, 2011",2
188,87410,Naproxen,Period Pain,"""I take all types of pain meds but Naproxen do...",3.0,"September 3, 2017",1
192,54286,Naloxegol,"Constipation, Drug Induced","""I am on opoids for chronic back pain. Used Mi...",3.0,"September 24, 2015",44


In [12]:
# first ten reviews whose rating equals 4
rated_4 = reviews.loc[reviews['rating'].eq(4)]
rated_4.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
12,191290,Pentasa,Crohn's Disease,"""I had Crohn&#039;s with a resection 30 years ...",4.0,"July 6, 2013",8
13,221320,Dextromethorphan,Cough,"""Have a little bit of a lingering cough from a...",4.0,"September 7, 2017",1
44,121333,Venlafaxine,Depression,"""my gp started me on Venlafaxine yesterday to ...",4.0,"April 27, 2016",3
48,146502,Tamsulosin,Overactive Bladde,"""24 Year Old, Male, UK ,Normally I would go ev...",4.0,"January 3, 2017",10
81,171349,Wellbutrin,Depression,"""Started taking it and I slept well at night a...",4.0,"January 13, 2017",15
96,102810,Aripiprazole,Depression,"""Intake Effexor XR 375 mg, and lorazepam for d...",4.0,"August 17, 2012",33
153,225257,Bupropion,Depression,"""I was really glad that I experienced none of ...",4.0,"October 8, 2015",0
155,98258,Nexplanon,Birth Control,"""I&#039;ve had this implant for 7 months now, ...",4.0,"June 4, 2015",4
199,109272,Nexplanon,Birth Control,"""Brief review due to character limit:\r\nInser...",4.0,"February 10, 2017",3
217,97654,Methimazole,Hyperthyroidism,"""My 6yr old daughter and I have been dealing w...",4.0,"July 11, 2016",3


In [13]:
# first ten reviews whose rating equals 5
rated_5 = reviews.loc[reviews['rating'].eq(5)]
rated_5.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
43,12056,Metaxalone,Muscle Spasm,"""I have been taking this medicine due to lower...",5.0,"June 4, 2014",55
64,60156,NuvaRing,Birth Control,"""I was off birth control for a while consideri...",5.0,"April 18, 2013",9
97,60280,NuvaRing,Birth Control,"""I am torn by the Nuvaring. The convenience is...",5.0,"October 31, 2011",0
147,59033,Morphine,Chronic Pain,"""I have been a long term sufferer of chronic p...",5.0,"June 8, 2016",54
152,143971,Etonogestrel,Birth Control,"""The first 3 years I was on nexplanon I had no...",5.0,"August 8, 2016",6
154,84639,Ethinyl estradiol / norgestimate,Birth Control,"""I wrote my expirence with this pill before, b...",5.0,"August 31, 2016",4
158,146513,Tamsulosin,Benign Prostatic Hyperplasia,"""Been taking this medication for 6 years. I h...",5.0,"September 16, 2016",12
169,94168,Trintellix,Depression,"""This drug causes persistent nausea in some us...",5.0,"November 3, 2015",28
182,82188,Liraglutide,"Diabetes, Type 2","""have only been on victorza for a few days. I...",5.0,"January 28, 2016",18


In [14]:
# first ten reviews whose rating equals 6
rated_6 = reviews.loc[reviews['rating'].eq(6)]
rated_6.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
28,132258,Ativan,Panic Disorde,"""Honestly, I have been taking ativan for 2 yea...",6.0,"June 1, 2015",47
60,131704,Effexor XR,Anxiety,"""Was on this med for 5 years. Worked fine but ...",6.0,"December 27, 2016",23
75,109866,Nexplanon,Birth Control,"""I&#039;ve had mine for over a year and notice...",6.0,"June 19, 2016",1
131,118141,Naltrexone,Opiate Dependence,"""I&#039;m Planning to be put in 17 Days in re...",6.0,"July 16, 2017",14
133,198581,Depo-Provera,Birth Control,"""Decreased my sex drive and physically in a lo...",6.0,"November 13, 2017",1
136,36697,Fentanyl,Breakthrough Pain,"""I was essentially mis-prescribed Actiq when I...",6.0,"April 25, 2010",17
170,209696,Lupron Depot,Endometriosis,"""I&#039;ve only had one dose of Lupron; before...",6.0,"April 14, 2016",8
179,122860,Linaclotide,"Constipation, Chronic","""I been on it for two months. Started with the...",6.0,"January 25, 2014",49
183,37843,Reclipsen,Birth Control,"""Been on Reclipsen for a few months now. Pros-...",6.0,"May 24, 2016",3
194,124114,Skyla,Birth Control,"""Got skyla inserted about a month ago a few da...",6.0,"May 25, 2016",1


In [15]:
# first ten reviews whose rating equals 7
rated_7 = reviews.loc[reviews['rating'].eq(7)]
rated_7.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
25,51215,Azithromycin,Chlamydia Infection,"""Was prescribed one dose over the course of on...",7.0,"December 14, 2015",7
49,153093,Doxycycline,Urinary Tract Infection,"""I battled a nasty UTI for over a month &amp; ...",7.0,"July 6, 2016",44
80,217014,Oxcarbazepine,Trigeminal Neuralgia,"""66 YO caucasian, male. Developed MS @22-23, d...",7.0,"July 18, 2016",15
82,60050,NuvaRing,Birth Control,"""I have been on pills for years, and in 2012 I...",7.0,"April 3, 2014",9
109,137538,Isotretinoin,Acne,"""I just started this medication on April 1st a...",7.0,"April 6, 2009",10
177,229692,Lurasidone,Bipolar Disorde,"""I&#039;ve been taking Latuda 40mg once a day ...",7.0,"June 27, 2013",36
203,140891,Escitalopram,Generalized Anxiety Disorde,"""The time I been on lexapro I had mild side ef...",7.0,"April 27, 2016",13
210,209413,Zomig,Migraine,"""Zomig gets rid of my migraine but it knocks m...",7.0,"March 26, 2014",21
218,109444,Nexplanon,Birth Control,"""I have had my Nexplanon implant in since earl...",7.0,"December 17, 2016",6
228,232066,Trazodone,Depression,"""It has worked so far for me and I would recom...",7.0,"July 25, 2008",99


In [16]:
# first ten reviews whose rating equals 8
rated_8 = reviews.loc[reviews['rating'].eq(8)]
rated_8.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8.0,"December 8, 2016",1
20,12372,Atripla,HIV Infection,"""Spring of 2008 I was hospitalized with pnuemo...",8.0,"July 9, 2010",11
29,27339,Imitrex,Migraine,"""At first I suffered through them. This includ...",8.0,"October 16, 2012",6
31,96233,Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8.0,"May 7, 2011",3
34,71188,Viberzi,Irritable Bowel Syndrome,"""Have been taking Viberzi for a month now for ...",8.0,"July 5, 2016",15
37,93678,Morphine,Pain,"""I have been on morphine for at least 7 years....",8.0,"May 8, 2010",19
40,221934,Fluconazole,Vaginal Yeast Infection,"""I am very prone to yeast infections, I believ...",8.0,"April 24, 2017",9
41,39795,Contrave,Obesity,"""I am just finishing my second week taking Con...",8.0,"November 4, 2015",9


In [17]:
# first ten reviews whose rating equals 9
rated_9 = reviews.loc[reviews['rating'].eq(9)]
rated_9.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37
10,29607,Topiramate,Migraine Prevention,"""I have been on this medication almost two wee...",9.0,"January 1, 2015",19
15,81890,Liraglutide,Obesity,"""I have been taking Saxenda since July 2016. ...",9.0,"January 19, 2017",20
16,48188,Trimethoprim,Urinary Tract Infection,"""This drug worked very well for me and cleared...",9.0,"September 22, 2017",0
17,219869,Amitriptyline,ibromyalgia,"""I&#039;ve been taking amitriptyline since Jan...",9.0,"March 15, 2017",39
22,227020,Etonogestrel,Birth Control,"""Nexplanon does its job. I can have worry free...",9.0,"August 11, 2014",11
39,206444,Trilafon,Psychosis,"""I had a similar experience. Tremors in hands...",9.0,"May 23, 2011",45
42,173398,Clonazepam,Panic Disorde,"""This medication changed my life. My panic at...",9.0,"April 1, 2008",30
46,111474,Ledipasvir / sofosbuvir,Hepatitis C,"""Side effects are light- fatigue and a bit of ...",9.0,"December 31, 2014",94


In [18]:
# first ten reviews whose rating equals 10
rated_10 = reviews.loc[reviews['rating'].eq(10)]
rated_10.head(n=10)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
7,102654,Aripiprazole,Bipolar Disorde,"""Abilify changed my life. There is hope. I was...",10.0,"March 14, 2015",32
11,75612,L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10.0,"March 9, 2017",54
18,212077,Lamotrigine,Bipolar Disorde,"""I&#039;ve been on every medicine under the su...",10.0,"November 9, 2014",18
19,119705,Nilotinib,Chronic Myelogenous Leukemia,"""I have been on Tasigna for just over 3 years ...",10.0,"September 1, 2015",11
21,231466,Trazodone,Insomnia,"""I have insomnia, it&#039;s horrible. My story...",10.0,"April 3, 2016",43
23,41928,Etanercept,Rheumatoid Arthritis,"""I live in Western Australia and disturbed by ...",10.0,"September 16, 2017",4
26,206180,Eflornithine,Hirsutism,"""I&#039;m writing a second review on Vaniqa. ...",10.0,"May 11, 2014",99
27,78563,Daytrana,ADHD,"""Hi all, My son who is 12 was diagnosed when h...",10.0,"January 12, 2017",11
30,51452,Azithromycin,,"""Very good response. It is so useful for me. """,10.0,"August 18, 2010",1
32,204999,Toradol,Pain,"""I am 30 years old. I had a multiple composite...",10.0,"February 11, 2013",16


## Clean Text

In [19]:
# clean text
import string
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer

def clean_text(text, stop_words=None):
    """Normalise one raw review into a space-separated string of cleaned tokens.

    Steps, in order: decode HTML entities, lower-case, split on any run of
    whitespace, strip leading/trailing punctuation from every token, then keep
    only tokens that are at least two characters long, contain no digit and
    are not stop words.

    Parameters
    ----------
    text : str
        Raw review text. ``None`` and float NaN (pandas' missing value) are
        treated as an empty review.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is loaded once and cached on the function object.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    import html

    # Missing values reach .map() as None/NaN; treat them as empty reviews.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stop_words is None:
        # Loading the corpus list on every call (and scanning it as a list)
        # is wasteful across many reviews: build a frozenset once and reuse it.
        cached = getattr(clean_text, '_english_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            clean_text._english_stop_words = cached
        stop_words = cached

    # Decode entities such as &#039; and &quot; before tokenising. Left encoded,
    # the digits in "&#039;" made the digit filter drop whole words
    # ("Crohn&#039;s"), and the "quot" residue could never be matched once the
    # surrounding punctuation had been stripped.
    text = html.unescape(text).lower()

    # split() with no argument splits on any whitespace, so "\r\n" no longer
    # stays glued to the neighbouring word as it did with split(' ').
    stripped = (word.strip(string.punctuation) for word in text.split())

    kept = [
        token for token in stripped
        if len(token) > 1
        and not any(ch.isdigit() for ch in token)
        and token not in stop_words
    ]
    return ' '.join(kept)

# Build the cleaned frame as a new object. `reviews` keeps the raw text, so
# re-running earlier cells still shows the original reviews, and
# `reviews_clean` is a separate frame rather than an alias of `reviews`.
reviews_clean = reviews.assign(review=reviews['review'].map(clean_text))
reviews_clean.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,side effect take combination bystolic mg fish oil,9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,son halfway fourth week intuniv became concern...,8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,used take another oral contraceptive pill cycl...,5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,first time using form birth control glad went ...,8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,suboxone completely turned life around feel he...,9.0,"November 27, 2016",37


In [20]:
# persist the cleaned frame to 'drugsCom_reviews_clean' (relative to the working directory)
reviews_clean.to_pickle(path='drugsCom_reviews_clean')

## Convert to Document-Term Matrix

In [21]:
# make document-term matrix with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

text = reviews_clean['review']

# unigrams only; min_df is an absolute document count (int) and max_df a
# proportion of documents (float), so terms must appear in at least 30 reviews
# and in no more than 20% of them
cv = CountVectorizer(ngram_range=(1, 1),
                     min_df=30, max_df=0.20)
reviews_cv = cv.fit_transform(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# NOTE(review): toarray() materialises the whole matrix densely; kept because
# later cells read `reviews_dtm` as a regular DataFrame
reviews_dtm = pd.DataFrame(reviews_cv.toarray(), columns=feature_names)
reviews_dtm.index = text.index
reviews_dtm.head()

Unnamed: 0,aa,abate,abated,abdomen,abdominal,abilify,abilities,ability,ablation,able,...,zonegran,zopiclone,zovia,zovirax,zubsolv,zumba,zyban,zyclara,zyprexa,zyrtec
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Save Matrix

In [22]:
# pickle and save column headers
import pickle

reviews_cols = reviews_dtm.columns

# the context manager closes the file even if pickle.dump raises
with open('list.pickle', 'wb') as pickle_out:
    pickle.dump(reviews_cols, pickle_out)

In [23]:
# save sparse matrix as npz
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz

# Build the CSR matrix from the vectorizer's sparse output rather than from the
# dense `reviews_dtm` frame: `reviews_dtm` was created from
# `reviews_cv.toarray()`, so the counts and row order are the same, and this
# avoids re-scanning every dense cell to rediscover the non-zero entries.
reviews_csr = csr_matrix(reviews_cv)

save_npz('reviews_csr.npz', reviews_csr)