# Drug Review

### Data Loading & Exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

In [2]:
# Raw training reviews (tab-separated export).
train = pd.read_csv('drugsComTrain_raw.tsv', sep='\t')

In [5]:
# Column dtypes and non-null counts; `condition` is the only column with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
Unnamed: 0     161297 non-null int64
drugName       161297 non-null object
condition      160398 non-null object
review         161297 non-null object
rating         161297 non-null float64
date           161297 non-null object
usefulCount    161297 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 8.6+ MB


In [6]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [7]:
# Class balance of the target: how many reviews carry each rating.
train['rating'].value_counts()

10.0    50989
9.0     27531
1.0     21619
8.0     18890
7.0      9456
5.0      8013
2.0      6931
3.0      6513
6.0      6343
4.0      5012
Name: rating, dtype: int64

# Data Cleaning and Pre-Processing

In [12]:
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
# Shared NLTK helpers for the cleaning step.
tok = WordPunctTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# @mentions and http(s) URLs, merged into one alternation pattern.
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = '|'.join([pat1, pat2])

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def reviewcleaner(text):
    """Turn one raw review into a cleaned, space-separated string of tokens.

    Steps: extract the text with BeautifulSoup (drops HTML markup), remove
    @mentions and URLs matched by ``combined_pat``, replace every
    non-letter character with a space, lowercase, tokenize with ``tok``
    and drop tokens contained in ``stop_words``.

    Parameters
    ----------
    text : str
        Raw review text. ``None`` or a float NaN (pandas' missing-value
        marker) is treated as an empty review.

    Returns
    -------
    str
        Lowercase tokens joined by single spaces; ``""`` if nothing remains.
    """
    # Missing values cannot be parsed by BeautifulSoup; treat them as empty.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    souped = BeautifulSoup(text, 'lxml').get_text()
    stripped = re.sub(combined_pat, '', souped)
    # No decode / U+FFFD replacement step here: `get_text()` already returns
    # str, and the substitution below turns every non-letter (including any
    # replacement character) into a space anyway.
    letters_only = re.sub("[^a-zA-Z]", " ", stripped)
    words = tok.tokenize(letters_only.lower())
    return " ".join(w for w in words if w not in stop_words).strip()

In [13]:
%%time
print ("Cleaning and parsing the tweets...\n")
cleantext = []
for i in range(len(train)):
    cleantext.append(reviewcleaner(train['review'][i]))
print ("Done")

Cleaning and parsing the tweets...

Done
Wall time: 2min 36s


In [17]:
# Pair each cleaned review with its original rating (aligned on the row index).
cleantrain = pd.DataFrame(cleantext, columns=['review']).assign(rating=train.rating)
cleantrain.head()

Unnamed: 0,review,rating
0,side effect take combination bystolic mg fish oil,9.0
1,son halfway fourth week intuniv became concern...,8.0
2,used take another oral contraceptive pill cycl...,5.0
3,first time using form birth control glad went ...,8.0
4,suboxone completely turned life around feel he...,9.0


In [18]:
# Persist the cleaned training set; the row index is written as the first
# column and is read back with index_col=0 when the file is reloaded.
cleantrain.to_csv('cleantrainreview.csv',encoding='utf-8')

In [19]:
# Raw test reviews (tab-separated export).
test = pd.read_csv('drugsComTest_raw.tsv', sep='\t')

In [20]:
# Preview the first rows of the raw test data.
test.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [21]:
%%time
print ("Cleaning and parsing the tweets...\n")
cleantext = []
for i in range(len(test)):
    cleantext.append(reviewcleaner(test['review'][i]))
print ("Done!")

Cleaning and parsing the tweets...

Done!
Wall time: 49.1 s


In [22]:
# Pair each cleaned test review with its original rating (aligned on the row index).
cleantest = pd.DataFrame(cleantext, columns=['review']).assign(rating=test.rating)
cleantest.head()

Unnamed: 0,review,rating
0,tried antidepressants years citalopram fluoxet...,10.0
1,son crohn disease done well asacol complaints ...,8.0
2,quick reduction symptoms,9.0
3,contrave combines drugs used alcohol smoking o...,9.0
4,birth control one cycle reading reviews type s...,9.0


In [23]:
# Persist the cleaned test set; the row index is written as the first
# column and is read back with index_col=0 when the file is reloaded.
cleantest.to_csv('cleantestreview.csv',encoding='utf-8')

#### Load the Cleaned Data

In [3]:
# Seed NumPy's global RNG. The train/validation split and the SGD classifiers
# further down pass their own explicit random_state values.
np.random.seed(2018)

In [2]:
from sklearn import preprocessing
# Cleaned training reviews and their ratings.
dftraining = pd.read_csv('cleantrainreview.csv', index_col=0)
reviewtraining, ratingtraining = dftraining['review'], dftraining['rating']

# Cleaned test reviews and their ratings; rows whose review is null after
# the CSV round-trip are dropped from both the inputs and the labels.
dftest = pd.read_csv('cleantestreview.csv', index_col=0)
reviewtest, ratingtest = dftest['review'], dftest['rating']
Xfortest = reviewtest[reviewtest.notna()]
Yfortest = ratingtest[reviewtest.notna()]

In [25]:
# Sanity check: the reloaded frame should match the cleaned data saved earlier.
dftraining.head()

Unnamed: 0,review,rating
0,side effect take combination bystolic mg fish oil,9.0
1,son halfway fourth week intuniv became concern...,8.0
2,used take another oral contraceptive pill cycl...,5.0
3,first time using form birth control glad went ...,8.0
4,suboxone completely turned life around feel he...,9.0


Split training data

In [3]:
from sklearn.model_selection import train_test_split

# Keep only rows that still have a review, then hold out 10% of the
# training file for validation. Note that X_test / y_test come from the
# training file, not from the separate test file.
X = reviewtraining[reviewtraining.notna()]
Y = ratingtraining[reviewtraining.notna()]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.1,
    random_state=8,
)

# Feature Extraction

### Feature Extraction with Bag of Words

In [52]:
# Bag-of-words features: learn the vocabulary on the training split only,
# then encode the held-out split with that same vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_training_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)
print("Number of features:  %d" % len(vectorizer.vocabulary_))

Number of features:  42692


### TF/IDF

In [53]:
# Feature extraction using TF-IDF: re-weight the bag-of-words counts.
# TfidfTransformer (which operates on an existing count matrix) is the class
# used below, so it must be imported here; previously only TfidfVectorizer
# was imported and this cell raised NameError on a fresh kernel.
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
tfidftransformer = TfidfTransformer()
X_train_tfidf = tfidftransformer.fit_transform(X_training_vector)


# Prediction Model

Prediction with Multinomial Naive Bayes and a linear SVM (SGDClassifier)

### SVM 

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Baseline: unigram counts -> TF-IDF -> linear SVM trained with SGD (hinge loss).
# `n_iter` has been removed from SGDClassifier; `max_iter=5` together with
# `tol=None` runs the same fixed five epochs without early stopping.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf-svm', SGDClassifier(loss='hinge', alpha=1e-3, max_iter=5,
                                                  tol=None, random_state=0))])

# Fit on the training split and score on the held-out split.
text_clf_svm = text_clf_svm.fit(X_train.astype(str), y_train)
predictionsvm = text_clf_svm.predict(X_test.astype(str))
accuracy = accuracy_score(y_test, predictionsvm)
print('Metode SVM')
print('Accuracy:', accuracy)


Metode SVM
Accuracy: 0.428270303782


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Variant: uni- to tri-gram counts -> un-normalized TF-IDF -> linear SVM (hinge loss).
# `n_iter` has been removed from SGDClassifier; `max_iter=5` together with
# `tol=None` runs the same fixed five epochs without early stopping.
text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                        ('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False,
                        use_idf=True)),
                        ('clf-svm', SGDClassifier(loss='hinge', alpha=1e-3, max_iter=5,
                                                  tol=None, random_state=0))])

# Fit on the training split and score on the held-out split.
text_clf_svm = text_clf_svm.fit(X_train.astype(str), y_train)
predictionsvm = text_clf_svm.predict(X_test.astype(str))
accuracy = accuracy_score(y_test, predictionsvm)
print('Metode SVM')
print('Accuracy:', accuracy)


Metode SVM
Accuracy: 0.716862988221


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Same n-gram SVM configuration as the previous cell (this cell has not been executed).
# `n_iter` has been removed from SGDClassifier; `max_iter=5` together with
# `tol=None` runs the same fixed five epochs without early stopping.
text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                        ('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False,
                        use_idf=True)),
                        ('clf-svm', SGDClassifier(loss='hinge', alpha=1e-3, max_iter=5,
                                                  tol=None, random_state=0))])

# Fit on the training split and score on the held-out split.
text_clf_svm = text_clf_svm.fit(X_train.astype(str), y_train)
predictionsvm = text_clf_svm.predict(X_test.astype(str))
accuracy = accuracy_score(y_test, predictionsvm)
print('Metode SVM')
print('Accuracy:', accuracy)


### Multinomial NB

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Uni- to tri-gram counts -> un-normalized TF-IDF -> Multinomial Naive Bayes.
# The classifier step is named 'clf-nb'; it used to carry the copy-pasted
# label 'clf-svm' even though it holds a Naive Bayes model.
text_clf_nb = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                        ('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False,
                        use_idf=True)),
                        ('clf-nb', MultinomialNB())])

# Fit on the training split and score on the held-out split.
text_clf_nb = text_clf_nb.fit(X_train.astype(str), y_train)
predictionnb = text_clf_nb.predict(X_test.astype(str))
accuracy = accuracy_score(y_test, predictionnb)
print('Metode Multinomial NB')
print('Accuracy:', accuracy)

Metode Multinomial NB
Accuracy: 0.704835709857


Metode Multinomial NB
Accuracy: 0.704835709857


In [5]:
from sklearn import metrics
# Per-rating precision, recall and F1 of the Naive Bayes predictions on the held-out split.
print(metrics.classification_report(y_test, predictionnb))

             precision    recall  f1-score   support

        1.0       0.73      0.79      0.76      2177
        2.0       0.75      0.57      0.65       660
        3.0       0.79      0.59      0.68       665
        4.0       0.86      0.58      0.70       506
        5.0       0.73      0.57      0.64       816
        6.0       0.82      0.57      0.67       657
        7.0       0.74      0.58      0.65       979
        8.0       0.62      0.63      0.62      1853
        9.0       0.61      0.65      0.63      2675
       10.0       0.73      0.83      0.78      5142

avg / total       0.71      0.70      0.70     16130



In [14]:
# Confusion matrix for the Naive Bayes predictions: each row is a true rating
# (row sums equal the per-class support in the report), each column a predicted rating.
metrics.confusion_matrix(y_test, predictionnb)

array([[1721,   39,   37,    7,   30,   14,   22,   69,   72,  166],
       [ 117,  379,   12,    4,   18,    7,   20,   25,   35,   43],
       [  77,   17,  393,    4,   19,   10,   10,   40,   40,   55],
       [  58,    8,    7,  295,   14,    2,   10,   34,   28,   50],
       [  82,   20,   13,    6,  465,    9,   13,   59,   67,   82],
       [  45,   10,   10,    6,   15,  374,   19,   63,   51,   64],
       [  44,    9,    5,    3,   19,    7,  570,   70,  120,  132],
       [  58,    8,    2,    6,   19,   11,   45, 1164,  225,  315],
       [  63,    8,    8,    4,   20,    6,   25,  162, 1746,  633],
       [  89,    8,   11,    7,   18,   15,   41,  203,  488, 4262]])